【发布时间】:2021-08-17 13:20:59
【问题描述】:
我想为目标变量连续的数据集设置修改后的 WOE 和 IV 摘要(受此网站启发:https://www.listendata.com/2019/08/WOE-IV-Continuous-Dependent.html#comment-form)。代码是:
class IV_Calc:
def __init__(self, df, feature, target):
self.feature = feature
self.target = target
self.data_head = df.head()
def count_values(self):
data = pd.DataFrame()
data['Count'] = df[self.feature].value_counts()
data['Sum Y'] = df.groupby([self.feature])[self.target].sum()
data = data.sort_values(by=["Count"], ascending=False)
return data
def distribution(self):
data = self.count_values()
data['% Observations'] = data['Count'] / data
data['% Y'] = data['Sum Y'] / data['Sum Y'].sum()
return data.iloc[:,-2:]
def woe(self):
data = self.distribution()
data['WOE'] = np.log(data['% Y'] / data['% Observations'])
data.replace({"WOE": {np.inf: 0, -np.inf: 0}}) # If no instances are bad, this will replace values of infinity with 0
data = data.sort_values(by=["WOE"], ascending=False)
return data.iloc[:,-1]
def IV_per_cat(self):
data = self.distribution()
data['WOE'] = self.woe()
data["IV"] = data["WOE"]*(data['% Y'] - data['% Observations'])
data = data.sort_values(by=["IV"], ascending=False)
return data.iloc[:,-1]
def full_summary(self):
data = self.count_values()
data['% Observations'] = data['Count'] / data
data['% Y'] = data['Sum Y'] / data['Sum Y'].sum()
data['WOE'] = self.woe()
data["IV"] = self.IV_per_cat()
data = data.sort_values(by=["Count"], ascending=False)
return data
def final_assessment(self):
data = self.full_summary()
iv = data["IV"].sum() # final IV value
if iv < .02:
print("The variable " + self.feature + " is not predictive with an IV of: {}".format(iv))
elif iv < .1:
print("The variable " + self.feature + " is weakly predictive with an IV of:{}".format(iv))
elif iv < .3:
print("The variable " + self.feature + " is moderately predictive with an IV of:{}".format(iv))
else :
print("The variable " + self.feature + " is highly predictive with an IV of: {}".format(iv))
return iv
跑步
woe_test = IV_Calc(df=train, feature="host_response_time", target="price")
woe_test.full_summary()
虽然我在__init__ 部分定义了df,但第一个函数出现错误“名称'df' 未定义”。我在监督什么?我确信这与我使用类有关,但我在面向对象编程方面经验不足,无法找出它是什么。
【问题讨论】: