@rianusr
2019-08-13T10:07:11.000000Z
字数 1161
阅读 3346
06-机器学习
import numpy as np
import pandas as pd
def information_value(target, feature):
    """Compute the information value (IV) of a discrete feature.

    Rows are grouped by the distinct values of ``feature``. Within each group
    the positives (``target == 1``, called "good" here) and negatives
    (``target == 0``, "bad") are counted and expressed as a share of all
    positives / all negatives. Then, per group::

        woe = ln(good_percent / bad_percent)
        iv  = (good_percent - bad_percent) * woe

    The feature's information value is the sum of ``iv`` over all groups.

    No smoothing is applied: a group that contains no positives or no
    negatives produces a non-finite ``woe``/``iv`` (and therefore a
    non-finite total).

    :param target: array-like of true labels, 1 = positive, 0 = negative
    :param feature: array-like holding the discrete variable, aligned with
        ``target``
    :return: tuple ``(iv_value, iv_table)`` where ``iv_value`` is the summed
        information value and ``iv_table`` is a DataFrame indexed by feature
        value with the columns ``bad_count``, ``bad_percent``,
        ``good_count``, ``good_percent``, ``woe`` and ``iv``
    """
    samples = pd.DataFrame({"feature": feature, "y": target})
    tot_good = np.sum(target)
    tot_bad = len(target) - tot_good

    # Per-group counts. The nested-dict renaming form of ``agg`` used to do
    # this in one call, but it was removed in pandas 1.0, so the two counts
    # are derived from plain ``sum``/``size`` reductions instead.
    labels_by_value = samples.groupby("feature")["y"]
    good_count = labels_by_value.sum()
    bad_count = labels_by_value.size() - good_count
    iv_table = pd.DataFrame({"bad_count": bad_count, "good_count": good_count})

    iv_table["bad_percent"] = iv_table["bad_count"] / tot_bad
    iv_table["good_percent"] = iv_table["good_count"] / tot_good
    iv_table["woe"] = np.log(iv_table["good_percent"] / iv_table["bad_percent"])
    iv_table["iv"] = (
        iv_table["good_percent"] - iv_table["bad_percent"]
    ) * iv_table["woe"]
    iv_value = np.sum(iv_table["iv"])

    columns = ["bad_count", "bad_percent", "good_count", "good_percent", "woe", "iv"]
    return iv_value, iv_table[columns]
# Read the data set.
# NOTE(review): `path` is not defined in the visible code; it must be bound
# to the CSV location before this line runs.
titanic = pd.read_csv(path)
titanic.head()

# The `Pclass` column is used as the discrete feature and `Survived` as the
# target passed to information_value.
feature = titanic["Pclass"]
target = titanic["Survived"]

iv_value, iv_table = information_value(target, feature)
print(iv_table)
print("information_value:", iv_value)