@zsh-o
2018-07-09T08:51:38.000000Z
字数 6670
阅读 1152
《统计学习方法》
%matplotlib inlineimport numpy as npfrom matplotlib import pyplot as pltepsilon = 1e-5np.seterr(divide='ignore',invalid='ignore')
{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}
import pydotfrom IPython.display import Image, displaydef viewPydot(pdot):plt = Image(pdot.create_png())display(plt)
## 给定离散的概率,求熵,e为底def H(P):P = P + epsilonreturn -(np.sum(P * np.log2(P)))
def information_gain(X, Y):N, M = X.shapeK = np.max(Y) + 1SX = np.max(X, axis=0) + 1NY = np.zeros(K)for i in range(K):NY[i] = np.count_nonzero(Y==i)NX = []for m in range(M):NX_i = np.zeros((SX[m],1))for i in range(SX[m]):NX_i[i,0] = np.count_nonzero(X[:, m]==i)NX.append(NX_i)HX = H(NY / N)NY_A = []for i in range(M):NY_Ai = np.zeros((SX[i], K))for j in range(SX[i]):t = Y[X[:, i]==j]for k in range(K):NY_Ai[j, k] = np.count_nonzero(t==k)NY_A.append(NY_Ai)PY_A = []PX = []for m in range(M):PY_A.append(NY_A[m] / NX[m]) ## NY_A[m][i, k] / NX[m][i] ## p(Y = y_k | X^m = i)PX.append(NX[m] / N)HY_X = []for m in range(M):HY_Xi = np.zeros(SX[m])for i in range(SX[m]):HY_Xi[i] = H(PY_A[m][i, :])HY_X.append(HY_Xi)HY_A = np.zeros(M)for m in range(M):HY_A[m] = np.sum(HY_X[m] * PX[m].reshape(-1)) ## numpy由于有Broadcasting存在,所以写程序的时候一定要注意维度要匹配 https://docs.scipy.org/doc/numpy-1.13.0/user/basics.broadcasting.htmlHA_X = np.zeros(M)for m in range(M):HA_X[m] = H(NX[m] / N)gain = HX - HY_Areturn gain, gain / HA_X
X = np.array([[0, 0, 0, 0, 0],[0, 0, 0, 1, 0],[0, 1, 0, 1, 1],[0, 1, 1, 0, 1],[0, 0, 0, 0, 0],[1, 0, 0, 0, 0],[1, 0, 0, 1, 0],[1, 1, 1, 1, 1],[1, 0, 1, 2, 1],[1, 0, 1, 2, 1],[2, 0, 1, 2, 1],[2, 0, 1, 1, 1],[2, 1, 0, 1, 1],[2, 1, 0, 2, 1],[2, 0, 0, 1, 0],])Y = X[:, -1]Y = Y.reshape((-1,1))X = X[:, :-1]
information_gain(X, Y)
(array([0.08300555, 0.32359689, 0.41990845, 0.29479317]),
array([0.05237053, 0.35239124, 0.43247518, 0.1926588 ]))
class TreeNode(object):def __init__(self, prop=None, Nprop=None, label=None):self.children = dict()self.prop = propself.Nprop = Npropself.label = labelself.leaf = False
def ID3(X, Y, threshold):N, M = X.shapeK = np.max(Y) + 1SX = np.max(X, axis=0) + 1def build_tree(cX, cY):NcY = np.zeros(K)unique_Y = np.unique(cY)if len(unique_Y)==1: ## 属于同一类t = TreeNode()t.label = unique_Y[0]t.leaf = Truereturn tfor i in range(K):NcY[i] = np.count_nonzero(cY==i)gain, ratio = information_gain(cX, cY)iP = np.argmax(gain)max_gain = gain[iP]t = TreeNode(prop=iP, Nprop=SX[iP], label = np.argmax(NcY))if max_gain < threshold:t.leaf = Truereturn tfor i in range(SX[iP]):cindex = (cX[:,iP] == i)t.children[i] = build_tree(cX[cindex], cY[cindex])return treturn build_tree(X, Y)
root = ID3(X, Y, 0.)
dot = pydot.Dot()global levellevel = 1def create_dot(p):global levelp_name = "%d # %s, %s, %s" % (level, str(p.prop), str(p.Nprop), str(p.label))dot.add_node(pydot.Node(name=p_name))if p.prop == None:returnfor i in range(p.Nprop):level = level + 1c = p.children[i]c_name = "%d # %s, %s, %s" % (level, str(c.prop), str(c.Nprop), str(c.label))dot.add_edge(pydot.Edge(dst=c_name, src=p_name))create_dot(c)level = level -1
create_dot(root)viewPydot(dot)

前序遍历计算每个节点的H,后续遍历对有叶节点进行剪枝
class TreeNode(object):def __init__(self, prop=None, Nprop=None, label=None):self.children = dict()self.prop = propself.Nprop = Npropself.label = labelself.leaf = Falseself.H = Nonedef ID3(X, Y, threshold):N, M = X.shapeK = np.max(Y) + 1SX = np.max(X, axis=0) + 1global visitedvisited = np.zeros(M, dtype=np.bool)def build_tree(cX, cY):global visitedNcY = np.zeros(K)unique_Y = np.unique(cY)if len(unique_Y)==1: ## 属于同一类t = TreeNode()t.label = unique_Y[0]t.leaf = Truet.H = 0.return tfor i in range(K):NcY[i] = np.count_nonzero(cY==i)gain, ratio = information_gain(cX, cY)gain[visited] = - np.infiP = np.argmax(gain)if len(np.unique(cX[:,iP]))==1:t1 = TreeNode(prop=None, Nprop=None, label=np.argmax(NcY))t1.H = H(NcY / len(cY))t1.leaf = Truereturn t1max_gain = gain[iP]t = TreeNode(prop=iP, Nprop=SX[iP], label = np.argmax(NcY))t.H = H(NcY / len(cY))if max_gain < threshold:t.leaf = Truereturn tvisited[iP] = Truefor i in range(SX[iP]):cindex = (cX[:,iP] == i)t.children[i] = build_tree(cX[cindex], cY[cindex])visited[iP] = Falsereturn treturn build_tree(X, Y)
root = ID3(X, Y, 0.)
dot = pydot.Dot()global levellevel = 1def create_dot(p):global levelp_name = "%d # %s, %s, %s, %.3f" % (level, str(p.prop), str(p.Nprop), str(p.label), p.H)dot.add_node(pydot.Node(name=p_name))if p.leaf is True:returnfor i in range(p.Nprop):level = level + 1c = p.children[i]c_name = "%d # %s, %s, %s, %.3f" % (level, str(c.prop), str(c.Nprop), str(c.label), c.H)dot.add_edge(pydot.Edge(dst=c_name, src=p_name))create_dot(c)level = level -1create_dot(root)viewPydot(dot)

Watermelon = np.array([[0, 0, 0, 0, 0, 0, 0],[1, 0, 1, 0, 0, 0, 0],[1, 0, 0, 0, 0, 0, 0],[0, 0, 1, 0, 0, 0, 0],[2, 0, 0, 0, 0, 0, 0],[0, 1, 0, 0, 1, 1, 0],[1, 1, 0, 1, 1, 1, 0],[1, 1, 0, 0, 1, 0, 0],[1, 1, 1, 1, 1, 0, 1],[0, 2, 2, 0, 2, 1, 1],[2, 2, 2, 2, 2, 0, 1],[2, 0, 0, 2, 2, 1, 1],[0, 1, 0, 1, 0, 0, 1],[2, 1, 1, 1, 0, 0, 1],[1, 1, 0, 0, 1, 1, 1],[2, 0, 0, 2, 2, 0, 1],[0, 0, 1, 1, 1, 0, 1],])Y = Watermelon[:, -1]Y = Y.reshape((-1,1))X = Watermelon[:, :-1]
root = ID3(X, Y, 0.)
dot = pydot.Dot()create_dot(root)viewPydot(dot)

不是leaf并且子节点的leaf==true
def pruning(root, threshold):if root.leaf is True:return;print(root.H, root.leaf)for p in root.children.values():pruning(p, threshold)if root.children[0].leaf is True:t = 0.l = len(root.children)for p in root.children.values():t = t + p.Hif t - root.H < (l-1) * threshold:for i in range(l):root.children[i] = Noneroot.leaf = True
pruning(root, .0)
0.9974937421855691 False
0.764200977141265 False
0.7219256790976065 False
dot = pydot.Dot()create_dot(root)viewPydot(dot)

需要在当前节点对应的数据集下计算所有属性和属性值对应的Gini指数,对最大Gini指数的进行划分,小于阈值跳出
class TreeNode(object):def __init__(self, prop, value, label):self.prop = propself.value = valueself.label = labelself.lc = Noneself.rc = Noneself.leaf = Falseself.N_leaf = 0self.H = 0.self.g = 0.
def CART_C(X, Y, threshold, purning_thre):N, M = X.shapeSX = np.max(X, axis=0) + 1K = np.max(Y) + 1global visitedvisited = []for m in range(M):visited.append(np.zeros(SX[m], dtype=np.bool))def build_tree(cX, cY):global visitedPX_Y = []for m in range(M):PXi = np.zeros((SX[m], K))for i in range(SX[m]):index = (cX[:, m] == i)iY = cY[index]l = len(iY)for k in range(K):PXi[i, k] = len(np.where(iY==k)[0]) / (l+epsilon)if l == 0:PXi[i, k] = 1PX_Y.append(PXi)ginis = []mginis = np.zeros(M)mindex = np.zeros(M, dtype=np.int)for m in range(M):PXi = PX_Y[m]gs = 1 - np.sum(np.power(PXi, 2), axis=1)gs[visited[m]] = -np.infmm = np.argmax(gs)mindex[m] = mmmginis[m] = gs[mm]mpi = np.argmax(mginis)mpii = mindex[mpi]mmg = mginis[mpi]t = TreeNode(prop=mpi, value=mpii, label=np.argmax(PX_Y[mpi][mpii,:]))t.H = H(PX_Y[mpi][mpii,:])if mmg <= (threshold+epsilon):t.leaf = Truet.N_leaf = 1t.g = 0.return tt.leaf = Falsevisited[mpi][mpii] = Trueindex = (cX[:, mpi] == mpii)t.lc = build_tree(cX[index], cY[index])t.rc = build_tree(cX[np.logical_not(index)], cY[np.logical_not(index)])t.N_leaf = t.lc.N_leaf + t.rc.N_leafvisited[mpi][mpii] = Falsereturn troot = build_tree(X, Y)return root
root = CART_C(X, Y, 0.45, 0.1)
dot = pydot.Dot()global levellevel = 1def create_dot(p):global levelp_name = "%s # %s, %s, %s, %.3f" % (p.__str__(), str(p.prop), str(p.value), str(p.label), p.H)dot.add_node(pydot.Node(name=p_name))if p.leaf is True:returnfor c in [p.lc, p.rc]:level = level + 1c_name = "%s # %s, %s, %s, %.3f" % (c.__str__(), str(c.prop), str(c.value), str(c.label), c.H)dot.add_edge(pydot.Edge(dst=c_name, src=p_name))create_dot(c)level = level -1create_dot(root)viewPydot(dot)
