[关闭]
@zsh-o 2018-08-06T22:32:56.000000Z 字数 4094 阅读 1237

9 - EM - GMM - 代码

《统计学习方法》


该部分以代码为主,EM以前有写过,原理部分请参照Expectation Maximisation (EM)

  1. %matplotlib inline
  2. import numpy as np
  3. from matplotlib import pyplot as plt
  4. epsilon = 1e-8

EM推导完之后的形式还是比较简单的,现以GMM为例

E-step求出现有观测数据下每个观测数据隐变量的最优的分布函数

M-step在上面求得每一个观测数据的的情况下,求当前最优的参数

最后不断迭代上述过程即可

  1. from sklearn.datasets import make_blobs
  1. ## 生成数据
  2. X, Y = make_blobs(n_samples=1000,n_features=2, centers=3)
  3. plt.scatter(X[:, 0], X[:, 1], marker='.', c=Y, s=25, cmap='Dark2')
<matplotlib.collections.PathCollection at 0x1d1a6eda630>

output_3_1.png-33.1kB

  1. class GMM(object):
  2. def __init__(self, K, max_iter=500, eps=1e-5):
  3. self.K = K
  4. self.max_iter = max_iter
  5. self.eps = eps
  6. def gaussian(self, X, mu, sigma):
  7. return 1. / (np.power(2*np.pi, self.D/2.)
  8. * np.sqrt(np.linalg.det(sigma))) * np.exp(-0.5 * np.sum(np.dot(X - mu, np.linalg.pinv(sigma))
  9. * (X - mu), axis=1))
  10. def E_step(self):
  11. self.Q = self.pdf * self.Pi
  12. self.Q = self.Q / np.sum(self.Q, axis=1, keepdims=True)
  13. def M_step(self):
  14. for k in range(self.K):
  15. t = np.sum(self.Q[:, k])
  16. self.Pi[k] = t / self.N
  17. self.Mu[k] = np.sum(self.X * self.Q[:, k][:, np.newaxis], axis=0) / t
  18. self.Sigma[k] = np.dot((self.Q[:, k][:, np.newaxis] * (self.X - self.Mu[k])).T, self.X - self.Mu[k]) / t
  19. def log_likehood(self):
  20. return np.sum(np.log(np.sum(self.pdf * self.Pi, axis=1)))
  21. def fit(self, X):
  22. self.X = X
  23. self.N, self.D = X.shape
  24. self.Q = np.zeros((self.N, self.K))
  25. self.Pi = np.zeros(self.K) + 1./self.K
  26. self.Mu = np.random.permutation(self.X)[:self.K] ## 随机选择K个当作初始均值
  27. self.Sigma = np.concatenate([np.cov(self.X.T)[np.newaxis,:] for _ in range(self.K)])
  28. self.pdf = np.zeros((self.N, self.K))
  29. old_loglikehood = np.inf
  30. for _iter in range(self.max_iter):
  31. for k in range(self.K):
  32. self.pdf[:, k] = self.gaussian(self.X, self.Mu[k], self.Sigma[k])
  33. self.E_step()
  34. self.M_step()
  35. new_llh = self.log_likehood()
  36. if np.abs(old_loglikehood - new_llh) < self.eps:
  37. print('End with Eps at iter %d' % _iter)
  38. return
  39. old_loglikehood = new_llh
  40. print('End with max iter %d' % self.max_iter)
  41. def p(self, x):
  42. if x.ndim < 2:
  43. x = x[np.newaxis, :]
  44. N = x.shape[0]
  45. t = np.zeros((N, self.K))
  46. for k in range(self.K):
  47. t[:, k] = self.Pi[k] * self.gaussian(x, self.Mu[k], self.Sigma[k])
  48. return np.sum(t, axis=1)
  49. def predict(self, x):
  50. if x.ndim < 2:
  51. x = x[np.newaxis, :]
  52. N = x.shape[0]
  53. t = np.zeros((N, self.K))
  54. for k in range(self.K):
  55. t[:, k] = self.Pi[k] * self.gaussian(x, self.Mu[k], self.Sigma[k])
  56. return np.argmax(t, axis=1), t / np.sum(t, axis=1)
  1. gmm = GMM(K=3)
  2. gmm.fit(X)
End with Eps at iter 75
  1. def plot_decision_boundary(model, resolution=100, colors=('b', 'k', 'r'), figsize=(14,6)):
  2. plt.figure(figsize=figsize)
  3. xrange = np.linspace(model.X[:,0].min(), model.X[:,0].max(), resolution)
  4. yrange = np.linspace(model.X[:,1].min(), model.X[:,1].max(), resolution)
  5. grid = [[model.p(np.array([xr, yr])) for yr in yrange] for xr in xrange]
  6. grid = np.array(grid).reshape(len(xrange), len(yrange))
  7. plt.scatter(model.X[:, 0], model.X[:, 1], marker='.', c=Y-1, s=25, cmap='Dark2')
  8. plt.contour(xrange, yrange, grid.T, linewidths=(1,),
  9. linestyles=('-',), colors=colors[1])
  1. gmm.Y = Y
  1. plot_decision_boundary(gmm)

output_8_0.png-83.2kB

  1. from sklearn.datasets import make_moons
  2. from sklearn.datasets import make_circles
  3. from sklearn.preprocessing import StandardScaler
  1. X, Y = make_moons(n_samples=500, shuffle=True, noise=0.1)
  1. gmm = GMM(K=2)
  2. gmm.fit(X)
End with Eps at iter 116
  1. gmm.Y = Y
  2. plot_decision_boundary(gmm)

output_12_0.png-90.7kB


最后说一点,GMM还是有很多地方可以向下做的,例如不用指定聚类数量的无参模型(狄利克雷过程高斯混合模型),另一个是,现有高斯混合模型只能聚类类高斯分布的数据,而其他形状的数据,例如前面SVM节里面的环形和半月型数据就不再适用,但有模型叫结构化的变分自编码,其用多层NN把数据映射到隐状态空间(变分自编码),然后在隐状态空间用高斯混合模型进行聚类,用以发现隐藏在数据中的本质属性,因此可以适用于任意形状的数据

Reference

添加新批注
在作者公开此批注前,只有你和作者可见。
回复批注