作者:郭渊博 刘越 牛哲
# Keep the 300 most common entries of the frequency distribution; these
# words are the ones the document-frequency table is built for.
self.top_freq_word = self.freq_dist.most_common(300)
# Document frequency: for every kept word, count the training samples
# that contain it (x is presumably the word collection of one sample,
# y its label -- TODO confirm against the loader).
for word, freq in self.top_freq_word:
    for x, y in self.train_set:
        if word in x:
            if word in self.df:
                self.df[word] += 1
            else:
                # First sample found that contains this word.
                self.df[word] = 1
最后,对每个词计算 tf-idf 值,并将结果加到特征集中:
for word, freq in self.top_freq_word:
# Map each label index to the set of all words seen in samples of that label.
self.labeled_words = {}
for x, y in self.train_set:
    idx = self.label_set.index(y)
    if idx in self.labeled_words:
        self.labeled_words[idx] |= set(x)
    else:
        # First sample for this label: start its word set.
        self.labeled_words[idx] = set(x)
# Start from a copy of every label's word set ...
self.labeled_words_clean = {}
for k in self.labeled_words:
    self.labeled_words_clean[k] = set(self.labeled_words[k])
# ... then remove every word that also occurs under any other label, so
# each entry keeps only the words exclusive to its own label.
for k1 in self.labeled_words_clean:
    for k2 in self.labeled_words:
        if k1 != k2:
            self.labeled_words_clean[k1] -= self.labeled_words[k2]
# For every label, count how many words of the current document belong to
# that label's exclusive word set (repeated words are counted each time).
for k in self.labeled_words_clean:
    cnt = sum(1 for word in doc if word in self.labeled_words_clean[k])
    # NOTE(review): the snippet ends here; cnt is presumably appended to the
    # document's feature vector inside this loop -- confirm against main.py.
我们通过Back Propagation的方法,将这个差值的影响从输出层一层一层地向后传,从而将神经网络的所有层进行调参。
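由于沿着负梯度方向函数下降最快,所以每次训练后经过Back Propagation得到的 $\frac{\partial C}{\partial w}$ 与 $\frac{\partial C}{\partial b}$ 正是相应变量需要改变的值,根据以下方式对变量进行更新:
$$w \leftarrow w - \frac{\eta}{m}\sum_{x}\frac{\partial C_x}{\partial w}, \qquad b \leftarrow b - \frac{\eta}{m}\sum_{x}\frac{\partial C_x}{\partial b}$$
其中 $\eta$ 为学习率,$m$ 为一个batch中训练样本的个数。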
import numpy as np
import random
def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)).

    Evaluated with NumPy, so z may be a scalar or an array; arrays are
    processed element-wise.
    """
    return 1.0 / (1.0 + np.exp(-z))
def sigmoid_prime(z):
    """Return the derivative of the sigmoid function at z.

    Uses the identity sigmoid'(z) = sigmoid(z) * (1 - sigmoid(z)).
    """
    # Evaluate the sigmoid once and reuse it for both factors.
    s = sigmoid(z)
    return s * (1 - s)
class Network(object):
    """Fully connected feed-forward network with sigmoid units.

    Activations are handled as row vectors of shape (1, n): a layer is
    applied as ``sigmoid(np.dot(activation, weight) + bias)``. Training is
    done by gradient descent over mini-batches, with the gradients obtained
    through back propagation.
    """

    def __init__(self, layers):
        """Create a network with randomly initialised parameters.

        layers -- sequence of layer sizes, input layer first and output
                  layer last, e.g. [300, 50, 8].
        """
        self.layers = layers
        # weights[i] has shape (layers[i], layers[i+1]) and biases[i] has
        # shape (1, layers[i+1]); both are drawn from a standard normal.
        self.weights = [np.random.randn(x, y)
                        for x, y in zip(layers[:-1], layers[1:])]
        self.biases = [np.random.randn(1, y) for y in layers[1:]]

    def feedforward(self, activation):
        """Return the output-layer activation for the input `activation`."""
        for weight, bias in zip(self.weights, self.biases):
            activation = sigmoid(np.dot(activation, weight) + bias)
        return activation

    def vectorized_result(self, y):
        """Return a (1, layers[-1]) array that is 1.0 at index y, 0.0 elsewhere."""
        e = np.zeros((1, self.layers[-1]))
        e[0][y] = 1.0
        return e

    def train(self, epochs, eta, batch_size, training_data, test_data=None):
        """Train the network and print one progress line per epoch.

        epochs        -- number of passes over the training data.
        eta           -- learning rate.
        batch_size    -- number of samples per mini-batch.
        training_data -- iterable of (x, y) pairs; x is reshaped to a
                         (1, len(x)) row vector and y is used as the index
                         of the target output unit.
        test_data     -- optional iterable of (x, y) pairs. When given (and
                         non-empty) the number of correctly classified test
                         samples is printed after every epoch; otherwise
                         only a completion message is printed.
        """
        training_data = [(np.reshape(x, (1, len(x))), self.vectorized_result(y))
                         for x, y in training_data]
        training_data_size = len(training_data)
        if test_data:
            # Only touch test_data when it was supplied: the default is None.
            test_data = [(np.reshape(x, (1, len(x))), y) for x, y in test_data]
            test_data_size = len(test_data)
        # NOTE(review): the samples are visited in the same order in every
        # epoch; `random` is imported by the file but not used here --
        # confirm whether a per-epoch shuffle was intended.
        for i in range(0, epochs):
            batches = [training_data[k:k + batch_size]
                       for k in range(0, training_data_size, batch_size)]
            for batch in batches:
                self.update(batch, eta)
            if test_data:
                p1 = self.evaluate(test_data)
                p2 = test_data_size
                print("Epoch {0}: {1} / {2} = {3}".format(i, p1, p2, p1*1.0/p2))
            else:
                print("Epoch {0} complete".format(i))

    def update(self, batch, eta):
        """Apply one gradient-descent step computed from a mini-batch.

        batch -- list of (x, y) pairs as prepared by `train`.
        eta   -- learning rate.
        """
        batch_size = len(batch)
        nabla_weights = [np.zeros(w.shape) for w in self.weights]
        nabla_biases = [np.zeros(b.shape) for b in self.biases]
        # Accumulate the partial derivatives of the cost with respect to
        # w and b over all samples of the batch. With the batch size of 1
        # used in our runs, nw (nb) is the same as dnw (dnb).
        for x, y in batch:
            delta_nabla_weights, delta_nabla_biases = self.backprop(x, y)
            nabla_weights = [nw + dnw for nw, dnw
                             in zip(nabla_weights, delta_nabla_weights)]
            nabla_biases = [nb + dnb for nb, dnb
                            in zip(nabla_biases, delta_nabla_biases)]
        # Update w and b. float() keeps the step size from being truncated
        # by integer division when eta is passed as an int on Python 2.
        step = float(eta) / batch_size
        self.weights = [w - step * nw
                        for w, nw in zip(self.weights, nabla_weights)]
        self.biases = [b - step * nb
                       for b, nb in zip(self.biases, nabla_biases)]

    def backprop(self, x, y):
        """Run back propagation for a single sample.

        x -- input row vector.
        y -- target output row vector.

        Returns (nabla_weights, nabla_biases): two lists of arrays shaped
        like self.weights and self.biases, holding the partial derivatives
        of the cost for this sample.
        """
        # Forward pass: record every layer's weighted input z and its
        # activation, because the backward pass needs both.
        activation = x
        activations = [x]
        zs = []
        for weight, bias in zip(self.weights, self.biases):
            z = np.dot(activation, weight) + bias
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)
        # Error of the output layer.
        delta = (activation - y) * sigmoid_prime(zs[-1])
        # Initialise the partial derivatives of the cost w.r.t. w and b.
        nabla_weights = [np.zeros(w.shape) for w in self.weights]
        nabla_biases = [np.zeros(b.shape) for b in self.biases]
        nabla_weights[-1] = np.dot(activations[-2].transpose(), delta)
        nabla_biases[-1] = delta
        # Walk backwards through the layers: compute each layer's delta and
        # from it the partial derivatives of the cost w.r.t. w and b.
        # l counts layers from the end (l = 2 is the last hidden layer).
        for l in range(2, len(self.layers)):
            delta = np.dot(delta, self.weights[-l + 1].transpose()) * sigmoid_prime(zs[-l])
            nabla_weights[-l] = np.dot(activations[-l - 1].transpose(), delta)
            nabla_biases[-l] = delta
        return (nabla_weights, nabla_biases)

    def evaluate(self, test_data):
        """Return how many (x, y) pairs in test_data are classified correctly.

        The predicted class is the index of the largest output activation.
        """
        result = [(np.argmax(self.feedforward(x)), y) for (x, y) in test_data]
        return sum(int(x == y) for x, y in result)
this is the output of main.py processing data set A
features: only tf-idf
features dimension: 300
network parameter: epoch: 20, learning rate: 1.0, layers: [300, 50, 8]
Epoch 0: 1777 / 2009 = 0.884519661523
Epoch 1: 1842 / 2009 = 0.9168740667
Epoch 2: 1870 / 2009 = 0.93081134893
Epoch 3: 1879 / 2009 = 0.935291189647
Epoch 4: 1902 / 2009 = 0.946739671478
Epoch 5: 1894 / 2009 = 0.942757590841
Epoch 6: 1870 / 2009 = 0.93081134893
Epoch 7: 1898 / 2009 = 0.94474863116
Epoch 8: 1892 / 2009 = 0.941762070682
Epoch 9: 1917 / 2009 = 0.954206072673
Epoch 10: 1915 / 2009 = 0.953210552514
Epoch 11: 1922 / 2009 = 0.956694873071
Epoch 12: 1915 / 2009 = 0.953210552514
Epoch 13: 1915 / 2009 = 0.953210552514
Epoch 14: 1901 / 2009 = 0.946241911399
Epoch 15: 1925 / 2009 = 0.95818815331
Epoch 16: 1909 / 2009 = 0.950223992036
Epoch 17: 1913 / 2009 = 0.952215032354
Epoch 18: 1919 / 2009 = 0.955201592832
Epoch 19: 1922 / 2009 = 0.956694873071
cost time(exclude preprocess time): 19.9823410511 seconds
this is the output of main.py processing data set B
features: tf-idf with rare word count
features dimension: 320
network parameter: epoch: 20, learning rate: 1.0, layers: [320, 50, 20]
Epoch 0: 2084 / 5645 = 0.369176262179
Epoch 1: 3164 / 5645 = 0.560496014172
Epoch 2: 3324 / 5645 = 0.588839681134
Epoch 3: 3378 / 5645 = 0.598405668733
Epoch 4: 3575 / 5645 = 0.63330380868
Epoch 5: 3572 / 5645 = 0.632772364925
Epoch 6: 3634 / 5645 = 0.643755535872
Epoch 7: 3629 / 5645 = 0.64286979628
Epoch 8: 3599 / 5645 = 0.637555358725
Epoch 9: 3659 / 5645 = 0.648184233835
Epoch 10: 3675 / 5645 = 0.651018600531
Epoch 11: 3686 / 5645 = 0.652967227635
Epoch 12: 3689 / 5645 = 0.653498671391
Epoch 13: 3685 / 5645 = 0.652790079717
Epoch 14: 3692 / 5645 = 0.654030115146
Epoch 15: 3693 / 5645 = 0.654207263065
Epoch 16: 3729 / 5645 = 0.660584588131
Epoch 17: 3688 / 5645 = 0.653321523472
Epoch 18: 3681 / 5645 = 0.652081488043
Epoch 19: 3698 / 5645 = 0.655093002657
cost time(exclude preprocess time): 26.7541799545 seconds
this is the output of mainzh.py processing data set C
features: tf-idf with rare word count
features dimension: 308
network parameter: epoch: 20, learning rate: 1.0, layers: [308, 50, 8]
Epoch 0: 58 / 80 = 0.725
Epoch 1: 66 / 80 = 0.825
Epoch 2: 70 / 80 = 0.875
Epoch 3: 71 / 80 = 0.8875
Epoch 4: 71 / 80 = 0.8875
Epoch 5: 71 / 80 = 0.8875
Epoch 6: 70 / 80 = 0.875
Epoch 7: 71 / 80 = 0.8875
Epoch 8: 71 / 80 = 0.8875
Epoch 9: 71 / 80 = 0.8875
Epoch 10: 70 / 80 = 0.875
Epoch 11: 70 / 80 = 0.875
Epoch 12: 72 / 80 = 0.9
Epoch 13: 72 / 80 = 0.9
Epoch 14: 72 / 80 = 0.9
Epoch 15: 72 / 80 = 0.9
Epoch 16: 72 / 80 = 0.9
Epoch 17: 72 / 80 = 0.9
Epoch 18: 72 / 80 = 0.9
Epoch 19: 72 / 80 = 0.9
cost time(exclude preprocess time): 9.03512907028 seconds
数据集 | 准确率 | 用时/秒 |
附录A | 95.67% | 19.98 |
附录B | 65.51% | 26.75 |
附录C | 90.00% | 9.04 |
[1] Michael A. Nielsen. "Neural Networks and Deep Learning", Determination Press, 2015.
[2] Steven Bird, Ewan Klein, Edward Loper. "Natural Language Processing with Python", O'Reilly Media, 2009.