@knight
2016-06-03T09:34:14.000000Z
字数 1823
阅读 2918
NLP
古德-图灵的基本思路是:对于任何一个出现了 $r$ 次的n元语法,都假设它出现了 $r^*$ 次,这里有:

$$r^* = (r+1)\frac{n_{r+1}}{n_r}$$
其中,$n_r$ 是训练语料中恰好出现 $r$ 次的n元语法的数目。把这个统计计数转化为概率,需要归一化处理,对于统计计数为 $r$ 的n元语法,其概率为:

$$p_r = \frac{r^*}{N}$$
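其中:

$$N = \sum_{r=0}^{\infty} n_r r^* = \sum_{r=0}^{\infty} (r+1)\,n_{r+1} = \sum_{r=1}^{\infty} n_r r$$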
也就是说,这个 $N$ 等于这个分布中最初的计数。这样,样本中所有事件的概率之和为:

$$\sum_{r>0} n_r p_r = 1 - \frac{n_1}{N} < 1$$
因此,有 $n_1/N$ 的概率剩余量可以分配给所有未出现事件($r=0$)。
# Good-Turing implementation from snownlp
# -*- coding: utf-8 -*-
from __future__ import print_function
from __future__ import division

from math import log, exp


def getz(r, nr):
    """Average each frequency-of-frequency over the gap to its neighbours.

    Args:
        r: distinct observed counts in ascending order (at least two).
        nr: nr[i] is how many items were observed exactly r[i] times.

    Returns:
        A list ``z`` with one value per entry of ``nr``:
        the first is ``2*nr[0]/r[1]`` (the lower neighbour is taken as 0),
        interior ones are ``2*nr[i]/(r[i+1]-r[i-1])``, and the last is
        ``nr[-1]/(r[-1]-r[-2])``.
    """
    z = [2 * nr[0] / r[1]]
    for i in range(len(nr) - 2):
        z.append(2 * nr[i + 1] / (r[i + 2] - r[i]))
    z.append(nr[-1] / (r[-1] - r[-2]))
    return z


def least_square(x, y):  # y = a + b*x
    """Fit ``y = a + b*x`` by ordinary least squares.

    Args:
        x: iterable of x coordinates.
        y: iterable of y coordinates, paired with ``x`` by position.

    Returns:
        The tuple ``(a, b)`` of intercept and slope.
    """
    # Materialise the inputs so iterators (e.g. Python 3 ``map``) are accepted.
    x = list(x)
    y = list(y)
    meanx = sum(x) / len(x)
    meany = sum(y) / len(y)
    xy = sum((xi - meanx) * (yi - meany) for xi, yi in zip(x, y))
    square = sum((xi - meanx) ** 2 for xi in x)
    b = xy / square
    return (meany - b * meanx, b)


def main(dic):
    """Estimate smoothed probabilities from a ``{key: count}`` mapping.

    The distinct counts are fitted with a log-log linear regression and each
    count is replaced by either the raw Turing estimate or the regression
    based estimate; the results are then renormalised.

    Args:
        dic: mapping of item -> positive observed count.  It must contain at
            least two distinct count values, because ``getz`` indexes
            ``r[1]`` and the regression needs more than one point.

    Returns:
        A tuple ``(p0, probs)`` where ``p0`` is ``nr[0]/total/total`` (with
        ``nr[0]`` the number of keys sharing the smallest count and ``total``
        the sum of all counts) and ``probs`` maps every key of ``dic`` to its
        normalised probability.  The values of ``probs`` sum to
        ``1 - nr[0]/total``.
    """
    # r: distinct counts (ascending); nr[i]: number of keys whose count is r[i].
    r, nr = [], []
    for v in sorted(dic.values()):
        if r and r[-1] == v:
            nr[-1] += 1
        else:
            r.append(v)
            nr.append(1)

    # Position of each distinct count inside r / prob.
    rr = {count: idx for idx, count in enumerate(r)}
    total = sum(n * count for n, count in zip(nr, r))

    z = getz(r, nr)
    a, b = least_square([log(count) for count in r], [log(v) for v in z])

    # Extrapolated frequency for the count just above the largest one, so
    # that nr[i + 1] is defined for the last index in the loop below.
    nr.append(exp(a + b * log(r[-1] + 1)))

    # NOTE(review): nr[i + 1] is the frequency of the *next distinct* count;
    # it equals N_{r+1} only when the counts are consecutive -- confirm that
    # this is acceptable for gapped counts.
    prob = []
    use_good_turing = False
    for i, count in enumerate(r):
        good_turing = (count + 1) * exp(b * (log(count + 1) - log(count)))
        if i + 1 < len(r):
            turing = (count + 1) * nr[i + 1] / nr[i]
        else:
            turing = good_turing
        diff = (((count + 1) ** 2) / nr[i] * nr[i + 1] / nr[i]
                * (1 + nr[i + 1] / nr[i])) ** 0.5 * 1.65
        # Keep the raw Turing estimate only while it differs from the
        # regression estimate by more than `diff`; after the first time it
        # does not, the regression estimate is used for all remaining counts.
        if not use_good_turing and abs(good_turing - turing) > diff:
            prob.append(turing)
        else:
            use_good_turing = True
            prob.append(good_turing)

    # zip() stops at len(prob), so the extrapolated nr entry is ignored here.
    sump = sum(n * p for n, p in zip(nr, prob))
    prob = [(1 - nr[0] / total) * p / sump for p in prob]
    return nr[0] / total / total, {key: prob[rr[v]] for key, v in dic.items()}


if __name__ == '__main__':
    print(main({1: 1, 2: 1, 3: 1, 4: 2, 5: 2, 6: 3, 7: 1, 8: 2, 9: 3}))