@Wayne-Z 2017-11-19T02:18:09.000000Z 字数 8505 阅读 3081


NLP word2vec




英文wiki语料链接 https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
中文wiki语料链接 https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2
text8链接 http://mattmahoney.net/dc/text8.zip



  1. __author__ = 'huang'
  2. import os
  3. import logging
  4. import sys
  5. from gensim.corpora import WikiCorpus
  6. if __name__=='__main__':
  7. program = os.path.basename(sys.argv[0])
  8. logger = logging.getLogger(program)
  9. logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
  10. logging.root.setLevel(level=logging.INFO)
  11. if len(sys.argv) < 3:
  12. print(globals()['__doc__'] %locals())
  13. sys.exit(1)
  14. inp, outp = sys.argv[1:3]
  15. space = ' '
  16. i = 0
  17. output = open(outp, 'w')
  18. wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
  19. for text in wiki.get_texts():
  20. output.write(space.join(text) + '\n')
  21. i = i + 1
  22. if i % 10000 == 0:
  23. logger.info('Saved ' + str(i) + ' articles')
  24. output.close()
  25. logger.info('Finished ' + str(i) + ' articles')


  1. python process_wiki.py enwiki-latest-pages-articles.xml.bz2 wiki.en.text


  1. (C:\Anaconda3) E:\NLP\word2vec-for-wiki-master>python process_wiki.py enwiki-latest-pages-articles.xml.bz2 wiki.en.text
  2. Traceback (most recent call last):
  3. File "process_wiki.py", line 30, in <module>
  4. output.write(space.join(text).decode() + '\n')
  5. TypeError: sequence item 0: expected str instance, bytes found


  1. Traceback (most recent call last):
  2. File "process_wiki.py", line 30, in <module>
  3. output.write(bytes.join(space,text).decode() + '\n')
  4. TypeError: descriptor 'join' requires a 'bytes' object but received a 'str'
  5. (C:\Anaconda3) E:\NLP\word2vec-for-wiki-master>python process_wiki.py enwiki-latest-pages-articles.xml.bz2 wiki.en.text
  6. Traceback (most recent call last):
  7. File "process_wiki.py", line 30, in <module>
  8. output.write(bytes.join(space.encode(),text).decode() + '\n')
  9. UnicodeEncodeError: 'gbk' codec can't encode character '\u1f00' in position 1714: illegal multibyte sequence
  10. (C:\Anaconda3) E:\NLP\word2vec-for-wiki-master>python process_wiki.py enwiki-latest-pages-articles.xml.bz2 wiki.en.text
  11. Traceback (most recent call last):
  12. File "process_wiki.py", line 30, in <module>
  13. output.write(bytes.join(''.encode(),text).decode() + '\n')
  14. UnicodeEncodeError: 'gbk' codec can't encode character '\u1f00' in position 1474: illegal multibyte sequence
  15. (C:\Anaconda3) E:\NLP\word2vec-for-wiki-master>python process_wiki.py enwiki-latest-pages-articles.xml.bz2 wiki.en.text
  16. Traceback (most recent call last):
  17. File "process_wiki.py", line 30, in <module>
  18. output.write(bytes.join(b'',text).decode() + '\n')
  19. UnicodeEncodeError: 'gbk' codec can't encode character '\u1f00' in position 1474: illegal multibyte sequence


  1. space = ' '.encode()


  1. data = space.join(text)
  2. print(data)
  3. #output.write(str(data) + '\n')


  1. data = space.join(text)
  2. output.write(str(data) + '\n')


  1. (C:\Anaconda3) E:\NLP\word2vec-for-wiki-master>python process_wiki.py enwiki-latest-pages-articles.xml.bz2 wiki.en.text
  2. 2016-07-28 10:48:11,057: INFO: Saved 10000 articles
  3. 2016-07-28 10:49:44,660: INFO: Saved 20000 articles
  4. 2016-07-28 10:51:04,023: INFO: Saved 30000 articles
  5. 2016-07-28 10:52:13,199: INFO: Saved 40000 articles
  6. 2016-07-28 10:53:07,548: INFO: Saved 50000 articles
  7. 2016-07-28 10:53:45,695: INFO: Saved 60000 articles
  8. 2016-07-28 10:54:18,993: INFO: Saved 70000 articles
  9. 2016-07-28 10:54:51,188: INFO: Saved 80000 articles
  10. 2016-07-28 10:55:50,520: INFO: Saved 90000 articles
  11. ·
  12. ·
  13. ·
  14. ·
  15. 2016-07-28 15:24:22,182: INFO: Saved 4040000 articles
  16. 2016-07-28 15:25:09,770: INFO: Saved 4050000 articles
  17. 2016-07-28 15:25:46,915: INFO: Saved 4060000 articles
  18. 2016-07-28 15:26:24,892: INFO: Saved 4070000 articles
  19. 2016-07-28 15:27:05,343: INFO: Saved 4080000 articles
  20. 2016-07-28 15:27:48,280: INFO: Saved 4090000 articles
  21. 2016-07-28 15:28:22,146: INFO: finished iterating over Wikipedia corpus of 4099408 documents with 2229304913 positions (total 16753779 articles, 2290359456 positions before pruning articles shorter than 50 words)
  22. 2016-07-28 15:28:22,155: INFO: Finished 4099408 articles


  1. python train_word2vec_model.py wiki.en.text wiki.en.text.model wiki.en.text.vector


  1. 2016-07-28 15:47:35,297: INFO: running train_word2vec_model.py wiki.en.text wiki.en.text.model wiki.en.text.vector
  2. 2016-07-28 15:47:35,302: INFO: collecting all words and their counts
  3. 2016-07-28 15:47:35,370: INFO: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
  4. 2016-07-28 15:48:05,500: INFO: PROGRESS: at sentence #10000, processed 29336126 words, keeping 434884 word types
  5. 2016-07-28 15:48:39,042: INFO: PROGRESS: at sentence #20000, processed 55594275 words, keeping 628122 word types


  1. python process_wiki.py zhwiki-latest-pages-articles.xml.bz2 wiki.zh.text
  1. E:\NLP\word2vec-for-wiki-master>python process_wiki.py zhwiki-latest-pages-articles.xml.bz2 wiki.zh.text
  2. 2016-07-28 16:28:21,686: INFO: Saved 10000 articles
  3. 2016-07-28 16:29:07,536: INFO: Saved 20000 articles


  1. python train_word2vec_model.py text8 text8.model text8.vector.


  1. 2016-07-28 20:03:42,295: INFO: PROGRESS: at 99.82% examples, 405001 words/s, in_qsize 12, out_qsize 3
  2. 2016-07-28 20:03:42,435: INFO: worker thread finished; awaiting finish of 7 more threads
  3. 2016-07-28 20:03:42,445: INFO: worker thread finished; awaiting finish of 6 more threads
  4. 2016-07-28 20:03:42,445: INFO: worker thread finished; awaiting finish of 5 more threads
  5. 2016-07-28 20:03:42,445: INFO: worker thread finished; awaiting finish of 4 more threads
  6. 2016-07-28 20:03:42,465: INFO: worker thread finished; awaiting finish of 3 more threads
  7. 2016-07-28 20:03:42,495: INFO: worker thread finished; awaiting finish of 2 more threads
  8. 2016-07-28 20:03:42,495: INFO: worker thread finished; awaiting finish of 1 more threads
  9. 2016-07-28 20:03:42,505: INFO: worker thread finished; awaiting finish of 0 more threads
  10. 2016-07-28 20:03:42,505: INFO: training on 85026035 raw words (62532401 effective words) took 154.3s, 405163 effective words/s
  11. 2016-07-28 20:03:42,505: INFO: saving Word2Vec object under text8.model, separately None
  12. 2016-07-28 20:03:42,505: INFO: storing numpy array 'syn0' to text8.model.syn0.npy
  13. 2016-07-28 20:03:43,506: INFO: not storing attribute syn0norm
  14. 2016-07-28 20:03:43,506: INFO: not storing attribute cum_table
  15. 2016-07-28 20:03:43,506: INFO: storing numpy array 'syn1neg' to text8.model.syn1neg.npy
  16. 2016-07-28 20:03:45,225: INFO: storing 71290x400 projection weights into text8.vector.


  1. In [1]: import gensim
  2. In [2]: model = gensim.models.Word2Vec.l
  3. gensim.models.Word2Vec.load gensim.models.Word2Vec.log_accuracy
  4. gensim.models.Word2Vec.load_word2vec_format
  5. In [2]: model = gensim.models.Word2Vec.looad('text8.model')
  6. In [3]: model = gensim.models.Word2Vec.load('text8.model')
  7. In [4]: model.mo
  8. model.most_similar model.most_similar_cosmul
  9. In [4]: model.most_similar('man')
  10. Out[4]:
  11. [('woman', 0.6650575399398804),
  12. ('girl', 0.5865204334259033),
  13. ('creature', 0.5350353717803955),
  14. ('boy', 0.510942816734314),
  15. ('person', 0.5094308257102966),
  16. ('men', 0.5073959827423096),
  17. ('evil', 0.48292240500450134),
  18. ('totoro', 0.47985178232192993),
  19. ('god', 0.476554274559021),
  20. ('vanity', 0.47478240728378296)]
  21. In [5]: model.most_similar('girl')
  22. Out[5]:
  23. [('blonde', 0.7728073596954346),
  24. ('baby', 0.7689986824989319),
  25. ('kid', 0.7603048086166382),
  26. ('woman', 0.7313079833984375),
  27. ('girls', 0.7117128968238831),
  28. ('boy', 0.6976305246353149),
  29. ('joey', 0.6945637464523315),
  30. ('boys', 0.6894382238388062),
  31. ('bride', 0.685029149055481),
  32. ('rabbit', 0.6838369369506836)]


  1. opencc -help


  1. Open Chinese Convert (OpenCC) Command Line Tool
  2. Version 0.4.2
  3. Author: BYVoid <byvoid@byvoid.com>
  4. Bug Report: http://github.com/BYVoid/OpenCC/issues
  5. Usage:
  6. opencc [Options]
  7. Options:
  8. -i [file], --input=[file] Read original text from [file].
  9. -o [file], --output=[file] Write converted text to [file].
  10. -c [file], --config=[file] Load configuration of conversion from [file].
  11. -v, --version Print version and build information.
  12. -h, --help Print this help.
  13. With no input file, reads standard input and writes converted stream to standard output.
  14. Default configuration(zhs2zht.ini) will be loaded if not set.


  1. opencc -i wiki.zh.text -o wiki.zh.text.jian -c zht2zhs.ini


  1. pip install jieba


  1. python separate_words.py wiki.zh.text.jian wiki.zh.text.jian.seq


  1. python train_word2vec_model.py wiki.zh.text.jian.seq wiki.zh.text.model wiki.zh.text.vector

