@Scrazy 2017-04-01

Text Clustering of Sina Blog Posts



Preface

These are programs I wrote before the Chinese New Year to help a QQ friend finish a thesis report. They still need polishing.


Below are the programs used for the report, all written in Python. Four programs were written to cover the different steps; a short sketch of how they chain together follows the list.

  1. article_base_info.py crawls each article's basic information: title, link, author, and publication date
  2. article_content_gevent.py crawls the article content
  3. text_category.py classifies the articles
  4. format_data.py formats the data
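
For orientation, here is a minimal sketch of running the scripts in order. The interpreter command and the assumption that all files sit in the current directory are mine, not part of the original programs; format_data.py is only imported by text_category.py, so it is not run on its own.

    # run_all.py -- hypothetical glue script, not one of the four programs below
    import subprocess

    # Order: crawl article metadata -> crawl article bodies -> train and classify.
    for script in ['article_base_info.py',
                   'article_content_gevent.py',
                   'text_category.py']:
        subprocess.call(['python', script])   # assumes every script is in the cwd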

The program code follows.

    # coding: utf-8
    '''
    Program: article_base_info.py
    1. Crawls the basic information of Sina blog articles (title, link, author,
       publication date) from the given range of list pages.
    2. The data is saved to MongoDB.
    '''
    import re
    import concurrent.futures

    import requests
    from bs4 import BeautifulSoup as bs
    from pymongo import MongoClient


    def fetch(url):
        res = requests.get(url)
        res.encoding = 'gbk'
        content = bs(res.text, 'lxml')
        return content


    def base_info(html):
        # Pull the article links, titles, authors and dates out of one list page.
        pattern = re.compile(r'http://blog.sina.com.cn/s/blog_.*\.html')
        links = re.findall(pattern, str(html))
        date_ = re.findall(r'\((\d{2,}.*)\)', str(html))
        tle_auth = html.select('li')
        authes = (auth.text.split(' ')[0] for auth in tle_auth)
        titles = (title.text.split(' ')[-1] for title in tle_auth)
        for infos in zip(links, titles, authes, date_):
            yield infos


    def save(url):
        html = fetch(url)
        data = base_info(html)
        client = MongoClient('localhost', 27017)
        db = client.infos
        coll = db.coll
        for d in data:
            datum = {
                'links': d[0],
                'title': d[1],
                'author': d[2],
                'date': d[3]
            }
            # Only insert links that are not already in the collection.
            count = coll.find({'links': d[0]}).count()
            if count == 0:
                coll.insert_one(datum)
        print('{} is grabbed'.format(url))


    if __name__ == '__main__':
        url = 'http://roll.blog.sina.com.cn/list/other/index_{}.shtml'
        start = input('请输入开始页数, 默认为1 >> ').strip()
        start = int(start) if start else 1
        end = input('输入结束页数, 默认为100 >> ').strip()
        end = int(end) if end else 100
        pages = range(start, end + 1)
        urls = [url.format(page) for page in pages]
        with concurrent.futures.ProcessPoolExecutor(max_workers=6) as executor:
            executor.map(save, urls)
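
The next program reads its article links from a CSV file, while the script above writes to MongoDB, so the metadata has to be exported (and, per the report, filtered) into a CSV in between. A minimal export sketch, assuming the same database and collection names as above and the column order the later scripts expect (link in column 0, title in column 1); the output filename is only a placeholder:

    # export_links.py -- hypothetical helper bridging MongoDB and the CSV input below
    import csv

    from pymongo import MongoClient

    coll = MongoClient('localhost', 27017).infos.coll   # database/collection from article_base_info.py

    with open('所有博客数据.csv', 'w', newline='') as f:  # placeholder output path
        writer = csv.writer(f)
        for doc in coll.find():
            # column 0: link, column 1: title -- the order that links_get() and
            # get_artical_title() index into later
            writer.writerow([doc['links'], doc['title'], doc.get('author', ''), doc['date']])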

    # -*- coding: utf-8 -*-
    '''
    Program: article_content_gevent.py
    1. Fetches the body text of Sina blog articles.
    2. The article links are read from *筛选后所有博客数据.csv*, which is produced from
       the data crawled by article_base_info.py.
    3. Because authors delete posts (among other reasons), some links still exist even
       though the article itself is gone.
    '''
    import os
    import csv
    import logging

    import gevent
    from gevent import monkey
    monkey.patch_all()   # make requests' blocking I/O cooperative with gevent

    import requests
    from bs4 import BeautifulSoup as bs


    def fetch(url):
        res = requests.get(url)
        res.encoding = 'utf-8'
        content = bs(res.text, 'lxml')
        if not content:
            logging.warning('The blog has been deleted!')
        return content


    def content_get(html):
        # The article body sits in the #sina_keyword_ad_area2 element; if it is
        # missing, the post has been deleted and a blank string is returned instead.
        try:
            article = html.select('#sina_keyword_ad_area2')[0].text.strip()
        except IndexError as e:
            print(e)
            logging.warning('the page is None')
            article = ' '
        return article


    def links_get(filename):
        # The article links are stored in the first CSV column.
        with open(filename, 'r') as csvfile:
            logging.info('read the file {}'.format(filename))
            reader = csv.reader(csvfile)
            urls = [row[0] for row in reader]
        return urls


    def download(url):
        html = fetch(url)
        article = content_get(html)
        # Name each text file after the id part of the blog URL.
        with open('/home/mouse/Documents/artical/{}.txt'
                  .format(url[-12:-5]), 'w') as f:
            f.write(article)
        logging.info('writing the {}'.format(url))


    if __name__ == '__main__':
        logging.basicConfig(format='%(asctime)s %(message)s',
                            level=logging.WARNING)
        filename = '/home/mouse/我的坚果云/董姐的论文所需/筛选后所有博客数据.csv'
        urls = links_get(filename)
        if not os.path.isdir('/home/mouse/Documents/artical/'):
            os.makedirs('/home/mouse/Documents/artical/')
        threads = [gevent.spawn(download, url) for url in urls]
        gevent.joinall(threads)
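
One note on the script above: spawning one greenlet per link fires every request more or less at once, which can be rough on the server when the CSV holds thousands of URLs. If that becomes a problem, gevent's Pool can cap the number of in-flight downloads; the pool size of 20 below is an arbitrary choice:

    # Bounded-concurrency variant of the __main__ block above.
    from gevent.pool import Pool

    pool = Pool(20)            # at most 20 downloads running at once (assumed value)
    pool.map(download, urls)   # blocks until every URL has been handled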

    # coding=utf-8
    '''
    Program: text_category.py
    1. Automatically classifies the articles crawled from Sina blogs.
    2. The classification library comes from https://github.com/2shou/TextGrocery
    3. Workflow: read the crawled article titles -> classify them -> write out the
       titles together with their categories.
    '''
    import os
    import csv

    from tgrocery import Grocery
    from train_txt import train_src   # train_src: (category label, text) training pairs, cf. format_data.py below


    def category(title_lst, cates=None):
        # Classify every title; relies on the trained model `new_grocery`
        # created in the __main__ block.
        if cates is None:
            cates = []
        for title in title_lst:
            cate = new_grocery.predict(title)
            cates.append(cate.predicted_y)
        return cates


    def get_artical_title(filename, title_lst=None):
        # Read the article titles (second CSV column) from the input file.
        if title_lst is None:
            title_lst = []
        with open(filename, 'r') as f1:
            f1_csv = csv.reader(f1)
            title_lst = [row[1] for row in f1_csv]
        return title_lst


    def write_cated_info(filename, new_filename):
        # Append the predicted category to every row and write the result out.
        titles = get_artical_title(filename)
        categ = category(titles)
        with open(filename, 'r') as read_file, open(new_filename, 'a+') as write_file:
            reader = csv.reader(read_file)
            writer = csv.writer(write_file)
            for i, row in enumerate(reader):
                row.append(categ[i])
                writer.writerow(row)
                print('writing the {} item'.format(i))
        print('Done....................')


    if __name__ == "__main__":
        # filename and new_filename are the input and output paths. Point them at
        # other data files to classify different data, as long as the files follow
        # the same format.
        filename = '/home/mouse/我的坚果云/董姐的论文所需/female7.csv'
        new_filename = '/home/mouse/我的坚果云/董姐的论文所需/female7_2.csv'
        if os.path.isfile(new_filename):
            os.remove(new_filename)
        grocery = Grocery('sample')
        grocery.train(train_src)
        grocery.save()
        new_grocery = Grocery('sample')
        new_grocery.load()
        write_cated_info(filename, new_filename)
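
To see how the titles were distributed across categories after classification, the appended last column of the output file can be tallied. A small sketch, assuming the classified CSV produced by write_cated_info above (category in the final column):

    # Count how many titles ended up in each category.
    import csv
    from collections import Counter

    with open('/home/mouse/我的坚果云/董姐的论文所需/female7_2.csv', 'r') as f:
        counts = Counter(row[-1] for row in csv.reader(f) if row)

    for label, n in counts.most_common():
        print('{}: {}'.format(label, n))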

    # -*- coding: utf-8 -*-
    '''
    Program: format_data.py
    A helper program for *标题整理数据.xlsx*: the spreadsheet is exported to CSV, and
    this module turns every column of every row into (category label, text) pairs
    for training the classifier.
    '''
    import csv
    from collections import namedtuple

    # The fourteen category labels, in the same order as the CSV columns.
    cate = ['社会冲突和问题', '毛泽东思想与政策', '政党与政府设置', '民主与法治', '民族和国际关系',
            '媒体与言论自由', '资本主义与市场经济', '全球化和对外开放', '民生与福利',
            '家庭冲突与伦理', '传统文化', '性与个人自由', '环境污染', '生态保护']

    # One field per CSV column, matching the order of `cate`.
    Category = namedtuple(
        'Category', 'social mao govm demcy nation media capi glob live home tran sex env eco')

    filename = '/home/mouse/我的坚果云/董姐的论文所需/标题整理数据2.csv'


    def train_text(filename, train_src=None):
        if train_src is None:
            train_src = []

        def format_cate():
            with open(filename, 'r') as f:
                for emp in map(Category._make, csv.reader(f)):
                    # Pair each category label with the text in the matching column.
                    yield tuple(zip(cate, emp))

        for cat in format_cate():
            train_src.extend(cat)
        return train_src
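
For reference, train_text expects every CSV row to hold fourteen text columns, one per category and in the same order as the cate list; each column becomes one (label, text) training pair. A tiny end-to-end example; the sample titles and the temporary file name are invented purely for illustration:

    # -*- coding: utf-8 -*-
    # Write one hand-made 14-column row and feed it through train_text.
    import csv

    from format_data import train_text   # assumes the program above is saved as format_data.py

    sample_row = ['群体性事件频发', '重读毛选', '机构改革方案', '司法独立讨论', '边疆民族政策',
                  '媒体监督', '市场化改革', '加入世贸之后', '养老金缺口', '婆媳矛盾',
                  '国学热', '同性婚姻', '雾霾治理', '湿地保护']   # one short title per category

    with open('sample_train.csv', 'w') as f:      # hypothetical temporary file
        csv.writer(f).writerow(sample_row)

    train_src = train_text('sample_train.csv')
    print(train_src[:2])
    # [('社会冲突和问题', '群体性事件频发'), ('毛泽东思想与政策', '重读毛选')]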

All of the programs above were written by me and ran successfully on my own machine, but they have not been tested on other machines or platforms. Because of the various dependency and compatibility issues involved, and the limits of my own ability, I cannot guarantee that they will run correctly for anyone else.
