@Scrazy
2017-04-01T13:44:15.000000Z
python
These are programs I wrote before the New Year to help a QQ friend finish a thesis report. They still need polishing.
Below are the programs used for the report, all written in Python. Four programs were written, one for each task:
article_base_info.py — scrapes the basic information of each article: title, link, author, and publication date
article_content_gevent.py — scrapes the article content
text_category.py — classifies the articles
format_data.py — formats the data

The program code follows.
# coding: utf-8
'''
Program: article_base_info.py
1. Scrapes the basic information of Sina Blog articles (title, link,
   author, publication date) for a given range of list pages.
2. Saves the data into MongoDB.
'''
import re
import concurrent.futures

import requests
from bs4 import BeautifulSoup as bs
from pymongo import MongoClient


def fetch(url):
    # Download one list page; the roll pages are GBK encoded
    res = requests.get(url)
    res.encoding = 'gbk'
    content = bs(res.text, 'lxml')
    return content


def base_info(html):
    # Extract article links, publication dates, authors and titles
    pattern = re.compile(r'http://blog\.sina\.com\.cn/s/blog_.*\.html')
    links = re.findall(pattern, str(html))
    date_ = re.findall(r'\((\d{2,}.*)\)', str(html))
    tle_auth = html.select('li')
    authes = (auth.text.split(' ')[0] for auth in tle_auth)
    titles = (title.text.split(' ')[-1] for title in tle_auth)
    for infos in zip(links, titles, authes, date_):
        yield infos


def save(url):
    # Scrape one list page and insert new records into MongoDB
    html = fetch(url)
    data = base_info(html)
    client = MongoClient('localhost', 27017)
    db = client.infos
    coll = db.coll
    for num, d in enumerate(data, 1):
        datum = {
            'links': d[0],
            'title': d[1],
            'auther': d[2],
            'date': d[3]
        }
        # Skip links that are already stored to avoid duplicates
        count = coll.find({'links': d[0]}).count()
        if count == 0:
            coll.insert_one(datum)
    print('{} is grabbed'.format(url))


if __name__ == '__main__':
    url = 'http://roll.blog.sina.com.cn/list/other/index_{}.shtml'
    start = input('请输入开始页数, 默认为1 >> ')
    start = int(start) if start else 1
    end = input('输入结束页数, 默认为100 >> ')
    end = int(end) if end else 100
    pages = range(start, end + 1)
    urls = [url.format(page) for page in pages]
    with concurrent.futures.ProcessPoolExecutor(max_workers=6) as executor:
        executor.map(save, urls)
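A note on how the pieces connect: article_base_info.py stores its results in MongoDB, while the next program reads its links from *筛选后所有博客数据.csv* (a hand-filtered file). The export step itself is not included in this post; below is a minimal sketch, assuming the same database and collection names as above ('infos' / 'coll') and a hypothetical output file name, of how the stored records could be dumped to a CSV with the link in the first column and the title in the second, which is the column order the later programs expect.

# coding: utf-8
# Sketch only (not part of the original post): export the MongoDB records
# saved by article_base_info.py into a CSV file. The output file name is
# hypothetical; the real 筛选后所有博客数据.csv was filtered by hand afterwards.
import csv

from pymongo import MongoClient


def export_to_csv(csv_path):
    client = MongoClient('localhost', 27017)
    coll = client.infos.coll  # same database/collection as in save() above
    with open(csv_path, 'w') as f:
        writer = csv.writer(f)
        for doc in coll.find():
            # column 0: link, column 1: title -- the order that
            # article_content_gevent.py and text_category.py read
            writer.writerow([doc['links'], doc['title'],
                             doc['auther'], doc['date']])


if __name__ == '__main__':
    export_to_csv('blog_data.csv')  # hypothetical output file name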
# -*- coding: utf-8 -*-
'''
Program: article_content_gevent.py
1. Scrapes the body text of the Sina Blog articles.
2. The article links are read from *筛选后所有博客数据.csv*; that .csv file is
   generated from the data scraped by article_base_info.py.
3. Because authors delete posts (or for other reasons), some links still
   exist even though the article itself is gone.
'''
import os
import csv
import logging

import requests
import gevent
from bs4 import BeautifulSoup as bs


def fetch(url):
    res = requests.get(url)
    res.encoding = 'utf-8'
    content = bs(res.text, 'lxml')
    if not content:
        logging.warning('The blog has been deleted!')
    return content


def content_get(html):
    # The article body lives in the #sina_keyword_ad_area2 element;
    # if it is missing, the article has probably been deleted
    try:
        artical = html.select('#sina_keyword_ad_area2')[0].text.strip()
    except IndexError as e:
        print(e)
        logging.warning('the page is None')
        artical = ' '
    return artical


def links_get(filename, urls=None):
    # Read the article links from the first column of the CSV file
    with open(filename, 'r') as csvfile:
        logging.info('read the file {}'.format(filename))
        reader = csv.reader(csvfile)
        if urls is None:
            urls = []
        urls = [row[0] for row in reader]
    return urls


def download(url):
    html = fetch(url)
    artical = content_get(html)
    # Use part of the URL as the output file name
    with open('/home/mouse/Documents/artical/{}.txt'.format(url[-12:-5]), 'w') as f:
        f.write(artical)
    logging.info('writing the {}'.format(url))


if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.WARNING)
    filename = '/home/mouse/我的坚果云/董姐的论文所需/筛选后所有博客数据.csv'
    urls = links_get(filename)
    if not os.path.isdir('/home/mouse/Documents/artical/'):
        os.makedirs('/home/mouse/Documents/artical/')
    threads = [gevent.spawn(download, url) for url in urls]
    gevent.joinall(threads)
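One caveat about this program: gevent only makes the downloads truly concurrent if the blocking socket calls inside requests are monkey-patched, and spawning one greenlet per link can fire off thousands of requests at once. The sketch below is not part of the original program; it shows the two usual adjustments, gevent.monkey.patch_all() applied before requests is imported, and a gevent.pool.Pool to cap concurrency (the pool size of 20 is an arbitrary choice).

# Sketch only: the patch must run before requests is imported,
# otherwise the greenlets block each other and run one at a time.
from gevent import monkey
monkey.patch_all()

import gevent
from gevent.pool import Pool

import requests  # imported after patching so its sockets cooperate


def download(url):
    # stand-in for the download() function defined above
    res = requests.get(url)
    return url, res.status_code


if __name__ == '__main__':
    urls = ['http://blog.sina.com.cn/s/blog_xxxxxxxx.html']  # hypothetical links
    pool = Pool(20)  # at most 20 requests in flight at once
    jobs = [pool.spawn(download, url) for url in urls]
    gevent.joinall(jobs)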
# coding=utf-8
'''
Program: text_category.py
1. Automatically classifies the articles scraped from Sina Blog.
2. The classification library comes from https://github.com/2shou/TextGrocery
3. Workflow: read the scraped article titles -> classify the articles ->
   write out the titles together with their categories.
'''
import os
import csv

from tgrocery import Grocery
from train_txt import train_src


def category(title_lst, cates=None):  # classify the articles by title
    if cates is None:
        cates = []
    for title in title_lst:
        cate = new_grocery.predict(title)
        cates.append(cate.predicted_y)
    return cates


def get_artical_title(filename, title_lst=None):  # read the titles from the CSV file
    if title_lst is None:
        title_lst = []
    with open(filename, 'r') as f1:
        f1_csv = csv.reader(f1)
        title_lst = [row[1] for row in f1_csv]
    return title_lst


def write_cated_info(filename, new_filename):  # write out the classified articles
    titles = get_artical_title(filename)
    categ = category(titles)
    with open(filename, 'r') as read_file:
        reader = csv.reader(read_file)
        for i, row in enumerate(reader):
            row.append(categ[i])
            with open(new_filename, 'a+') as write_file:
                writer = csv.writer(write_file)
                writer.writerow(row)
            print('writing the {} item'.format(i))
    print('Done....................')


if __name__ == "__main__":
    # filename and new_filename are the paths of the files to read from and write to.
    # Change the paths to classify other data sets, provided they follow the same format.
    filename = '/home/mouse/我的坚果云/董姐的论文所需/female7.csv'
    new_filename = '/home/mouse/我的坚果云/董姐的论文所需/female7_2.csv'
    if os.path.isfile(new_filename):
        os.remove(new_filename)
    grocery = Grocery('sample')
    grocery.train(train_src)
    grocery.save()
    new_grocery = Grocery('sample')
    new_grocery.load()
    write_cated_info(filename, new_filename)
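For readers who have not used TextGrocery: its interface is small. train() accepts a list of (label, text) pairs, which is exactly the shape of train_src produced by format_data.py below, and predict() returns a result whose predicted_y attribute holds the label. A toy example with two made-up titles (they are for illustration only, not real data from this project):

# coding: utf-8
# Toy example of the TextGrocery workflow used above; the two training
# samples are invented for illustration.
from tgrocery import Grocery

train_src = [
    ('环境污染', '雾霾天气持续 城市空气质量再引关注'),
    ('民生与福利', '养老金调整方案公布 退休人员待遇提高'),
]

grocery = Grocery('demo')
grocery.train(train_src)  # train on (label, text) pairs
grocery.save()            # persist the model to disk

new_grocery = Grocery('demo')
new_grocery.load()        # reload it, as text_category.py does
result = new_grocery.predict('空气污染治理需要长效机制')
print(result.predicted_y)  # the predicted category label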
# -*- coding: utf-8 -*-
'''
Program: format_data.py
A helper program that formats *标题整理数据.xlsx* (converted to CSV format)
into the training data used for classification.
'''
import csv
from collections import namedtuple

cate = ['社会冲突和问题', '毛泽东思想与政策', '政党与政府设置', '民主与法治',
        '民族和国际关系', '媒体与言论自由', '资本主义与市场经济', '全球化和对外开放',
        '民生与福利', '家庭冲突与伦理', '传统文化', '性与个人自由', '环境污染',
        '生态保护']

Category = namedtuple('Category', 'social mao govm demcy nation media capi glob live home tran sex env eco')

filename = '/home/mouse/我的坚果云/董姐的论文所需/标题整理数据2.csv'


def train_text(filename, train_src=None):
    if train_src is None:
        train_src = []

    def format_cate():
        # Each CSV row holds one sample title per category;
        # pair every title with its category label
        for emp in map(Category._make, csv.reader(open(filename, 'r'))):
            social = (cate[0], emp.social)
            mao = (cate[1], emp.mao)
            govm = (cate[2], emp.govm)
            demcy = (cate[3], emp.demcy)
            nation = (cate[4], emp.nation)
            media = (cate[5], emp.media)
            capi = (cate[6], emp.capi)
            glob = (cate[7], emp.glob)
            live = (cate[8], emp.live)
            home = (cate[9], emp.home)
            tran = (cate[10], emp.tran)
            sex = (cate[11], emp.sex)
            env = (cate[12], emp.env)
            eco = (cate[13], emp.eco)
            yield social, mao, govm, demcy, nation, media, capi, glob, \
                live, home, tran, sex, env, eco

    for cat in format_cate():
        train_src.extend(list(cat))
    return train_src
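text_category.py imports train_src from a module named train_txt that is not shown in this post; presumably it is built from the output of train_text() above. As a rough sketch (the module name format_data and the direct import are my assumptions here, not the original wiring), the two pieces could also be connected like this:

# coding: utf-8
# Hypothetical wiring (not in the original post): build the training list
# with train_text() and feed it straight to TextGrocery.
from tgrocery import Grocery

from format_data import train_text, filename

train_src = train_text(filename)  # [(category, title), (category, title), ...]
grocery = Grocery('sample')
grocery.train(train_src)
grocery.save()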
All of the programs above were written by me and ran successfully on my own machine. They have not been tested on other machines or platforms, so given the various dependency and compatibility issues, and the limits of my own ability, I cannot guarantee that they will run correctly for everyone else.