@hainingwyx
2018-04-26T08:37:05.000000Z
Python Crawlers
Description: findall matches every piece of content that fits the pattern and returns a list of the results.

| Expression | Effect |
|---|---|
| `re.findall('x.', code)` | Returns a list of two-character strings that start with 'x' |
| `re.findall('x*', code)` | Returns a list of the runs of 'x'; positions with no match yield empty strings |
| `re.findall('xx.*x', code)` | Returns a list of the longest (greedy) strings that start with xx and end with x |
| `re.findall('xx.*?xx', code)` | Returns a list of all (non-greedy, shortest) strings that start and end with xx |
| `re.findall('xx(.*?)xx', code)` | Returns a list of only the content captured between xx and xx; matching does not cross line breaks |
| `re.findall('xx(.*?)xx', code, re.S)` | Same as above, but re.S lets the match span line breaks |
| `re.findall('xx(.*?)xx(.*?)xx', code)` | Returns a list of tuples, each holding the two captured strings of one match |
| `re.findall('(\d+)', code)` | Returns a list of every run of digits, as strings |
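
To make the table concrete, here is a small sketch (Python 2, like the rest of these notes); the sample string `code` is made up purely for illustration:

```python
import re

code = 'xxabcxxdefxx\nxx123xx'
print re.findall('x.', code)               # ['xx', 'xx', 'xx', 'xx', 'xx']
print re.findall('xx.*xx', code)           # greedy, per line: ['xxabcxxdefxx', 'xx123xx']
print re.findall('xx.*?xx', code)          # non-greedy: ['xxabcxx', 'xx123xx']
print re.findall('xx(.*?)xx', code)        # captured groups only: ['abc', '123']
print re.findall('xx(.*?)xx', code, re.S)  # re.S lets '.' match '\n' as well
print re.findall('(\d+)', code)            # ['123']
```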
Description: search matches and extracts the first piece of content that fits the pattern, returning a match object (not a list).

| Expression | Effect |
|---|---|
| `re.search('xx(.*?)xx(.*?)xx', code)` | Returns a match object |
| `re.search('xx(.*?)xx(.*?)xx', code).group(1)` | Returns the first captured string; group(2) returns the second |
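
A small sketch of search and group (the sample string is again made up for illustration):

```python
import re

code = 'xxJavaxxPythonxxGoxx'
m = re.search('xx(.*?)xx(.*?)xx', code)
print m           # a match object, not a list
print m.group(1)  # 'Java'   -- first captured group
print m.group(2)  # 'Python' -- second captured group
```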
Description: sub replaces the content that fits the pattern and returns the substituted string.

| Expression | Effect |
|---|---|
| `re.sub('xx(.*?)xx', 'bilibili', code)` | Replaces every match with the new string |
| `n = re.sub('xx(.*?)xx', 'bili%d'%123, code)` | Replaces every match with a new string built from text plus a number |
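
A small sketch of sub (the sample string is made up; 'bilibili' is simply the replacement text used in the table):

```python
import re

code = 'xxoldxx and xxstalexx'
print re.sub('xx(.*?)xx', 'bilibili', code)    # 'bilibili and bilibili'
n = re.sub('xx(.*?)xx', 'bili%d' % 123, code)  # build the replacement string first
print n                                        # 'bili123 and bili123'
```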
```python
# html is assumed to hold the page source (e.g. requests.get(url).text)
# When the page has only one <title>
title = re.search('<title>(.*?)</title>', html, re.S).group(1)
# Scrape the links
links = re.findall("<a href = '(.*?)'>", html, re.S)
```
```python
# Scrape the cover images of the JavaScript courses in imooc's front-end track
import re, requests

link = "http://www.imooc.com/course/list?c=javascript&page=1"
container = []
for i in range(1, 4):
    new_link = re.sub('page=\d+', 'page=%d' % i, link)
    html = requests.get(new_link)
    field = re.findall('<div class="moco-course-wrap">(.*?)</div>', html.text, re.S)
    for field_1 in field:
        pic_links = re.findall('src="(.*?)" height="124"', field_1, re.S)
        if len(pic_links) != 0:
            container.append(pic_links[0])

# The folder "pic from imooc" must be created in the parent directory beforehand
num = 1
for pic_link in container:
    print 'Downloading...', pic_link
    pic = requests.get(pic_link)
    save = open('pic from imooc\\' + str(num) + '.jpg', 'wb')
    save.write(pic.content)
    save.close()
    num += 1
```
```python
# Scrape article titles from a Japanese-learning site
import re
import requests

html = requests.get('http://tieba.baidu.com/f?ie=utf-8&kw=%E6%97%A5%E6%9C%AC')
# print html.text

# headers copied from the browser's Network panel
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
html = requests.get('http://jp.tingroom.com/rumen/ryrumen/', headers=headers)
html.encoding = 'utf-8'
# print html.text

field = re.findall('<li style=" font-size:14px;(.*?)</li>', html.text, re.S)
for item in field:
    title = re.search('style="color: #039;">(.*?)</a>', item, re.S).group(1)
    sub_title = re.search('style="color:#666666;">(.*?)</span>', item, re.S).group(1)
    print format(title, '40'), '\t', sub_title
```
```python
# Scrape course information from jikexueyuan.com
import requests, re

class spider(object):
    def source(self, url):
        # Download the page source
        html = requests.get(url)
        return html.text

    def pages(self, url, total_page):
        # Build the list of page URLs from the current page up to total_page
        now_page = int(re.search('pageNum=(\d+)', url, re.S).group(1))
        page_group = []
        for i in range(now_page, total_page + 1):
            link = re.sub('pageNum=\d+', 'pageNum=%s' % i, url)
            page_group.append(link)
        return page_group

    def get_class(self, source):
        # Split the page into one block of HTML per course
        every_class = re.findall('style="height: 88px;">(.*?)</div>', source, re.S)
        return every_class

    def getinfo(self, eachclass):
        # Extract title, intro, learner count, duration and level from one course block
        info = {}
        info['title'] = re.findall('>(.*?)</a></h2>', eachclass, re.S)[0].split('>')[1]
        info['intro'] = re.search('display: none;">(.*?)</p>', eachclass, re.S).group(1)
        info['people'] = re.search('<em class="learn-number">(.*?)</em>', eachclass, re.S).group(1).strip(" ")
        detail_field = re.findall('<em>(.*?)</em>', eachclass, re.S)
        info['time'] = detail_field[0].strip(" ")
        info['level'] = detail_field[1].strip(" ")
        return info

    def saveinfo(self, classinfo):
        # Append the collected course info to a text file
        f = open('jikexueyuan.txt', 'a')
        for each in classinfo:
            f.writelines('title: ' + each['title'].encode('utf-8') + '\n')
            f.writelines('content: ' + each['intro'].strip('\n').strip(' ').encode('utf-8') + '\n')
            f.writelines('learn_num: ' + each['people'].encode('utf-8') + '\n')
            f.writelines('classtime: ' + each['time'].strip('\n').encode('utf-8') + '\n')
            f.writelines('classlevel: ' + each['level'].encode('utf-8') + '\n\n')
        f.close()

if __name__ == "__main__":
    classinfo = []
    url = "http://www.jikexueyuan.com/course/?pageNum=1"
    spider_new = spider()
    pages = int(raw_input('How many pages do you want? '))
    all_links = spider_new.pages(url, pages)
    for link in all_links:
        print 'Crawling... ' + link
        html = spider_new.source(link)
        everyclass = spider_new.get_class(html)
        for each in everyclass:
            info = spider_new.getinfo(each)
            classinfo.append(info)
    spider_new.saveinfo(classinfo)
```
```python
from bs4 import BeautifulSoup

f = open('test.html', 'r')
content = f.read()
soup = BeautifulSoup(content, 'html.parser')
print soup
print soup.prettify()  # pretty-printed output

# select(): a leading dot selects by class, a leading hash selects by id
soup.select('.sister')            # list of tags whose class is "sister"
soup.select('#link1')             # list of tags whose id is "link1"
soup.select('#link1')[0]['href']  # extract the link with ['href']
soup.select('#link1')[0].text     # extract the text with .text

soup('p')
soup.find_all('p')  # the two calls above are equivalent
for text in [item.text for item in soup('p')]:
    print text

soup('head')
soup('body')
soup('title')
soup.title.text
soup.title.name  # the tag's name
for tag in soup.find_all(True):
    print tag.name

# returns only the first occurrence
soup.p
soup.find_all('p')  # returns every occurrence
soup.p['class']
soup.find_all('p', {"class": "story"})  # every <p> whose class is "story"
soup.find_all('p', {'class': 'story'})[0].find_all('a')
soup.a
soup.find_all('a', {'id': 'link3'})
soup.find(id='link3')
soup.find_all(['a', 'b'])  # every <a> and <b> tag
soup.get_text().split('\n')
```
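
The block above reads a local test.html that is not included in these notes. A minimal self-contained sketch with an assumed HTML snippet (borrowed from the classic "three sisters" example in the BeautifulSoup documentation), so the same calls can be tried directly:

```python
# The HTML string below is an assumption standing in for test.html
from bs4 import BeautifulSoup

html = '''
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="story">Once upon a time there were three little sisters:
  <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
  <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
  <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>.
</p>
</body></html>
'''
soup = BeautifulSoup(html, 'html.parser')
print soup.select('.sister')                       # all three <a> tags
print soup.select('#link1')[0]['href']             # http://example.com/elsie
print soup.select('#link1')[0].text                # Elsie
print soup.find_all('a', {'id': 'link3'})[0].text  # Tillie
print soup.title.text                              # The Dormouse's story
```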
```python
# Scrape a WeChat official-account article
import requests
from bs4 import BeautifulSoup

url = 'http://mp.weixin.qq.com/s?__biz=MzIxNTQ4NzAwNA==&mid=2247484008&idx=1&sn=dfa8a4a371dfbf4c2aa33574b2e99a25&scene=1&srcid=0824cvxNVK6X3pDWyK08Byhj#rd'
content = requests.get(url)
soup = BeautifulSoup(content.text, 'html.parser')

print soup.find('h2', {'class': 'rich_media_title'}).text.strip()
print soup.find('div', {'class': 'rich_media_meta_list'}).text.strip().replace('\n', ' ')
print soup.find('em').text
print soup.find('div', {'class': 'rich_media_content'}).text

# basic information
field = soup.find('div', {'class': 'rich_media_meta_list'})
title = soup.find('h2', {'class': 'rich_media_title'}).get_text().strip()
date = field.find(id='post-date').get_text()
content = soup.find('div', {'class': 'rich_media_content'}).get_text()
print title
print date
print content
```
```python
from lxml import etree

html = open('demo_1.html', 'r').read()
selector = etree.HTML(html)

content = selector.xpath('//ul[@id="good"]/li/text()')
for i in content:
    print i

content_1 = selector.xpath('//ul/li/text()')
for i in content_1:
    print i

link = selector.xpath('//a/@href')
for i in link:
    print i
```
```python
from lxml import etree

html = open('demo_2.html').read()
print html
selector = etree.HTML(html)

content_1 = selector.xpath('//body/div[1]/text()')
for content in content_1:
    print content

attr_2 = selector.xpath('//body/div[2]/@id')
for attr in attr_2:
    print attr

content = selector.xpath('//div[starts-with(@id, "test")]/text()')
for item in content:
    print item

attrs = selector.xpath('//div[starts-with(@id, "test")]/@id')
for attr in attrs:
    print attr

html = open('demo_3.html', 'r').read()
print html
selector = etree.HTML(html)

content = selector.xpath('//div[starts-with(@id, "test")]/text()')
for i in content:
    print i

selector = etree.HTML(html)
content = selector.xpath('//span[starts-with(@id, "test")]/text()')
for i in content:
    print i

field = selector.xpath('//div[@id="test"]')[0]
content = field.xpath('string(.)')
print content.replace('\n', '')
```
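
Since demo_2.html and demo_3.html are not included in these notes, here is a self-contained sketch with an assumed snippet that shows the two XPath ideas used above, starts-with() and string(.):

```python
# The HTML string below is an assumption standing in for the demo files
from lxml import etree

html = '''
<body>
  <div id="test-1">first div</div>
  <div id="test-2">second div</div>
  <div id="test">nested <span>text</span> inside</div>
</body>
'''
selector = etree.HTML(html)

# starts-with() picks every div whose id begins with "test"
print selector.xpath('//div[starts-with(@id, "test")]/text()')

# string(.) flattens a node and all of its children into one string
field = selector.xpath('//div[@id="test"]')[0]
print field.xpath('string(.)')   # 'nested text inside'
```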
```python
# coding:utf-8
# Scrape the first 50 pages of the "Python" bar on Baidu Tieba
import requests
from lxml import etree

url = 'http://tieba.baidu.com/f?kw=python&ie=utf-8&pn='
lst = []
url_1 = 'http://tieba.baidu.com'
page = int(raw_input(u'Enter a multiple of 50: '))
for i in range(0, page, 50):
    lst.append(url + str(i))

with open('level_1.txt', 'a') as f:
    for item in lst:
        link, time = ' ', ' '
        print u'Crawling... ' + str(item)
        html_1 = requests.get(item)
        selector_1 = etree.HTML(html_1.text)
        field = selector_1.xpath('//div[@class="t_con cleafix"]')
        for each in field:
            title = each.xpath('div[@class="col2_right j_threadlist_li_right "]/div')[0].xpath('string(.)').strip().split('\n')[0].strip()
            reply_num = each.xpath('div[@class="col2_left j_threadlist_li_left"]/span/text()')[0]
            author = each.xpath('div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_author pull_right"]/span/@title')[0].split(':')[1]
            try:
                time = each.xpath('div[@class="col2_right j_threadlist_li_right "]/div')[0].xpath('string(.)').strip().split('\n')[3].strip('\n')
            except Exception, e:
                print e
                time = ' '
            if len(each.xpath('div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_title pull_left j_th_tit "]/a/@href')) != 0:
                link = url_1 + each.xpath('div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_title pull_left j_th_tit "]/a/@href')[0]
            elif len(each.xpath('div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_title pull_left j_th_tit member_thread_title_frs "]/a/@href')) != 0:
                link = url_1 + each.xpath('div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_title pull_left j_th_tit member_thread_title_frs "]/a/@href')[0]
            f.write(title.encode('utf-8') + '\t' + str(reply_num) + '\t' + str(link) + '\t' + author.encode('utf-8') + '\t' + time.encode('utf-8'))
            f.write('\n')
            # progress indicator
            print time, title
```