@hainingwyx
2018-04-26T16:37:05.000000Z
Python
Web Scraping
Description: `findall` matches every occurrence of the pattern and returns a list of the results.

Expression | Function |
---|---|
re.findall('x.', code) | Returns a list of two-character strings |
re.findall('x*', code) | Returns a list of the runs of the character 'x'; positions without a match yield an empty string |
re.findall('xx.*x', code) | Returns a list containing the longest (greedy) string that starts with xx and ends with x |
re.findall('xx.*?xx', code) | Returns a list of all (non-greedy) strings that start with xx and end with xx |
re.findall('xx(.*?)xx', code) | Returns a list of everything captured between xx and xx; . does not match newlines, so matching restarts on each line |
re.findall('xx(.*?)xx', code, re.S) | Returns a list of everything captured between xx and xx; with re.S, newlines are matched as well |
re.findall('xx(.*?)xx(.*?)xx', code) | Returns a list of tuples, each holding the two strings captured by this pattern |
re.findall('(\d+)', code) | Returns a list of all digit-only strings |
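A minimal runnable sketch of the `findall` behaviours above; the `code` string here is made up purely for illustration:

```python
# -*- coding: utf-8 -*-
import re

# Made-up test string, chosen so the xx...xx patterns have something to match.
code = 'hadkfalifexxIxx134xxlovexx23345sdfxxyouxxfjdkaf'

print(re.findall('xx.*?xx', code))          # non-greedy: ['xxIxx', 'xxlovexx', 'xxyouxx']
print(re.findall('xx.*x', code))            # greedy: a single longest match
print(re.findall('xx(.*?)xx', code))        # captured groups only: ['I', 'love', 'you']
print(re.findall('(\d+)', code))            # digit runs: ['134', '23345']

# re.S controls whether . matches newline characters.
text = 'xxabc\ndefxx'
print(re.findall('xx(.*?)xx', text))        # [] because . does not cross the newline
print(re.findall('xx(.*?)xx', text, re.S))  # ['abc\ndef']
```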
Description: `search` matches and extracts the first occurrence of the pattern and returns a match object.

Expression | Function |
---|---|
re.search('xx(.*?)xx(.*?)xx', code) | Returns a match object |
re.search('xx(.*?)xx(.*?)xx', code).group(1) | Returns the first string captured by the pattern; group(2) returns the second |
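A short sketch of `search` and `group`, reusing the same illustrative string:

```python
import re

code = 'hadkfalifexxIxx134xxlovexx23345sdfxxyouxxfjdkaf'  # made up for illustration

m = re.search('xx(.*?)xx(.*?)xx', code)
print(m.group(0))  # the whole first match: 'xxIxx134xx'
print(m.group(1))  # first captured group:  'I'
print(m.group(2))  # second captured group: '134'
```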
Description: `sub` replaces content that matches the pattern and returns the resulting string.

Expression | Function |
---|---|
re.sub('xx(.*?)xx', 'bilibili', code) | Replaces each match with the new string |
n = re.sub('xx(.*?)xx', 'bili%d'%123, code) | Replaces each match with a new string built from text plus a number |
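And a short sketch of `sub`, again on the same made-up string:

```python
import re

code = 'hadkfalifexxIxx134xxlovexx23345sdfxxyouxxfjdkaf'  # made up for illustration

# Every xx...xx block is replaced by the literal replacement string.
print(re.sub('xx(.*?)xx', 'bilibili', code))

# The replacement can be built first, e.g. a string plus a number.
n = re.sub('xx(.*?)xx', 'bili%d' % 123, code)
print(n)
```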
# When the page has only one title
title = re.search('<title>(.*?)</title>', html, re.S).group(1)
# Scrape the links
links = re.findall("<a href = '(.*?)'>", html, re.S)
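A self-contained check of the two patterns above, applied to a tiny made-up HTML string (the `html` variable and its content are just an assumption here):

```python
import re

html = ("<html><title>Demo Page</title><body>"
        "<a href = 'http://example.com/1'>one</a>"
        "<a href = 'http://example.com/2'>two</a>"
        "</body></html>")

title = re.search('<title>(.*?)</title>', html, re.S).group(1)
links = re.findall("<a href = '(.*?)'>", html, re.S)
print(title)  # Demo Page
print(links)  # ['http://example.com/1', 'http://example.com/2']
```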
# Scrape the cover images of the JavaScript courses under imooc's front-end category
import re, requests

link = "http://www.imooc.com/course/list?c=javascript&page=1"
container = []
for i in range(1, 4):
    # Rewrite the page number in the URL to walk through pages 1-3
    new_link = re.sub('page=\d+', 'page=%d'%i, link)
    html = requests.get(new_link)
    field = re.findall('<div class="moco-course-wrap">(.*?)</div>', html.text, re.S)
    for field_1 in field:
        pic_links = re.findall('src="(.*?)" height="124"', field_1, re.S)
        if len(pic_links) != 0:
            container.append(pic_links[0])

# The folder "pic from imooc" must already exist in the parent directory
num = 1
for pic_link in container:
    print 'Downloading...', pic_link
    pic = requests.get(pic_link)
    save = open('pic from imooc\\' + str(num) + '.jpg', 'wb')
    save.write(pic.content)
    save.close()
    num += 1
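If you would rather not create the folder by hand, a small sketch using the standard os module (an addition, not part of the original script) can do it before the download loop:

```python
import os

# Create the output folder used above if it does not exist yet.
if not os.path.isdir('pic from imooc'):
    os.makedirs('pic from imooc')
```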
## Scrape the titles from a Japanese-learning site
import requests
import re

html = requests.get('http://tieba.baidu.com/f?ie=utf-8&kw=%E6%97%A5%E6%9C%AC')
#print html.text

# Copy the headers from the browser's Network panel
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
html = requests.get('http://jp.tingroom.com/rumen/ryrumen/', headers = headers)
html.encoding = 'utf-8'
#print html.text

field = re.findall('<li style=" font-size:14px;(.*?)</li>', html.text, re.S)
for item in field:
    title = re.search('style="color: #039;">(.*?)</a>', item, re.S).group(1)
    sub_title = re.search('style="color:#666666;">(.*?)</span>', item, re.S).group(1)
    print format(title, '40'), '\t', sub_title
# Scrape course information from jikexueyuan.com
import requests, re

class spider(object):
    def source(self, url):
        # Download the page and return its HTML text
        html = requests.get(url)
        return html.text

    def pages(self, url, total_page):
        # Build the list of page URLs from the current page up to total_page
        now_page = int(re.search('pageNum=(\d+)', url, re.S).group(1))
        page_group = []
        for i in range(now_page, total_page + 1):
            link = re.sub('pageNum=\d+', 'pageNum=%s' % i, url)
            page_group.append(link)
        return page_group

    def get_class(self, source):
        # Cut the page into one HTML fragment per course
        every_class = re.findall('style="height: 88px;">(.*?)</div>', source, re.S)
        return every_class

    def getinfo(self, eachclass):
        # Extract title, introduction, learner count, duration and level from one fragment
        info = {}
        info['title'] = re.findall('>(.*?)</a></h2>', eachclass, re.S)[0].split('>')[1]
        info['intro'] = re.search('display: none;">(.*?)</p>', eachclass, re.S).group(1)
        info['people'] = re.search('<em class="learn-number">(.*?)</em>', eachclass, re.S).group(1).strip(" ")
        detail_field = re.findall('<em>(.*?)</em>', eachclass, re.S)
        info['time'] = detail_field[0].strip(" ")
        info['level'] = detail_field[1].strip(" ")
        return info

    def saveinfo(self, classinfo):
        # Append every course record to a text file
        f = open('jikexueyuan.txt', 'a')
        for each in classinfo:
            f.writelines('title: ' + each['title'].encode('utf-8') + '\n')
            f.writelines('content: ' + each['intro'].strip('\n').strip(' ').encode('utf-8') + '\n')
            f.writelines('learn_num: ' + each['people'].encode('utf-8') + '\n')
            f.writelines('classtime: ' + each['time'].strip('\n').encode('utf-8') + '\n')
            f.writelines('classlevel: ' + each['level'].encode('utf-8') + '\n\n')
        f.close()

if __name__ == "__main__":
    classinfo = []
    url = "http://www.jikexueyuan.com/course/?pageNum=1"
    spider_new = spider()
    pages = int(raw_input('How many pages do you want? '))
    all_links = spider_new.pages(url, pages)
    for link in all_links:
        print 'Crawling... ' + link
        html = spider_new.source(link)
        everyclass = spider_new.get_class(html)
        for each in everyclass:
            info = spider_new.getinfo(each)
            classinfo.append(info)
    spider_new.saveinfo(classinfo)
from bs4 import BeautifulSoup
f = open('test.html', 'r')
content = f.read()
soup = BeautifulSoup(content, 'html.parser')
print soup
print soup.prettify() # pretty-printed output

# select: use '.' for a class and '#' for an id
soup.select('.sister') # returns a list of elements whose class is "sister"
soup.select('#link1') # returns a list of elements whose id is "link1"
soup.select('#link1')[0]['href'] # extract the link with ['href']
soup.select('#link1')[0].text # extract the text with .text

soup('p')
soup.find_all('p') # the two calls are equivalent
for text in [item.text for item in soup('p')]: print text

soup('head')
soup('body')
soup('title')
soup.title.text
soup.title.name # prints the tag name

for tag in soup.find_all(True):
    print tag.name

# returns only the first occurrence
soup.p
soup.find_all('p') # returns every occurrence
soup.p['class']
soup.find_all('p', {"class": "story"}) # returns every <p> whose class is "story"
soup.find_all('p', {'class': 'story'})[0].find_all('a')
soup.a
soup.find_all('a', {'id': 'link3'})
soup.find(id = 'link3')
soup.find_all(['a', 'b']) # returns a list of <a> and <b> tags
soup.get_text().split('\n')
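test.html itself is not shown above; a self-contained sketch with an inline HTML snippet (made up here, shaped so that the .sister / #link1 selectors have something to hit) exercises the same calls:

```python
from bs4 import BeautifulSoup

# Inline stand-in for test.html (an assumption for illustration).
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="story">Once upon a time there were three sisters:
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>.
</p>
</body></html>
"""

soup = BeautifulSoup(html, 'html.parser')
print(soup.title.text)                           # The Dormouse's story
print(soup.select('#link1')[0]['href'])          # http://example.com/elsie
print([a.text for a in soup.select('.sister')])  # the three names
print(soup.find(id='link3').text)                # Tillie
```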
# Scrape a WeChat Official Account article
import requests
from bs4 import BeautifulSoup

url = 'http://mp.weixin.qq.com/s?__biz=MzIxNTQ4NzAwNA==&mid=2247484008&idx=1&sn=dfa8a4a371dfbf4c2aa33574b2e99a25&scene=1&\
srcid=0824cvxNVK6X3pDWyK08Byhj#rd'
content = requests.get(url)
soup = BeautifulSoup(content.text, 'html.parser')
print soup.find('h2', {'class': 'rich_media_title'}).text.strip()
print soup.find('div', {'class': 'rich_media_meta_list'}).text.strip().replace('\n', ' ')
print soup.find('em').text
print soup.find('div', {'class': 'rich_media_content'}).text
# Basic information
field = soup.find('div', {'class': 'rich_media_meta_list'})
title = soup.find('h2', {'class': 'rich_media_title'}).get_text().strip()
date = field.find(id = 'post-date').get_text()
content = soup.find('div', {'class': 'rich_media_content'}).get_text()
print title
print date
print content
from lxml import etree

html = open('demo_1.html', 'r').read()
selector = etree.HTML(html)

# Text of the <li> items under the <ul> whose id is "good"
content = selector.xpath('//ul[@id="good"]/li/text()')
for i in content:
    print i

# Text of every <li> under any <ul>
content_1 = selector.xpath('//ul/li/text()')
for i in content_1:
    print i

# href attribute of every <a>
link = selector.xpath('//a/@href')
for i in link:
    print i
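demo_1.html is not included here; a self-contained sketch with an inline HTML string (made up for illustration) exercises the same XPath expressions:

```python
from lxml import etree

# Inline stand-in for demo_1.html (an assumption).
html = '''
<ul id="good">
    <li>milk</li>
    <li>bread</li>
</ul>
<ul id="other">
    <li><a href="http://example.com/eggs">eggs</a></li>
</ul>
'''

selector = etree.HTML(html)
print(selector.xpath('//ul[@id="good"]/li/text()'))  # ['milk', 'bread']
print(selector.xpath('//ul/li/text()'))              # direct text of every <li>
print(selector.xpath('//a/@href'))                   # ['http://example.com/eggs']
```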
from lxml import etree

html = open('demo_2.html').read()
print html
selector = etree.HTML(html)

# Text of the first <div> directly under <body>
content_1 = selector.xpath('//body/div[1]/text()')
for content in content_1: print content

# id attribute of the second <div> directly under <body>
attr_2 = selector.xpath('//body/div[2]/@id')
for attr in attr_2: print attr

# starts-with() selects elements whose attribute shares a common prefix
content = selector.xpath('//div[starts-with(@id, "test")]/text()')
for item in content: print item

attrs = selector.xpath('//div[starts-with(@id, "test")]/@id')
for attr in attrs: print attr
html = open('demo_3.html', 'r').read()
print html
selector = etree.HTML(html)

# With nested tags, text() only returns the text nodes that sit directly inside the element
content = selector.xpath('//div[starts-with(@id, "test")]/text()')
for i in content: print i

selector = etree.HTML(html)
content = selector.xpath('//span[starts-with(@id, "test")]/text()')
for i in content: print i

# string(.) concatenates all the text inside the element, nested tags included
field = selector.xpath('//div[@id="test"]')[0]
content = field.xpath('string(.)')
print content.replace('\n', '')
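demo_3.html is not shown either; a hedged sketch with an inline snippet (made up) shows the difference between text() and string(.) when the text is split by a nested tag:

```python
from lxml import etree

# Inline stand-in for demo_3.html: the sentence is interrupted by a nested <span>.
html = '<div id="test">I need <span id="test-span">a nested</span> sentence</div>'

selector = etree.HTML(html)
# text() only returns the fragments directly under the <div>
print(selector.xpath('//div[@id="test"]/text()'))  # ['I need ', ' sentence']
# string(.) on the element itself returns the full concatenated text
field = selector.xpath('//div[@id="test"]')[0]
print(field.xpath('string(.)'))                    # I need a nested sentence
```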
# coding:utf-8
# Scrape the first 50 pages of Baidu Tieba's "Python" bar
import requests
from lxml import etree

url = 'http://tieba.baidu.com/f?kw=python&ie=utf-8&pn='
lst = []
url_1 = 'http://tieba.baidu.com'
page = int(raw_input(u'Enter a multiple of 50: '))
for i in range(0, page, 50):
    lst.append(url + str(i))

with open('level_1.txt', 'a') as f:
    for item in lst:
        link, time = ' ', ' '
        print u'Crawling... ' + str(item)
        html_1 = requests.get(item)
        selector_1 = etree.HTML(html_1.text)
        # One <div class="t_con cleafix"> per thread in the list
        field = selector_1.xpath('//div[@class="t_con cleafix"]')
        for each in field:
            title = each.xpath('div[@class="col2_right j_threadlist_li_right "]/div')[0].xpath('string(.)').strip().split('\n')[0].strip()
            reply_num = each.xpath('div[@class="col2_left j_threadlist_li_left"]/span/text()')[0]
            author = each.xpath('div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_author pull_right"]/span/@title')[0].split(':')[1]
            try:
                time = each.xpath('div[@class="col2_right j_threadlist_li_right "]/div')[0].xpath('string(.)').strip().split('\n')[3].strip('\n')
            except Exception, e:
                print e
                time = ' '
            # Ordinary threads and member threads use slightly different title classes
            if len(each.xpath('div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_title pull_left j_th_tit "]/a/@href')) != 0:
                link = url_1 + each.xpath('div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_title pull_left j_th_tit "]/a/@href')[0]
            elif len(each.xpath('div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_title pull_left j_th_tit member_thread_title_frs "]/a/@href')) != 0:
                link = url_1 + each.xpath('div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_title pull_left j_th_tit member_thread_title_frs "]/a/@href')[0]
            f.write(title.encode('utf-8') + '\t' + str(reply_num) + '\t' + str(link) + '\t' + author.encode('utf-8') + '\t' + time.encode('utf-8'))
            f.write('\n')
            # Print as a crawl-progress indicator
            print time, title