@hainingwyx
2018-04-26T08:37:05.000000Z
Python Crawlers
Description: findall matches every piece of content that fits the pattern and returns a list of the results.

| Expression | Effect |
|---|---|
| `re.findall('x.', code)` | Returns a list of two-character strings that start with 'x' |
| `re.findall('x*', code)` | Returns a list of the runs of 'x'; positions with no match yield empty strings |
| `re.findall('xx.*x', code)` | Returns a list of the longest (greedy) strings that start with xx and end with x |
| `re.findall('xx.*?xx', code)` | Returns a list of all (non-greedy, shortest) strings that start and end with xx |
| `re.findall('xx(.*?)xx', code)` | Returns a list of only the content captured between xx and xx; matching does not cross line breaks |
| `re.findall('xx(.*?)xx', code, re.S)` | Same as above, but re.S lets the match span line breaks |
| `re.findall('xx(.*?)xx(.*?)xx', code)` | Returns a list of tuples, each holding the two captured strings of one match |
| `re.findall('(\d+)', code)` | Returns a list of every run of digits, as strings |
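
To make the table concrete, here is a small sketch (Python 2, like the rest of these notes); the sample string `code` is made up purely for illustration:

```python
import re

code = 'xxabcxxdefxx\nxx123xx'
print re.findall('x.', code)               # ['xx', 'xx', 'xx', 'xx', 'xx']
print re.findall('xx.*xx', code)           # greedy, per line: ['xxabcxxdefxx', 'xx123xx']
print re.findall('xx.*?xx', code)          # non-greedy: ['xxabcxx', 'xx123xx']
print re.findall('xx(.*?)xx', code)        # captured groups only: ['abc', '123']
print re.findall('xx(.*?)xx', code, re.S)  # re.S lets '.' match '\n' as well
print re.findall('(\d+)', code)            # ['123']
```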
Description: search matches and extracts the first piece of content that fits the pattern, returning a match object (not a list).

| Expression | Effect |
|---|---|
| `re.search('xx(.*?)xx(.*?)xx', code)` | Returns a match object |
| `re.search('xx(.*?)xx(.*?)xx', code).group(1)` | Returns the first captured string; group(2) returns the second |
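
A small sketch of search and group (the sample string is again made up for illustration):

```python
import re

code = 'xxJavaxxPythonxxGoxx'
m = re.search('xx(.*?)xx(.*?)xx', code)
print m           # a match object, not a list
print m.group(1)  # 'Java'   -- first captured group
print m.group(2)  # 'Python' -- second captured group
```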
Description: sub replaces the content that fits the pattern and returns the substituted string.

| Expression | Effect |
|---|---|
| `re.sub('xx(.*?)xx', 'bilibili', code)` | Replaces every match with the new string |
| `n = re.sub('xx(.*?)xx', 'bili%d'%123, code)` | Replaces every match with a new string built from text plus a number |
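
A small sketch of sub (the sample string is made up; 'bilibili' is simply the replacement text used in the table):

```python
import re

code = 'xxoldxx and xxstalexx'
print re.sub('xx(.*?)xx', 'bilibili', code)    # 'bilibili and bilibili'
n = re.sub('xx(.*?)xx', 'bili%d' % 123, code)  # build the replacement string first
print n                                        # 'bili123 and bili123'
```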
```python
# html is assumed to hold the page source (e.g. requests.get(url).text)
# When the page has only one <title>
title = re.search('<title>(.*?)</title>', html, re.S).group(1)
# Scrape the links
links = re.findall("<a href = '(.*?)'>", html, re.S)
```
```python
# Scrape the cover images of the JavaScript courses in imooc's front-end track
import re, requests

link = "http://www.imooc.com/course/list?c=javascript&page=1"
container = []
for i in range(1, 4):
    new_link = re.sub('page=\d+', 'page=%d' % i, link)
    html = requests.get(new_link)
    field = re.findall('<div class="moco-course-wrap">(.*?)</div>', html.text, re.S)
    for field_1 in field:
        pic_links = re.findall('src="(.*?)" height="124"', field_1, re.S)
        if len(pic_links) != 0:
            container.append(pic_links[0])

# The folder "pic from imooc" must be created in the parent directory beforehand
num = 1
for pic_link in container:
    print 'Downloading...', pic_link
    pic = requests.get(pic_link)
    save = open('pic from imooc\\' + str(num) + '.jpg', 'wb')
    save.write(pic.content)
    save.close()
    num += 1
```
```python
# Scrape article titles from a Japanese-learning site
import re
import requests

html = requests.get('http://tieba.baidu.com/f?ie=utf-8&kw=%E6%97%A5%E6%9C%AC')
# print html.text

# headers copied from the browser's Network panel
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
html = requests.get('http://jp.tingroom.com/rumen/ryrumen/', headers=headers)
html.encoding = 'utf-8'
# print html.text

field = re.findall('<li style=" font-size:14px;(.*?)</li>', html.text, re.S)
for item in field:
    title = re.search('style="color: #039;">(.*?)</a>', item, re.S).group(1)
    sub_title = re.search('style="color:#666666;">(.*?)</span>', item, re.S).group(1)
    print format(title, '40'), '\t', sub_title
```
```python
# Scrape course information from jikexueyuan.com
import requests, re

class spider(object):
    def source(self, url):
        # Download the page source
        html = requests.get(url)
        return html.text

    def pages(self, url, total_page):
        # Build the list of page URLs from the current page up to total_page
        now_page = int(re.search('pageNum=(\d+)', url, re.S).group(1))
        page_group = []
        for i in range(now_page, total_page + 1):
            link = re.sub('pageNum=\d+', 'pageNum=%s' % i, url)
            page_group.append(link)
        return page_group

    def get_class(self, source):
        # Split the page into one block of HTML per course
        every_class = re.findall('style="height: 88px;">(.*?)</div>', source, re.S)
        return every_class

    def getinfo(self, eachclass):
        # Extract title, intro, learner count, duration and level from one course block
        info = {}
        info['title'] = re.findall('>(.*?)</a></h2>', eachclass, re.S)[0].split('>')[1]
        info['intro'] = re.search('display: none;">(.*?)</p>', eachclass, re.S).group(1)
        info['people'] = re.search('<em class="learn-number">(.*?)</em>', eachclass, re.S).group(1).strip(" ")
        detail_field = re.findall('<em>(.*?)</em>', eachclass, re.S)
        info['time'] = detail_field[0].strip(" ")
        info['level'] = detail_field[1].strip(" ")
        return info

    def saveinfo(self, classinfo):
        # Append the collected course info to a text file
        f = open('jikexueyuan.txt', 'a')
        for each in classinfo:
            f.writelines('title: ' + each['title'].encode('utf-8') + '\n')
            f.writelines('content: ' + each['intro'].strip('\n').strip(' ').encode('utf-8') + '\n')
            f.writelines('learn_num: ' + each['people'].encode('utf-8') + '\n')
            f.writelines('classtime: ' + each['time'].strip('\n').encode('utf-8') + '\n')
            f.writelines('classlevel: ' + each['level'].encode('utf-8') + '\n\n')
        f.close()

if __name__ == "__main__":
    classinfo = []
    url = "http://www.jikexueyuan.com/course/?pageNum=1"
    spider_new = spider()
    pages = int(raw_input('How many pages do you want? '))
    all_links = spider_new.pages(url, pages)
    for link in all_links:
        print 'Crawling... ' + link
        html = spider_new.source(link)
        everyclass = spider_new.get_class(html)
        for each in everyclass:
            info = spider_new.getinfo(each)
            classinfo.append(info)
    spider_new.saveinfo(classinfo)
```
```python
from bs4 import BeautifulSoup

f = open('test.html', 'r')
content = f.read()
soup = BeautifulSoup(content, 'html.parser')
print soup
print soup.prettify()  # pretty-printed output

# select(): a leading dot selects by class, a leading hash selects by id
soup.select('.sister')            # list of tags whose class is "sister"
soup.select('#link1')             # list of tags whose id is "link1"
soup.select('#link1')[0]['href']  # extract the link with ['href']
soup.select('#link1')[0].text     # extract the text with .text

soup('p')
soup.find_all('p')  # the two calls above are equivalent
for text in [item.text for item in soup('p')]:
    print text

soup('head')
soup('body')
soup('title')
soup.title.text
soup.title.name  # the tag's name
for tag in soup.find_all(True):
    print tag.name

# returns only the first occurrence
soup.p
soup.find_all('p')  # returns every occurrence
soup.p['class']
soup.find_all('p', {"class": "story"})  # every <p> whose class is "story"
soup.find_all('p', {'class': 'story'})[0].find_all('a')
soup.a
soup.find_all('a', {'id': 'link3'})
soup.find(id='link3')
soup.find_all(['a', 'b'])  # every <a> and <b> tag
soup.get_text().split('\n')
```
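
The block above reads a local test.html that is not included in these notes. A minimal self-contained sketch with an assumed HTML snippet (borrowed from the classic "three sisters" example in the BeautifulSoup documentation), so the same calls can be tried directly:

```python
# The HTML string below is an assumption standing in for test.html
from bs4 import BeautifulSoup

html = '''
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="story">Once upon a time there were three little sisters:
  <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
  <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
  <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>.
</p>
</body></html>
'''
soup = BeautifulSoup(html, 'html.parser')
print soup.select('.sister')                       # all three <a> tags
print soup.select('#link1')[0]['href']             # http://example.com/elsie
print soup.select('#link1')[0].text                # Elsie
print soup.find_all('a', {'id': 'link3'})[0].text  # Tillie
print soup.title.text                              # The Dormouse's story
```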
```python
# Scrape a WeChat official-account article
import requests
from bs4 import BeautifulSoup

url = 'http://mp.weixin.qq.com/s?__biz=MzIxNTQ4NzAwNA==&mid=2247484008&idx=1&sn=dfa8a4a371dfbf4c2aa33574b2e99a25&scene=1&srcid=0824cvxNVK6X3pDWyK08Byhj#rd'
content = requests.get(url)
soup = BeautifulSoup(content.text, 'html.parser')

print soup.find('h2', {'class': 'rich_media_title'}).text.strip()
print soup.find('div', {'class': 'rich_media_meta_list'}).text.strip().replace('\n', ' ')
print soup.find('em').text
print soup.find('div', {'class': 'rich_media_content'}).text

# basic information
field = soup.find('div', {'class': 'rich_media_meta_list'})
title = soup.find('h2', {'class': 'rich_media_title'}).get_text().strip()
date = field.find(id='post-date').get_text()
content = soup.find('div', {'class': 'rich_media_content'}).get_text()
print title
print date
print content
```
```python
from lxml import etree

html = open('demo_1.html', 'r').read()
selector = etree.HTML(html)

content = selector.xpath('//ul[@id="good"]/li/text()')
for i in content:
    print i

content_1 = selector.xpath('//ul/li/text()')
for i in content_1:
    print i

link = selector.xpath('//a/@href')
for i in link:
    print i
```
```python
from lxml import etree

html = open('demo_2.html').read()
print html
selector = etree.HTML(html)

content_1 = selector.xpath('//body/div[1]/text()')
for content in content_1:
    print content

attr_2 = selector.xpath('//body/div[2]/@id')
for attr in attr_2:
    print attr

content = selector.xpath('//div[starts-with(@id, "test")]/text()')
for item in content:
    print item

attrs = selector.xpath('//div[starts-with(@id, "test")]/@id')
for attr in attrs:
    print attr

html = open('demo_3.html', 'r').read()
print html
selector = etree.HTML(html)

content = selector.xpath('//div[starts-with(@id, "test")]/text()')
for i in content:
    print i

selector = etree.HTML(html)
content = selector.xpath('//span[starts-with(@id, "test")]/text()')
for i in content:
    print i

field = selector.xpath('//div[@id="test"]')[0]
content = field.xpath('string(.)')
print content.replace('\n', '')
```
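
Since demo_2.html and demo_3.html are not included in these notes, here is a self-contained sketch with an assumed snippet that shows the two XPath ideas used above, starts-with() and string(.):

```python
# The HTML string below is an assumption standing in for the demo files
from lxml import etree

html = '''
<body>
  <div id="test-1">first div</div>
  <div id="test-2">second div</div>
  <div id="test">nested <span>text</span> inside</div>
</body>
'''
selector = etree.HTML(html)

# starts-with() picks every div whose id begins with "test"
print selector.xpath('//div[starts-with(@id, "test")]/text()')

# string(.) flattens a node and all of its children into one string
field = selector.xpath('//div[@id="test"]')[0]
print field.xpath('string(.)')   # 'nested text inside'
```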
```python
# coding:utf-8
# Scrape the first 50 pages of the "Python" bar on Baidu Tieba
import requests
from lxml import etree

url = 'http://tieba.baidu.com/f?kw=python&ie=utf-8&pn='
lst = []
url_1 = 'http://tieba.baidu.com'
page = int(raw_input(u'Enter a multiple of 50: '))
for i in range(0, page, 50):
    lst.append(url + str(i))

with open('level_1.txt', 'a') as f:
    for item in lst:
        link, time = ' ', ' '
        print u'Crawling... ' + str(item)
        html_1 = requests.get(item)
        selector_1 = etree.HTML(html_1.text)
        field = selector_1.xpath('//div[@class="t_con cleafix"]')
        for each in field:
            title = each.xpath('div[@class="col2_right j_threadlist_li_right "]/div')[0].xpath('string(.)').strip().split('\n')[0].strip()
            reply_num = each.xpath('div[@class="col2_left j_threadlist_li_left"]/span/text()')[0]
            author = each.xpath('div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_author pull_right"]/span/@title')[0].split(':')[1]
            try:
                time = each.xpath('div[@class="col2_right j_threadlist_li_right "]/div')[0].xpath('string(.)').strip().split('\n')[3].strip('\n')
            except Exception, e:
                print e
                time = ' '
            if len(each.xpath('div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_title pull_left j_th_tit "]/a/@href')) != 0:
                link = url_1 + each.xpath('div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_title pull_left j_th_tit "]/a/@href')[0]
            elif len(each.xpath('div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_title pull_left j_th_tit member_thread_title_frs "]/a/@href')) != 0:
                link = url_1 + each.xpath('div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_title pull_left j_th_tit member_thread_title_frs "]/a/@href')[0]
            f.write(title.encode('utf-8') + '\t' + str(reply_num) + '\t' + str(link) + '\t' + author.encode('utf-8') + '\t' + time.encode('utf-8'))
            f.write('\n')
            # progress indicator
            print time, title
```