[关闭]
@tenlee 2015-08-04T08:27:35.000000Z 字数 1929 阅读 1619

Python小爬虫

Python


廖雪峰的python教程,最近刚出的Python3教程,很不错。(戳我学廖雪峰的Python3)但是由于学校12点就断网,所以就需要一个离线的。
代码

  1. #!/usr/bin/env python3
  2. #coding:utf-8
  3. import sys, re, fileinput
  4. from urllib import request
  5. def getAddress():
  6. address = "http://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000"
  7. ftarge = "22.txt"
  8. fsource = "23.txt"
  9. i = 0
  10. with request.urlopen(address) as f:
  11. sdata = f.read()
  12. data = sdata.decode("utf-8")
  13. with open(fsource, 'w') as fs:
  14. fs.write(data)
  15. with open(fsource, 'r') as fs:
  16. for line in fs.readlines():
  17. i = i + 1
  18. if (i < 366 or i > 849):
  19. continue
  20. temp = re.search(r'/wiki.+00', line, re.S)
  21. if temp:
  22. with open(ftarge, 'a') as ft:
  23. ft.write(temp.group(0) + '\n')
  24. getAddress()
  25. fileaddress = input("please input the save path(absolute address) ")
  26. num = 1
  27. addressfile = "22.txt"
  28. for line in fileinput.input(addressfile):
  29. line = "http://www.liaoxuefeng.com" + line.strip('\n') + '/'
  30. with request.urlopen(line) as f:
  31. data = f.read()
  32. temp = data.decode('utf-8');
  33. mainbody = re.search(r'<h4>.+<div class="x-anchor"><a name="comments"></a></div>', temp, re.S)
  34. ptitle = re.search(r'<title>(.+)</title>', temp)
  35. title = ptitle.group(1)
  36. title = title.replace(" ", "")
  37. title = title.replace("/", "")
  38. filename = fileaddress + str(num) + '_' + title +'.html'
  39. num = num + 1;
  40. print (filename + " is writing...")
  41. head = '<meta charset="UTF-8">'
  42. with open(filename, 'w') as f:
  43. f.write(head + '\n' + mainbody.group(0))

我先从一个网页中获取Python教程每篇的相对地址, 正则表达式匹配出来保存在22.txt文件,之后再从该文件提取网址,从而获得所有文章,再用正则表达式匹配出来文章的主体并保存成相应html文件。
需要你自己手动输入你要保存网页的绝对地址。
下面一个是Python爬取12306验证码的代码。

  1. #!/usr/bin/env python3
  2. #coding:utf-8
  3. import time
  4. import requests
  5. def download_file(url,local_filename):
  6. r = requests.get(url, stream=True,verify = False)
  7. with open(local_filename, 'wb') as f:
  8. for chunk in r.iter_content(chunk_size=1024):
  9. if chunk: # filter out keep-alive new chunks
  10. f.write(chunk)
  11. f.flush()
  12. return local_filename
  13. savePath = input("please input the save path(absolute address)")
  14. url = 'https://kyfw.12306.cn/otn/passcodeNew/getPassCodeNew?module=login&rand=sjrand&0.7923694306518883'
  15. for i in range(0x7fffffff):
  16. print (download_file(url, savePath + str(i+1) +'.jpg'))
  17. time.sleep(1) #休眠一秒
添加新批注
在作者公开此批注前,只有你和作者可见。
回复批注