[关闭]
@tenlee 2015-08-04T08:27:35.000000Z 字数 1929 阅读 1619

Python小爬虫

Python


廖雪峰的python教程,最近刚出的Python3教程,很不错。(戳我学廖雪峰的Python3)但是由于学校12点就断网,所以就需要一个离线的。
代码

  1. #!/usr/bin/env python3
  2. #coding:utf-8
  3. import sys, re, fileinput
  4. from urllib import request
  5. def getAddress():
  6. address = "http://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000"
  7. ftarge = "22.txt"
  8. fsource = "23.txt"
  9. i = 0
  10. with request.urlopen(address) as f:
  11. sdata = f.read()
  12. data = sdata.decode("utf-8")
  13. with open(fsource, 'w') as fs:
  14. fs.write(data)
  15. with open(fsource, 'r') as fs:
  16. for line in fs.readlines():
  17. i = i + 1
  18. if (i < 366 or i > 849):
  19. continue
  20. temp = re.search(r'/wiki.+00', line, re.S)
  21. if temp:
  22. with open(ftarge, 'a') as ft:
  23. ft.write(temp.group(0) + '\n')
  24. getAddress()
  25. fileaddress = input("please input the save path(absolute address) ")
  26. num = 1
  27. addressfile = "22.txt"
  28. for line in fileinput.input(addressfile):
  29. line = "http://www.liaoxuefeng.com" + line.strip('\n') + '/'
  30. with request.urlopen(line) as f:
  31. data = f.read()
  32. temp = data.decode('utf-8');
  33. mainbody = re.search(r'<h4>.+<div class="x-anchor"><a name="comments"></a></div>', temp, re.S)
  34. ptitle = re.search(r'<title>(.+)</title>', temp)
  35. title = ptitle.group(1)
  36. title = title.replace(" ", "")
  37. title = title.replace("/", "")
  38. filename = fileaddress + str(num) + '_' + title +'.html'
  39. num = num + 1;
  40. print (filename + " is writing...")
  41. head = '<meta charset="UTF-8">'
  42. with open(filename, 'w') as f:
  43. f.write(head + '\n' + mainbody.group(0))

我先从一个网页中获取Python教程每篇的相对地址, 正则表达式匹配出来保存在22.txt文件,之后再从该文件提取网址,从而获得所有文章,再用正则表达式匹配出来文章的主体并保存成相应html文件。
需要你自己手动输入你要保存网页的绝对地址。
下面一个是Python爬取12306验证码的代码。

  1. #!/usr/bin/env python3
  2. #coding:utf-8
  3. import time
  4. import requests
  5. def download_file(url,local_filename):
  6. r = requests.get(url, stream=True,verify = False)
  7. with open(local_filename, 'wb') as f:
  8. for chunk in r.iter_content(chunk_size=1024):
  9. if chunk: # filter out keep-alive new chunks
  10. f.write(chunk)
  11. f.flush()
  12. return local_filename
  13. savePath = input("please input the save path(absolute address)")
  14. url = 'https://kyfw.12306.cn/otn/passcodeNew/getPassCodeNew?module=login&rand=sjrand&0.7923694306518883'
  15. for i in range(0x7fffffff):
  16. print (download_file(url, savePath + str(i+1) +'.jpg'))
  17. time.sleep(1) #休眠一秒
添加新批注
在作者公开此批注前,只有你和作者可见。
回复批注