[关闭]
@twein89 2017-12-26T17:22:14.000000Z 字数 3973 阅读 1068

pyspider example2

爬虫


pyspider请求POST方法及返回多个items的例子


简便的转换

对于从浏览器或 Charles 抓包分析得到的请求(无论是 POST 还是 GET),
可以先把它复制为 curl 命令,
然后到 https://curl.trillworks.com/#python
把该命令转换成 Python 的 requests 代码。
例如下面这条 POST 请求的 curl 命令:

  1. curl -H 'Charset: UTF-8' -H 'User-Agent: Dalvik/2.1.0 (Linux; U; Android 7.1.1; MI 6 MIUI/V9.0.6.0.NCACNEI);(cmblife 6.0.5/70)' -H 'Accept: */*' -H 'Accept-Language: zh-CN' -H 'cookie: route=e6f637f91bc1ad8ba5fe5d1bf7b876fa' -H 'Host: piao.o2o.cmbchina.com' --data "body={districtId='', labelId='02', cityNo='571', longitude='120.214761', parmName='DEFAULT', signOfOrder='0', regionId='', dimension='30.254312', merTypeId2=''}&syshead={trans_code='SI_PRD0020', chnlUserId='beec5eadf6a240599a2f483c0207cdae', sessionId='f9c745fbabc84feabd339be17f37cae3', chnlId='01', pageIndex=1, pageSize=10}&p0=a&p1=70&p2=xiaomi&p3=f6be0115b6a946fea7e5eb51db31361f2&p4=939a35784f814ea6b94b26635c238c74&p5=beec5eadf6a240599a2f483c0207cdae&p6=540886488&p7=f93b7b5fa20145a6a09c8b1be8f60997&p8=8dc2b5be8bca440280ca66e857811c2e&p9=f9c745fbabc84feabd339be17f37cae3&p10=0f8a1851fac94c6491d28115324970ad&groupFlag=0" --compressed 'https://piao.o2o.cmbchina.com/yummy-portal/JSONServer/execute.do'

转换后:

  import requests

  # Request headers, matching the -H options of the captured curl command.
  headers = {
      'Charset': 'UTF-8',
      'User-Agent': 'Dalvik/2.1.0 (Linux; U; Android 7.1.1; MI 6 MIUI/V9.0.6.0.NCACNEI);(cmblife 6.0.5/70)',
      'Accept': '*/*',
      'Accept-Language': 'zh-CN',
      'cookie': 'route=e6f637f91bc1ad8ba5fe5d1bf7b876fa',
      'Host': 'piao.o2o.cmbchina.com',
  }

  # Form fields of the curl --data payload, kept as ordered (name, value) pairs.
  # 'body' and 'syshead' are sent as opaque strings that contain single quotes,
  # so they are written with double-quoted literals instead of escapes.
  data = [
      ('body', "{districtId='', labelId='02', cityNo='571', longitude='120.214761', parmName='DEFAULT', signOfOrder='0', regionId='', dimension='30.254312', merTypeId2=''}"),
      ('syshead', "{trans_code='SI_PRD0020', chnlUserId='beec5eadf6a240599a2f483c0207cdae', sessionId='f9c745fbabc84feabd339be17f37cae3', chnlId='01', pageIndex=1, pageSize=10}"),
      ('p0', 'a'),
      ('p1', '70'),
      ('p2', 'xiaomi'),
      ('p3', 'f6be0115b6a946fea7e5eb51db31361f2'),
      ('p4', '939a35784f814ea6b94b26635c238c74'),
      ('p5', 'beec5eadf6a240599a2f483c0207cdae'),
      ('p6', '540886488'),
      ('p7', 'f93b7b5fa20145a6a09c8b1be8f60997'),
      ('p8', '8dc2b5be8bca440280ca66e857811c2e'),
      ('p9', 'f9c745fbabc84feabd339be17f37cae3'),
      ('p10', '0f8a1851fac94c6491d28115324970ad'),
      ('groupFlag', '0'),
  ]

  # Send the form-encoded POST request with the headers and fields above.
  response = requests.post(
      'https://piao.o2o.cmbchina.com/yummy-portal/JSONServer/execute.do',
      headers=headers,
      data=data
  )

转换得到的 headers 和 data 可以原样复制到爬虫脚本中直接使用。

POST方法及return多个item

  from pyspider.libs.base_handler import *


  class Handler(BaseHandler):
      """Example handler: POST a form request, then emit one result per row."""

      def on_start(self):
          """Schedule the initial POST request; its reply goes to json_parser."""
          endpoint = 'https://piao.o2o.cmbchina.com/yummy-portal/JSONServer/execute.do'
          request_headers = {
              'Charset': 'UTF-8',
              'User-Agent': 'Dalvik/2.1.0 (Linux; U; Android 7.1.1; MI 6 MIUI/V9.0.6.0.NCACNEI);(cmblife 6.0.5/70)',
              'Accept': '*/*',
              'Accept-Language': 'zh-CN',
              'cookie': 'route=e6f637f91bc1ad8ba5fe5d1bf7b876fa',
              'Host': 'piao.o2o.cmbchina.com',
          }
          # Ordered (name, value) form fields; 'body' and 'syshead' are opaque
          # strings containing single quotes, hence the double-quoted literals.
          form_fields = [
              ('body', "{districtId='', labelId='02', cityNo='571', longitude='120.214761', parmName='DEFAULT', signOfOrder='0', regionId='', dimension='30.254312', merTypeId2=''}"),
              ('syshead', "{trans_code='SI_PRD0020', chnlUserId='beec5eadf6a240599a2f483c0207cdae', sessionId='f9c745fbabc84feabd339be17f37cae3', chnlId='01', pageIndex=1, pageSize=10}"),
              ('p0', 'a'),
              ('p1', '70'),
              ('p2', 'xiaomi'),
              ('p3', 'f6be0115b6a946fea7e5eb51db31361f2'),
              ('p4', '939a35784f814ea6b94b26635c238c74'),
              ('p5', 'beec5eadf6a240599a2f483c0207cdae'),
              ('p6', '540886488'),
              ('p7', 'f93b7b5fa20145a6a09c8b1be8f60997'),
              ('p8', '8dc2b5be8bca440280ca66e857811c2e'),
              ('p9', 'f9c745fbabc84feabd339be17f37cae3'),
              ('p10', '0f8a1851fac94c6491d28115324970ad'),
              ('groupFlag', '0'),
          ]
          # Example of sending a POST request.
          self.crawl(
              endpoint,
              callback=self.json_parser,
              method='POST',
              headers=request_headers,
              data=form_fields
          )

      def json_parser(self, response):
          """Schedule one follow-up task per entry of the reply's body.rows."""
          for coupon in response.json['body']['rows']:
              # Framework idiom: build a URL per item so that several items
              # can be returned from a single response.
              # Identical URLs are de-duplicated, so each item's id
              # (productNo here) is added to the URL to tell them apart.
              item_url = 'data:,coupon_item#{}'.format(coupon['productNo'])
              self.crawl(
                  item_url,
                  callback=self.multi_item_parser,
                  save={'item': coupon}
              )

      def multi_item_parser(self, response):
          """Return the item carried in response.save as this task's result."""
          return {'coupon_item': response.save['item']}

      def on_result(self, result):
          """Pretty-print the coupon item of a non-empty result, if it has one."""
          coupon = result.get('coupon_item', None) if result else None
          if coupon:
              pprint(coupon)

添加新批注
在作者公开此批注前,只有你和作者可见。
回复批注