@twein89
2017-12-26T09:22:14.000000Z
字数 3973
阅读 1130
爬虫
从浏览器或者 Charles 抓包分析出来的请求(POST 或 GET),
可以先复制成 curl 命令,
然后到 https://curl.trillworks.com/#python
转换成 Python 的 requests 代码。
例如以下POST的curl:
curl \
  -H 'Charset: UTF-8' \
  -H 'User-Agent: Dalvik/2.1.0 (Linux; U; Android 7.1.1; MI 6 MIUI/V9.0.6.0.NCACNEI);(cmblife 6.0.5/70)' \
  -H 'Accept: */*' \
  -H 'Accept-Language: zh-CN' \
  -H 'cookie: route=e6f637f91bc1ad8ba5fe5d1bf7b876fa' \
  -H 'Host: piao.o2o.cmbchina.com' \
  --data "body={districtId='', labelId='02', cityNo='571', longitude='120.214761', parmName='DEFAULT', signOfOrder='0', regionId='', dimension='30.254312', merTypeId2=''}&syshead={trans_code='SI_PRD0020', chnlUserId='beec5eadf6a240599a2f483c0207cdae', sessionId='f9c745fbabc84feabd339be17f37cae3', chnlId='01', pageIndex=1, pageSize=10}&p0=a&p1=70&p2=xiaomi&p3=f6be0115b6a946fea7e5eb51db31361f2&p4=939a35784f814ea6b94b26635c238c74&p5=beec5eadf6a240599a2f483c0207cdae&p6=540886488&p7=f93b7b5fa20145a6a09c8b1be8f60997&p8=8dc2b5be8bca440280ca66e857811c2e&p9=f9c745fbabc84feabd339be17f37cae3&p10=0f8a1851fac94c6491d28115324970ad&groupFlag=0" \
  --compressed \
  'https://piao.o2o.cmbchina.com/yummy-portal/JSONServer/execute.do'
转换后:
import requests
# Request headers, one entry per -H option of the curl command.
headers = {
    "Charset": "UTF-8",
    "User-Agent": "Dalvik/2.1.0 (Linux; U; Android 7.1.1; MI 6 MIUI/V9.0.6.0.NCACNEI);(cmblife 6.0.5/70)",
    "Accept": "*/*",
    "Accept-Language": "zh-CN",
    "cookie": "route=e6f637f91bc1ad8ba5fe5d1bf7b876fa",
    "Host": "piao.o2o.cmbchina.com",
}

# Form fields from the curl --data payload, kept as ordered (name, value)
# pairs. The long values are split over several adjacent literals, which
# Python joins back into the exact original strings.
data = [
    (
        "body",
        "{districtId='', labelId='02', cityNo='571', "
        "longitude='120.214761', parmName='DEFAULT', "
        "signOfOrder='0', regionId='', "
        "dimension='30.254312', merTypeId2=''}",
    ),
    (
        "syshead",
        "{trans_code='SI_PRD0020', "
        "chnlUserId='beec5eadf6a240599a2f483c0207cdae', "
        "sessionId='f9c745fbabc84feabd339be17f37cae3', "
        "chnlId='01', pageIndex=1, pageSize=10}",
    ),
    ("p0", "a"),
    ("p1", "70"),
    ("p2", "xiaomi"),
    ("p3", "f6be0115b6a946fea7e5eb51db31361f2"),
    ("p4", "939a35784f814ea6b94b26635c238c74"),
    ("p5", "beec5eadf6a240599a2f483c0207cdae"),
    ("p6", "540886488"),
    ("p7", "f93b7b5fa20145a6a09c8b1be8f60997"),
    ("p8", "8dc2b5be8bca440280ca66e857811c2e"),
    ("p9", "f9c745fbabc84feabd339be17f37cae3"),
    ("p10", "0f8a1851fac94c6491d28115324970ad"),
    ("groupFlag", "0"),
]

# Send the same POST request that the curl command describes.
response = requests.post(
    "https://piao.o2o.cmbchina.com/yummy-portal/JSONServer/execute.do",
    headers=headers,
    data=data,
)
转换后的 headers 和 data 可以直接拿来用,例如放进 pyspider 的 self.crawl 里发送同样的 POST 请求:
from pyspider.libs.base_handler import *
class Handler(BaseHandler):
    """pyspider handler: POST for a coupon list, then emit one result per row."""

    def on_start(self):
        """Queue the POST request; its response is handled by ``json_parser``."""
        endpoint = 'https://piao.o2o.cmbchina.com/yummy-portal/JSONServer/execute.do'
        request_headers = {
            "Charset": "UTF-8",
            "User-Agent": "Dalvik/2.1.0 (Linux; U; Android 7.1.1; MI 6 MIUI/V9.0.6.0.NCACNEI);(cmblife 6.0.5/70)",
            "Accept": "*/*",
            "Accept-Language": "zh-CN",
            "cookie": "route=e6f637f91bc1ad8ba5fe5d1bf7b876fa",
            "Host": "piao.o2o.cmbchina.com",
        }
        # Ordered (name, value) form fields. The long values are split over
        # adjacent literals that Python joins into the exact original strings.
        form_fields = [
            (
                "body",
                "{districtId='', labelId='02', cityNo='571', "
                "longitude='120.214761', parmName='DEFAULT', "
                "signOfOrder='0', regionId='', "
                "dimension='30.254312', merTypeId2=''}",
            ),
            (
                "syshead",
                "{trans_code='SI_PRD0020', "
                "chnlUserId='beec5eadf6a240599a2f483c0207cdae', "
                "sessionId='f9c745fbabc84feabd339be17f37cae3', "
                "chnlId='01', pageIndex=1, pageSize=10}",
            ),
            ("p0", "a"),
            ("p1", "70"),
            ("p2", "xiaomi"),
            ("p3", "f6be0115b6a946fea7e5eb51db31361f2"),
            ("p4", "939a35784f814ea6b94b26635c238c74"),
            ("p5", "beec5eadf6a240599a2f483c0207cdae"),
            ("p6", "540886488"),
            ("p7", "f93b7b5fa20145a6a09c8b1be8f60997"),
            ("p8", "8dc2b5be8bca440280ca66e857811c2e"),
            ("p9", "f9c745fbabc84feabd339be17f37cae3"),
            ("p10", "0f8a1851fac94c6491d28115324970ad"),
            ("groupFlag", "0"),
        ]
        # Example of sending a POST request.
        self.crawl(
            endpoint,
            method='POST',
            headers=request_headers,
            data=form_fields,
            callback=self.json_parser,
        )

    def json_parser(self, response):
        """Schedule one follow-up task per entry of ``response.json['body']['rows']``."""
        for coupon in response.json['body']['rows']:
            # Framework trick: build a link so that a single response can
            # yield several items. Identical URLs are de-duplicated, so each
            # item's id (productNo here) is added to keep the links distinct.
            self.crawl(
                'data:,coupon_item#{}'.format(coupon['productNo']),
                callback=self.multi_item_parser,
                save={'item': coupon},
            )

    def multi_item_parser(self, response):
        """Return the row carried in ``response.save`` as this task's result."""
        return {'coupon_item': response.save['item']}

    def on_result(self, result):
        """Pretty-print the coupon item of a non-empty result; ignore the rest."""
        # NOTE(review): pprint is not imported by name in this snippet;
        # presumably it comes in through the star import -- confirm.
        if result and result.get('coupon_item', None):
            pprint(result['coupon_item'])