版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/ZHH_Love123/article/details/84863208
一、post请求处理 (pyspider默认url去重)
def index_page(self, response):
    """Parse the news-list JSON response and schedule one POST crawl per article.

    For each row in ``response.json['rows']`` a POST request is queued to the
    detail endpoint with the article id as form data; list-level metadata is
    forwarded to ``detail_page`` via the ``save`` parameter.

    NOTE(review): ``headers`` is a module-level name defined elsewhere in the
    original script — it must exist at runtime.
    """
    detail_url = 'http://news.cqcoal.com/manage/newsaction.do?method:getNewsAddonarticle'
    for aid in response.json['rows']:
        print(aid['id'])
        save_data = {
            'source_channel': response.save['source_channel'],
            'source_name': response.save['source_name'],
            'source_typeide': response.save['typeid'],
            'pub_time': aid['pubdate'],
            'title': aid['title'],
            'editor': aid['fshuser'],
            'source': aid['source'],
            'content': aid['description'],
        }
        # pyspider de-duplicates tasks by URL, so the same endpoint would only
        # be fetched once. Appending a unique fragment (and using it as itag)
        # forces a fresh crawl for every article id.
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        self.crawl(detail_url + "#" + timestamp,
                   itag=timestamp,
                   data={'id': aid['id']},
                   headers=headers,
                   callback=self.detail_page,
                   save=save_data,
                   method='POST')
二、index_page 接收的response解析到的数据直接保存
class Handler(BaseHandler):
    """Crawl a cnblogs post and walk its prev/next links, saving url + title.

    The duplicated ``class Handler(BaseHandler):`` line from the original
    paste (a syntax error) has been removed.
    """

    crawl_config = {
        'itag': 'v11'
    }

    @every(minutes=24 * 60)
    def on_start(self):
        # Entry point, re-run once per day by the @every schedule.
        self.crawl('http://www.cnblogs.com/adc8868/p/7814893.html',
                   callback=self.index_page)

    @config(age=0)
    def index_page(self, response):
        # Hand the current page's URL over to detail_page for scraping.
        self.crawl(response.url, callback=self.detail_page)

    @config(fetch_type="js")
    def detail_page(self, response):
        # Follow the "previous / next post" links back into index_page,
        # then save this page's url and title.
        for prev in response.doc("#post_next_prev a").items():
            self.crawl(prev.attr.href, callback=self.index_page)
        return {
            "url": response.url,
            "title": response.doc('title').text(),
        }
三、传变量和参数
index_page 中解析到的数据如何传到 detail_page？把数据组织成字典，作为 self.crawl 的 save 参数传入；在 detail_page 的回调里即可通过 response.save 取回这些数据。