# -*- coding: utf-8 -*-
"""Spider for the m.weibo.cn mobile JSON API.

Crawl flow:
    parse          -> pages of the hot-feed container (id 102803)
    parse_detail   -> author uid from each feed card -> profile info
    parse_list     -> user's status container id -> paged status lists
    parse_info     -> one WeiboItem per status card
"""
import scrapy
import json
import re

from scrapy_project.items import WeiboItem


class WeiboSpider(scrapy.Spider):
    name = 'weibo'
    allowed_domains = ['m.weibo.cn']
    start_urls = ['https://m.weibo.cn/']

    def parse(self, response):
        """Fan out over pages 0-30 of the hot-feed container (102803)."""
        base_url = 'https://m.weibo.cn/api/container/getIndex?containerid=102803&openApp=0&page=%s'
        for page in range(0, 31):
            yield scrapy.Request(base_url % page, callback=self.parse_detail)

    def parse_detail(self, response):
        """Pull the author uid out of each feed card and request that user's profile.

        The feed mixes status cards with ads/banners that lack the
        ``mblog.user.id`` path, so missing keys are skipped, not fatal.
        """
        res_dict = json.loads(response.text)
        for card in res_dict['data']['cards']:
            # Only dict-path lookups can fail here; a bare except would also
            # swallow KeyboardInterrupt/SystemExit and hide real bugs.
            try:
                uid = card['mblog']['user']['id']
            except (KeyError, TypeError):
                continue
            profile_url = 'https://m.weibo.cn/profile/info?uid={}'.format(uid)
            yield scrapy.Request(profile_url, callback=self.parse_list)

    def parse_list(self, response):
        """Derive the user's status container id from the profile's ``more``
        link and request the first 20 pages of their status list."""
        profile = json.loads(response.text)
        # ``more`` looks like '/p/<containerid>' — TODO confirm against live API.
        more_link = profile['data']['more']
        container_id = ''.join(re.findall(r'p/(.*)', more_link))
        api_prefix = 'https://m.weibo.cn/api/container/getIndex?containerid='
        for page in range(1, 21):
            url = api_prefix + container_id + '&page_type=03&page={}'.format(page)
            # Was a bare print(); route through the spider's logger instead.
            self.logger.debug(url)
            yield scrapy.Request(url, callback=self.parse_info)

    def parse_info(self, response):
        """Yield one WeiboItem per status card in a status-list page.

        Cards without the expected ``mblog`` structure are skipped.
        """
        payload = json.loads(response.text)
        for card in payload['data']['cards']:
            try:
                # Hoist the shared sub-dict instead of repeating the lookup.
                mblog = card['mblog']
                item = WeiboItem()
                item['mingzi'] = mblog['user']['screen_name']    # author name
                item['shijian'] = mblog['created_at']            # post time
                item['neirong'] = mblog['text']                  # post body
                item['zhuanfa'] = mblog['reposts_count']         # reposts
                item['pinglun'] = mblog['comments_count']        # comments
                item['dianzan'] = mblog['attitudes_count']       # likes
            except (KeyError, TypeError):
                continue
            yield item
import scrapy


class ScrapyProjectItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class WeiboItem(scrapy.Item):
    """One scraped Weibo status, keyed to the columns of the ``weibo`` table."""

    mingzi = scrapy.Field()   # author screen name
    shijian = scrapy.Field()  # created_at timestamp string
    neirong = scrapy.Field()  # status text
    zhuanfa = scrapy.Field()  # repost count
    pinglun = scrapy.Field()  # comment count
    dianzan = scrapy.Field()  # like count

    # Column order must match the placeholders in the INSERT statement below.
    _COLUMNS = ('mingzi', 'shijian', 'neirong', 'zhuanfa', 'pinglun', 'dianzan')

    def get_insert_sql(self):
        """Return an ``(sql, params)`` pair for inserting this item."""
        sql = (
            'INSERT INTO weibo(mingzi ,shijian ,neirong ,zhuanfa ,pinglun,dianzan ) '
            'VALUES (%s,%s,%s,%s,%s,%s)'
        )
        params = tuple(self[column] for column in self._COLUMNS)
        return (sql, params)