需求
公司app有个模块,需要做手机号段检索,便于导入指定市的电话
参考第三方网站:http://m.jihaoba.com/tools/haoduan/
利用 Scrapy 框架爬取城市和号段,存入数据库,数据表结构如下:
-- Phone-segment storage: one row per (city, 3-digit prefix) pair.
-- Populated by the Scrapy pipeline below.
create table `fcxlt_fans_data`(
`id` bigint NOT NULL AUTO_INCREMENT,
-- References fcxlt_areas.id (level-2 / city rows) — no formal FK declared
`city_id` int(11) NOT NULL,
-- 3-digit number prefix, e.g. '134'
`segment_num_3` char(3) not null ,
-- comma-separated list of 7-digit segments scraped for this prefix
`segment_num_7` mediumtext not null ,
primary key (`id`),
key `city_id` (`city_id`) using btree
) engine =innodb auto_increment=1 default charset=utf8mb4
phones.py
# -*- coding: utf-8 -*-
import scrapy
from phones.items import PhonesItem
import re
class PhonesSpider(scrapy.Spider):
    """Crawl m.jihaoba.com for per-city mobile number segments.

    Flow: parse() yields one request per city -> gettitle() yields one
    request per 3-digit prefix on the city page -> gettels() collects
    the full 7-digit segment list and yields the finished item.
    """
    name = 'phones'
    allowed_domains = ['m.jihaoba.com']
    start_urls = ['http://m.jihaoba.com/tools/haoduan/']

    def parse(self, response):
        # BUGFIX: the original left this selector commented out, which made
        # `city_list` an undefined name (NameError on the very first call).
        city_list = response.xpath("//ul[@class='city_lst']/li")
        for c in city_list:
            phones_item = PhonesItem()
            s = c.xpath(".//a/text()").extract()[0]
            phones_item['cname'] = s.strip()
            phref = c.xpath(".//a//@href").extract()
            curl = "http://m.jihaoba.com" + phref[0]
            # Crawl the city detail page; keep the item in meta so it
            # accumulates fields across callbacks.
            yield scrapy.Request(url=curl,
                                 meta={'phones_item': phones_item, 'curl': curl},
                                 callback=self.gettitle, dont_filter=True)

    def gettitle(self, response):
        """Extract each 3-digit segment link on one city page."""
        phones_item = response.meta['phones_item']
        curl = response.meta['curl']
        hd = response.xpath("//ul[@class='city_lst']/a")
        for i in hd:
            te = i.xpath(".//font/text()").extract()
            u = "http://m.jihaoba.com" + i.xpath(".//@href").extract()[0]
            # Pull the 3-digit prefix out of the anchor text (raw string
            # avoids the invalid-escape-sequence warning of '\d+').
            dh_3 = re.findall(r'\d+', te[0])
            yield scrapy.Request(url=u,
                                 meta={'phones_item': phones_item, 'url': u,
                                       'dh_3': dh_3[0], 'curl': curl},
                                 callback=self.gettels, dont_filter=True)

    def gettels(self, response):
        """Collect all 7-digit segments for one 3-digit prefix and yield the item."""
        phones_item = response.meta['phones_item']
        phones_item['dh_url'] = response.meta['url']
        phones_item['city_url'] = response.meta['curl']
        phones_item['dh_3'] = response.meta['dh_3']
        p = response.xpath(".//li[@class='city-hd01']/a/text()").extract()
        # str.join replaces the original quadratic "," concatenation and
        # yields the identical comma-separated result ('' for empty p).
        phones_item['dh_7'] = ",".join(p)
        yield phones_item
存入数据库的pipelines
import json
import pymysql
import time
class PhonesPipeline(object):
    """Persist scraped phone-segment items into the MySQL table fcxlt_fans_data."""

    def __init__(self):
        # Keyword arguments: the positional connect(host, user, passwd, db)
        # form was removed in pymysql 1.0.
        self.connect = pymysql.connect(host='localhost', user='root',
                                       password='root', database='appjjr',
                                       use_unicode=True, charset='utf8')
        self.cursor = self.connect.cursor()
        print("connecting mysql success!")

    def process_item(self, item, spider):
        item = dict(item)
        if item['dh_7']:
            # Resolve the city id by fuzzy name match at city level (level=2).
            # (The original also built a dead query against a misspelled
            # table `cofcxlt_areas` that was immediately overwritten.)
            sql = """select id from fcxlt_areas where name like %s and level =%s"""
            self.cursor.execute(sql, (item['cname'] + "%", 2))
            res = self.cursor.fetchone()
            # fetchone() returns None when no row matches; the original
            # `res[0] > 0` crashed with TypeError in that case.
            if res:
                # Parameterized insert instead of %-formatting (SQL-injection
                # safe). Column is `city_id` per the table schema; the
                # original wrote to a non-existent `city_name` column.
                self.cursor.execute(
                    "insert into fcxlt_fans_data(city_id, segment_num_3, segment_num_7)"
                    " values (%s, %s, %s)",
                    (res[0], item['dh_3'], item['dh_7']))
                self.connect.commit()
        # Throttle DB writes, as in the original.
        time.sleep(0.5)
        return item

    def close_spider(self, spider):
        """Release the cursor and connection when the spider finishes."""
        self.cursor.close()
        self.connect.close()
存入json文档
def __init__(self):
    """Open the JSON output file the pipeline appends items to."""
    # NOTE(review): a Scrapy pipeline would normally open this in
    # open_spider() rather than __init__ — behavior kept as-is here.
    out = open('fenghua.json', 'w', encoding='utf-8')
    self.file = out
def process_item(self, item, spider):
    """Serialize the item as one JSON object per line (trailing comma) and pass it on."""
    payload = json.dumps(dict(item), ensure_ascii=False)
    self.file.write(payload + ',\n')
    return item
def __del__(self):
    """Close the JSON output file when the pipeline object is destroyed."""
    # Guard: if __init__ raised before assigning self.file (e.g. the file
    # could not be opened), the original unconditional self.file.close()
    # raised a secondary AttributeError during garbage collection.
    f = getattr(self, 'file', None)
    if f is not None:
        f.close()