1、channel_extracing.py
from bs4 import BeautifulSoup
import requests

start_url = 'http://bj.ganji.com/wu/'
url_host = 'http://bj.ganji.com'

def get_index_url(url):
    # Fetch the second-hand goods index page and print the URL of each category channel.
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    links = soup.select('.fenlei > dt > a')
    for link in links:
        page_url = url_host + link.get('href')
        print(page_url)

# Run once to collect the channel URLs pasted below:
# get_index_url(start_url)

channel_list = '''
http://bj.ganji.com/jiaju/
http://bj.ganji.com/rirongbaihuo/
http://bj.ganji.com/shouji/
http://bj.ganji.com/bangong/
http://bj.ganji.com/nongyongpin/
http://bj.ganji.com/jiadian/
http://bj.ganji.com/ershoubijibendiannao/
http://bj.ganji.com/ruanjiantushu/
http://bj.ganji.com/yingyouyunfu/
http://bj.ganji.com/diannao/
http://bj.ganji.com/xianzhilipin/
http://bj.ganji.com/fushixiaobaxuemao/
http://bj.ganji.com/meironghuazhuang/
http://bj.ganji.com/shuma/
http://bj.ganji.com/laonianyongpin/
http://bj.ganji.com/xuniwupin/
http://bj.ganji.com/qitawupin/
http://bj.ganji.com/ershoufree/
http://bj.ganji.com/wupinjiaohuan/
'''
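main.py later consumes channel_list as a whitespace-separated string, so each URL only needs to sit on its own line. A quick sketch of how the string is split (the variable name channels is just for illustration):

channels = channel_list.split()   # one URL per list element
print(len(channels))              # 19 channels for the Beijing site
print(channels[0])                # http://bj.ganji.com/jiaju/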
2、page_parsing.py
from bs4 import BeautifulSoup
import requests
import time
import pymongo
import random

client = pymongo.MongoClient('localhost', 27017)
ganji = client['ganji']
url_list = ganji['url_list']      # listing URLs harvested by spider 1
item_info = ganji['item_info']    # item details parsed by spider 2

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36',
    'Connection': 'keep-alive'
}

# http://cn-proxy.com/ lists some proxy IPs with decent response times
proxy_list = [
    'http://117.177.250.151:8081',
    'http://111.85.219.250:3129',
    'http://122.70.183.138:8118',
]
proxy_ip = random.choice(proxy_list)  # pick a proxy IP at random
proxies = {'http': proxy_ip}          # note: pass proxies=proxies to requests.get to actually use it

# spider 1
def get_links_from(channel, pages, who_sells='o'):
    # e.g. http://bj.ganji.com/ershoubijibendiannao/o3/
    # 'o' for personal sellers, 'a' for merchants
    list_view = '{}{}{}/'.format(channel, str(who_sells), str(pages))
    wb_data = requests.get(list_view)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    if soup.find('ul', 'pageLink'):
        for link in soup.select('.img a'):
            item_link = link.get('href')
            url_list.insert_one({'url': item_link})
            # print(item_link)
    else:
        # No pagination bar means we are past the last page.
        pass

# spider 2
def get_item_info_from(url, data=None):
    wb_data = requests.get(url, headers=headers)
    if wb_data.status_code == 404:
        pass
    else:
        try:
            soup = BeautifulSoup(wb_data.text, 'lxml')
            data = {
                'title': soup.title.text.strip(),
                'price': soup.select('.f22.fc-orange.f-type')[0].text.strip(),
                'pub_date': soup.select('.pr-5')[0].text.strip().split(' ')[0],
                # lambda is an anonymous function; here it pulls the text out of each area link
                'area': list(map(lambda x: x.text, soup.select('ul.det-infor > li:nth-of-type(2) > a'))),
                'sales': soup.select('ul.det-infor > li:nth-of-type(4)')[0].text.strip().split(' ')[0],
                # 'cates': list(soup.select('ul.det-infor > li:nth-of-type(1) > span')[0].stripped_strings),
                'phone': soup.select('.phoneNum-style')[0].text.strip(),
                'url': url
            }
            item_info.insert_one(data)
        except AttributeError:
            pass
        except IndexError:
            pass

# get_item_info_from('http://bj.ganji.com/ershoubijibendiannao/33773929741240x.htm')
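A quick manual check of the two spiders, based on the sample calls left in the source (the laptop-channel URL comes from the commented-out example above; a MongoDB instance on localhost:27017 is assumed):

# Harvest listing links from page 3 of the second-hand laptop channel (personal sellers).
get_links_from('http://bj.ganji.com/ershoubijibendiannao/', 3)

# Parse one item detail page into the item_info collection.
get_item_info_from('http://bj.ganji.com/ershoubijibendiannao/33773929741240x.htm')

# Inspect what landed in MongoDB.
print(url_list.find_one())
print(item_info.find_one())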
3、main.py
from multiprocessing import Pool
from page_parsing import get_item_info_from, url_list, item_info, get_links_from
from channel_extracing import channel_list

# Resume-from-breakpoint bookkeeping: listing URLs already harvested (db_urls)
# minus those whose detail pages have already been parsed (index_urls).
db_urls = [item['url'] for item in url_list.find()]
index_urls = [item['url'] for item in item_info.find()]
x = set(db_urls)
y = set(index_urls)
rest_of_urls = x - y

def get_all_links_from(channel):
    # Walk list pages 1-99 of one channel and store every item link.
    for i in range(1, 100):
        get_links_from(channel, i)

if __name__ == '__main__':
    pool = Pool()
    pool.map(get_all_links_from, channel_list.split())
    pool.close()
    pool.join()
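As written, main.py only runs spider 1; rest_of_urls is computed but never consumed. A minimal sketch of how the second stage could be driven with the same pool pattern, reusing get_item_info_from and the rest_of_urls set defined above (this stage is not in the original main block, so it is an assumption about how the two spiders were meant to be chained):

if __name__ == '__main__':
    pool = Pool()
    # Second stage: fetch detail pages only for URLs not yet present in item_info.
    pool.map(get_item_info_from, rest_of_urls)
    pool.close()
    pool.join()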