The Python Crawler Learning Path (3): A Basic Crawler Architecture and Crawling Site-wide Quote Data from 证券之星

In part (1) of this series we covered how to crawl all A-share data on the 证券之星 site, which mainly involved fetching pages and parsing them. In part (2) we covered how to obtain and validate proxy IPs, which involved multithreaded programming and data storage. This time, building on those two parts, we will crawl the quote data of the entire 证券之星 site. The approach from part (1) works for a single column, but crawling more than a hundred columns that way would be far too much work, so let's first introduce a basic crawler architecture.

This post is organized around six basic modules of a crawler framework: the crawler scheduler, the URL downloader, the URL manager, the HTML downloader, the HTML parser, and the data writer. Their responsibilities are as follows (a short code sketch of how they cooperate follows the list):

Crawler scheduler: coordinates the work of the other five modules.

URL downloader: collects the URL links of the pages whose data we want to crawl.

URL manager: manages URL links, maintaining the set of URLs already crawled and the set of URLs not yet crawled, and exposes an interface for fetching new URLs.

HTML downloader: takes uncrawled URLs from the URL manager and downloads the corresponding HTML pages.

HTML parser: takes the downloaded HTML pages from the HTML downloader, extracts the useful data, and hands it to the data writer.

Data writer: stores the data extracted by the HTML parser, either in files or in a database.
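To make the division of labor concrete, here is a minimal, sequential sketch of how the scheduler drives the other modules. It only uses the interfaces defined later in this post; the real scheduler (SpiderMan, at the end) runs the same loop through a thread pool.

# Simplified, sequential sketch of how the six modules cooperate.
def run(seed_urls, manager, downloader, parser_cls, writer_cls, engine, table_name):
    manager.add_new_urls(seed_urls)              # the URL manager holds the crawl frontier
    while manager.has_new_url():
        url = manager.get_new_url()              # take one uncrawled URL
        html = downloader.download(url)          # the HTML downloader fetches the page
        parser = parser_cls(html)                # the HTML parser extracts a data frame
        if len(parser.get_header()) > 0:
            data = parser.get_dataframe()
            writer_cls(engine, data, table_name).output()   # the data writer saves it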

To make this easier to follow, here is a diagram of how the basic crawler framework runs:

      

Each module will live in its own .py file inside the project folder (firstSpider, assembled at the end of this post). Below, we go through the six modules in detail.


1. URL Downloader

The URL downloader works in two steps: first it downloads the URLs in the site's left-hand navigation bar, then it uses each navigation URL to collect the list of page links contained in that sub-column.

      

Below is the code that grabs all the links in the left-hand navigation bar and generates the navigation file:

# -*- coding: utf-8 -*-
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup
import re
import os


class get_catalog(object):
    '''Build and operate on the navigation file'''

    def save_catalog(self):
        '''Grab the text and URLs of the left-hand sub-navigation of 证券之星 and save them'''
        # Fetch the page
        url = 'http://quote.stockstar.com'
        request = urllib.request.Request(url=url)
        response = urllib.request.urlopen(request)
        content = response.read().decode('gbk')
        # Cut out the left-hand navigation section
        soup = BeautifulSoup(content, "lxml")
        soup = BeautifulSoup(str(soup.find_all('div', class_="subMenuBox")), "lxml")
        # Initialize data frames for the first- and second-level sub-catalogs
        catalog1 = pd.DataFrame(columns=["cata1", "cata2", "url2"])
        catalog2 = pd.DataFrame(columns=["url2", "cata3", "url3"])
        # Collect the catalog entries and their corresponding links
        index1 = 0; index2 = 0
        for content1 in soup.find_all('div', class_=re.compile("list submenu?")):
            cata1 = re.findall('>(.*?)<', str(content1.h3.a))
            for content2 in content1.find_all('dl'):
                cata2 = re.findall('>(.*?)<', str(content2.dt.a).replace('\r\n', ''))
                url2 = url + content2.dt.a['href']
                catalog1.loc[index1] = {'cata1': cata1[0], 'cata2': cata2[0].split()[0], 'url2': url2}
                index1 += 1
                for content3 in content2.find_all('li'):
                    cata3 = re.findall('·(.*?)<', str(content3.a))
                    url3 = url + content3.a['href']
                    catalog2.loc[index2] = {'url2': url2, 'cata3': cata3[0], 'url3': url3}
                    index2 += 1
        # Join the first- and second-level catalog tables and save the result
        catalog = pd.merge(catalog1, catalog2, on='url2', how='left')
        catalog.to_csv('catalog.csv')

    def load_catalog(self):
        '''Check whether the navigation file exists and load it'''
        if 'catalog.csv' not in os.listdir():
            self.save_catalog()
            print('Navigation file generated')
        else:
            print('Navigation file already exists')
        catalog = pd.read_csv('catalog.csv', encoding='gbk', usecols=range(1, 6))
        print("Navigation file loaded")
        return(catalog)

    def index_info(self, catalog, index):
        '''Build a name for each row (used as the database table name) and get the row's final URL'''
        if str(catalog.loc[index]['cata3']) == 'nan':
            table_name = catalog.loc[index]['cata1'] + '_' + catalog.loc[index]['cata2']
            url = catalog.loc[index]['url2']
        else:
            # Symbols such as + and () cannot be used in a table name and must be replaced or removed
            if '+' in catalog.loc[index]['cata3']:
                cata3 = catalog.loc[index]['cata3'].replace('+', '')
                table_name = catalog.loc[index]['cata1'] + '_' + catalog.loc[index]['cata2'] + '_' + cata3
            elif '(' in catalog.loc[index]['cata3']:
                cata3 = catalog.loc[index]['cata3'].replace('(', '').replace(')', '')
                table_name = catalog.loc[index]['cata1'] + '_' + catalog.loc[index]['cata2'] + '_' + cata3
            else:
                table_name = catalog.loc[index]['cata1'] + '_' + catalog.loc[index]['cata2'] + '_' + catalog.loc[index]['cata3']
            url = catalog.loc[index]['url3']
        return(table_name, url)

Below is the code that collects all page links for each sub-column:

import pandas as pd
from selenium import webdriver
import time
import re
import math
from get_catalog import get_catalog


class get_urls(object):
    '''Get the list of page links for a column'''

    def __init__(self, browser, url):
        self.browser = browser  # browser object
        self.url = url          # URL to crawl

    def get_browser(self):
        '''Open the URL'''
        state = 0
        test = 0
        while state == 0 and test < 5:
            try:
                self.browser.get(self.url)
                state = 1
                print('Connected to %s' % self.url)
            except:
                test += 1

    def get_element(self):
        '''Get the link list of the pagination buttons'''
        self.get_browser()
        element_list = []
        for i in range(1, 8):
            try:
                element = self.browser.find_element_by_xpath('//*[@id="divPageControl1"]/a[%d]' % i).get_attribute('href')
                element_list.append(element)
            except:
                time.sleep(0.2)
        return(element_list)

    def get_urllist(self):
        '''Generate the list of valid page-number links from the pagination buttons'''
        element_list = self.get_element()
        if len(element_list) <= 1:
            urls = [self.url]
        else:
            try:
                max_number = re.search('_(\d*)\.', element_list[len(element_list)-3])
                begin = max_number.start() + 1
                end = max_number.end() - 1
                int_max_number = int(element_list[len(element_list)-3][begin:end])
                urls = []
                for i in range(1, int_max_number + 1):
                    url = element_list[len(element_list)-3][:begin] + str(i) + element_list[len(element_list)-3][end:]
                    urls.append(url)
            except:
                urls = [self.url]
        return(urls)

2. URL Manager

The URL manager mainly holds two variables: the set of URLs that have already been crawled and the set of URLs that have not. Python's set type is used chiefly for its deduplication.

Besides the two URL sets, the URL manager needs to expose the following interface so the other modules can work with it:

Check whether there are URLs left to crawl: has_new_url().

Add new URLs to the uncrawled set: add_new_url(url) and add_new_urls(urls).

Get one uncrawled URL: get_new_url().

Below is the code for the URL manager module:

# coding:utf-8
class UrlManager(object):
    '''URL manager'''

    def __init__(self):
        self.new_urls = set()  # set of URLs not yet crawled
        self.old_urls = set()  # set of URLs already crawled

    def has_new_url(self):
        '''Check whether there are URLs left to crawl'''
        return(self.new_url_size() != 0)

    def get_new_url(self):
        '''Get one uncrawled URL'''
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return(new_url)

    def add_new_url(self, url):
        '''Add a new URL to the uncrawled set'''
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        '''Add a list of new URLs to the uncrawled set'''
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def new_url_size(self):
        '''Get the size of the uncrawled URL set'''
        return(len(self.new_urls))
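A quick usage sketch (the URLs here are illustrative, not taken from the navigation file) showing how the manager deduplicates links and hands each one out exactly once:

manager = UrlManager()
manager.add_new_urls(['http://quote.stockstar.com/stock/ranklist_a_3_1_1.html',
                      'http://quote.stockstar.com/stock/ranklist_a_3_1_1.html',   # duplicate, ignored
                      'http://quote.stockstar.com/stock/ranklist_a_3_1_2.html'])
while manager.has_new_url():
    print(manager.get_new_url())   # each distinct URL comes out once, then moves to old_urls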

3. HTML Downloader

The HTML downloader fetches web pages. Pay attention to each page's character encoding here, so that the downloaded pages are not garbled.
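The 证券之星 quote pages are GBK-encoded, which is why the downloader below calls decode('gbk'). As a hedge, a small helper like the following (my own addition, not part of the original code) could fall back to another encoding instead of crashing the crawl if a page ever differs:

# Minimal sketch, assuming GBK is the normal case and UTF-8 the likely alternative.
def decode_page(raw_bytes):
    for enc in ('gbk', 'utf-8'):
        try:
            return raw_bytes.decode(enc)
        except UnicodeDecodeError:
            continue
    return raw_bytes.decode('gbk', errors='ignore')   # last resort: drop undecodable bytes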

While fetching pages our IP may get blocked, so we first build a pool of proxy IPs for the HTML downloader to fall back on.

Below is the code that builds the proxy IP pool:

import urllib.request
import re
import time
import random
import socket
import threading


class proxy_ip(object):
    '''Fetch working proxy IPs and save them'''

    def __init__(self, url, total_page):
        self.url = url                # the URL we plan to crawl
        self.total_page = total_page  # number of proxy-list pages to walk through

    def get_proxys(self):
        '''Scrape proxy IPs'''
        user_agent = ["Mozilla/5.0 (Windows NT 10.0; WOW64)",
                      'Mozilla/5.0 (Windows NT 6.3; WOW64)',
                      'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
                      'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
                      'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
                      'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
                      'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
                      'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
                      'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
                      'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
                      'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
                      'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
                      'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
                      'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
                      'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
                      'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
                      'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
                      'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
                      'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
                      'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
                      'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
                      'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11']
        ip_totle = []
        for page in range(1, self.total_page+1):
            #url = 'http://www.httpsdaili.com/?page='+str(page)
            #url = 'http://www.kuaidaili.com/free/inha/'+str(page)+'/'
            url = 'http://www.xicidaili.com/nn/'+str(page)  # Xici proxy list
            headers = {"User-Agent": random.choice(user_agent)}
            try:
                request = urllib.request.Request(url=url, headers=headers)
                response = urllib.request.urlopen(request)
                content = response.read().decode('utf-8')
                print('get page', page)
                pattern = re.compile('<td>(\d.*?)</td>')  # grab <td>...</td> cells whose first character is a digit
                ip_page = re.findall(pattern, str(content))
                ip_totle.extend(ip_page)
            except Exception as e:
                print(e)
            time.sleep(random.choice(range(1, 5)))
        # Print what was scraped
        print('Proxy IP', '\t', 'Port', '\t', 'Speed', '\t', 'Check time')
        for i in range(0, len(ip_totle), 4):
            print(ip_totle[i], ' ', '\t', ip_totle[i+1], '\t', ip_totle[i+2], '\t', ip_totle[i+3])
        # Normalize the proxy IP format
        proxys = []
        for i in range(0, len(ip_totle), 4):
            proxy_host = ip_totle[i]+':'+ip_totle[i+1]
            proxy_temp = {"http": proxy_host}
            proxys.append(proxy_temp)
        return(proxys)

    def test(self, lock, proxys, i, f):
        '''Check whether a proxy IP works'''
        socket.setdefaulttimeout(15)  # set the global timeout
        url = self.url
        try:
            proxy_support = urllib.request.ProxyHandler(proxys[i])
            opener = urllib.request.build_opener(proxy_support)
            opener.addheaders = [("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64)")]
            urllib.request.install_opener(opener)
            #res = urllib.request.urlopen(url).read().decode('gbk')
            res = urllib.request.urlopen(url).read().decode('utf-8')
            print(res)
            lock.acquire()  # acquire the lock
            print(proxys[i], 'is OK')
            f.write('%s\n' % str(proxys[i]))  # record the working proxy IP
            lock.release()  # release the lock
        except Exception as e:
            lock.acquire()
            print(proxys[i], e)
            lock.release()

    def get_ip(self):
        '''Validate the proxies with multiple threads'''
        f = open('proxy_ip.txt', 'a+')  # file that stores the working proxy IPs
        lock = threading.Lock()         # create a lock
        # Validate in multiple threads
        proxys = self.get_proxys()
        threads = []
        for i in range(len(proxys)):
            thread = threading.Thread(target=self.test, args=[lock, proxys, i, f])
            threads.append(thread)
            thread.start()
        # Block the main thread until all worker threads finish
        for thread in threads:
            thread.join()
        f.close()  # close the file

Below is the code for the HTML downloader module:

# _*_ coding:utf-8 _*_
from firstSpider.get_proxy_ip import proxy_ip
import urllib.request
import random
import os
import socket
import time
import re


class HtmlDownloader(object):
    '''Fetch the content of a web page'''

    def download(self, url):
        user_agent = ["Mozilla/5.0 (Windows NT 10.0; WOW64)",
                      'Mozilla/5.0 (Windows NT 6.3; WOW64)',
                      'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
                      'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
                      'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
                      'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
                      'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
                      'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
                      'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
                      'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
                      'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
                      'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
                      'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
                      'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
                      'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
                      'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
                      'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
                      'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
                      'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
                      'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
                      'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
                      'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11']
        state = 0; test = 0
        socket.setdefaulttimeout(20)  # set the global timeout
        while state == 0 and test < 5:
            try:
                request = urllib.request.Request(url=url, headers={"User-Agent": random.choice(user_agent)})  # pick a random element from the user_agent list
                response = urllib.request.urlopen(request)
                readhtml = response.read()
                content = readhtml.decode('gbk')  # read the page content
                time.sleep(random.randrange(1, 6))
                if re.search('Auth Result', content) == None:
                    state = 1
            except Exception as e:
                print('Failed to fetch the page with the local IP', '', e)
                if 'proxy_ip.txt' not in os.listdir() or os.path.getsize('proxy_ip.txt') == 0:
                    print('No proxy IP pool found, building a new one')
                    pool = proxy_ip(url, 5)
                    pool.get_ip()
                    print('Proxy IP pool created')
                else:
                    f = open('proxy_ip.txt', 'r')
                    proxys_ip = f.readlines()
                    f.close()
                    random.shuffle(proxys_ip)
                    for i in range(len(proxys_ip)):
                        try:
                            proxy_support = urllib.request.ProxyHandler(eval(proxys_ip[i][:-1]))
                            opener = urllib.request.build_opener(proxy_support)
                            opener.addheaders = [("User-Agent", random.choice(user_agent))]
                            urllib.request.install_opener(opener)
                            response = urllib.request.urlopen(url)
                            readhtml = response.read()
                            content = readhtml.decode('gbk')
                            time.sleep(random.randrange(1, 6))
                            if re.search('Auth Result', content) == None:  # skip responses that flag us as an invalid user
                                state = 1
                                print('Connected through proxy IP', proxys_ip[i])
                                break
                        except Exception as e:
                            print(proxys_ip[i], 'request failed', e)
                        except urllib.error.HTTPError as e:
                            print(proxys_ip[i], 'request failed', e.code)
                        except urllib.error.URLError as e:
                            print(proxys_ip[i], 'request failed', e.reason)
                    try:
                        if i == len(proxys_ip)-1:
                            os.remove('proxy_ip.txt')
                            print('Proxy IP pool exhausted, file deleted')
                    except:  # case where i does not exist (empty pool)
                        os.remove('proxy_ip.txt')
                        print('Proxy IP pool empty, file deleted')
                    time.sleep(60)
                test += 1
        if test == 5:
            print('Failed to fetch page content for %s' % url)
            content = None
        return(content)

4. HTML Parser

The HTML parser works on the page content fetched by the HTML downloader and extracts the data we want from it.

The page-parsing techniques used in this post are mainly regular expressions and BeautifulSoup; the core pattern is sketched below, followed by the full HTML parser code.
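The parser repeatedly narrows the document down with BeautifulSoup and then pulls the cell text out of each tag with the regex '>(.*?)<'. A toy example of that pattern (the HTML fragment is made up for illustration):

from bs4 import BeautifulSoup
import re

html = '<thead><tr><td>代码</td><td>名称</td><td>最新价</td></tr></thead>'  # made-up fragment
soup = BeautifulSoup(html, "lxml")
cells = [re.findall('>(.*?)<', str(td))[0] for td in soup.find_all('td')]
print(cells)   # ['代码', '名称', '最新价']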

# coding:utf-8
import re
from bs4 import BeautifulSoup
import pandas as pd
import urllib.request
import numpy as np
import time
import datetime


class HtmlParser(object):
    '''Parse the page content'''

    def __init__(self, content):
        self.soup = BeautifulSoup(content, "lxml")  # content to be parsed

    def get_header(self):
        '''Get the table header'''
        try:
            header = []
            for tag in self.soup.thead.find_all('td'):
                title = str(tag)
                title = title.replace(' ', '')
                title = title.replace('\n', '')
                header.extend(re.findall('>(.*?)<', title))
            header_name = []
            for data in header:
                if data != '':
                    header_name.append(data.strip())
            header_name.append('数据时间')
        except:  # no header: return an empty list, which also marks the page as not usable
            header_name = []
            return(header_name)
        h2_len = len(self.soup.thead.find_all('td', class_="h2"))
        datalist_len = len(self.soup.find_all('tbody', id="datalist") + self.soup.find_all('tbody', id="datalist1") + self.soup.find_all('tbody', id="datalist2"))
        if h2_len >= 6 or datalist_len == 0:  # rule out inconsistent header layouts and pages without data
            header_name = []
        return(header_name)

    def get_header2(self):
        '''Get the table header when the header has two levels'''
        stati_date = []
        for date in self.soup.thead.find_all('td', class_="double align_center"):
            stati_date.extend(re.findall('>(.*?)<', str(date)))
        header_total = self.get_header()
        header_name = header_total[:-5]
        header_name = header_name[:2] + header_total[-5:-1] + header_name[2:]
        if stati_date[0] in header_name:
            header_name.remove(stati_date[0])
        if stati_date[1] in header_name:
            header_name.remove(stati_date[1])
        header_name.append('三四列统计时间')
        header_name.append('五六列统计时间')
        header_name.append('数据时间')
        return(header_name, stati_date)

    def get_datatime(self):
        '''Get the data date'''
        try:
            date = re.findall('数据时间:(.*?)<', str(self.soup.find_all('span', class_="fl")))[0][0:10]
        except:  # if absent, infer it from the system time
            now_time = time.localtime()
            if time.strftime("%w", now_time) in ['1', '2', '3', '4', '5']:
                date = time.strftime("%Y-%m-%d", now_time)
            elif time.strftime("%w", now_time) == '6':
                dt = (datetime.datetime.now() - datetime.timedelta(days=1))
                date = dt.strftime("%Y-%m-%d")
            else:
                dt = (datetime.datetime.now() - datetime.timedelta(days=2))
                date = dt.strftime("%Y-%m-%d")
        return(date)

    def get_datalist(self):
        '''Get the table data'''
        if len(self.soup.find_all('tbody', id="datalist")) >= 1:
            soup = BeautifulSoup(str(self.soup.find_all('tbody', id="datalist")[0]), "lxml")
        elif len(self.soup.find_all('tbody', id="datalist1")) >= 1:
            soup = BeautifulSoup(str(self.soup.find_all('tbody', id="datalist1")[0]), "lxml")
        else:
            soup = BeautifulSoup(str(self.soup.find_all('tbody', id="datalist2")[0]), "lxml")
        date = self.get_datatime()
        row = len(soup.tbody.find_all('tr'))
        # Initialize the array for the normal-header and double-header cases
        if len(self.soup.thead.find_all('td', class_="double align_center")) == 0:
            header_name = self.get_header()
            col = len(header_name)
            datalist = np.array([''] * (row * col), dtype='U24').reshape(row, col)
            flag = 1
        else:
            header_name = self.get_header2()[0]
            col = len(header_name)
            datalist = np.array([''] * (row * col), dtype='U24').reshape(row, col)
            flag = 2
        for i in range(row):
            # Extract the data and write it into the array
            detail = re.findall('>(.*?)<', str(soup.find_all('tr')[i]))
            for blank in range(detail.count('')):
                detail.remove("")
            try:
                if flag == 1:
                    detail.append(date)
                    datalist[i] = detail
                elif flag == 2:
                    stati_date = self.get_header2()[1]
                    detail.append(stati_date[0])
                    detail.append(stati_date[1])
                    detail.append(date)
                    datalist[i] = detail
            except:
                datalist[i][0] = detail[0]
                datalist[i][col-1] = date
        return(datalist, header_name)

    def get_dataframe(self):
        '''Combine the header and the data into a data frame and return it'''
        datalist, header_name = self.get_datalist()
        table = pd.DataFrame(datalist, columns=header_name)
        return(table)

5. Data Writer

The data writer stores the data produced by the parser. There are many storage options; this post uses a MySQL database.

The parser turns each page of stock data into a data frame, which is then written straight into the database through a database connection engine.
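Under the hood this is just pandas.DataFrame.to_sql on a SQLAlchemy engine. A minimal, self-contained sketch (the credentials, database name, and table name here are placeholders, not the ones used later in this post):

from sqlalchemy import create_engine
import pandas as pd

# placeholder credentials and database name
engine = create_engine('mysql+pymysql://user:password@localhost:3306/stockdb?charset=utf8')
df = pd.DataFrame({'code': ['600000'], 'price': ['10.00']})
df.to_sql(name='demo_table', con=engine, if_exists='append', index=False)  # appends rows, creating the table if it is missing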

Below is the code for the data writer module:

import pymysql
from sqlalchemy import create_engine
import pandas as pd
from firstSpider.HtmlParser import HtmlParser


class DataOutput(object):
    '''Store the data in a MySQL database'''

    def __init__(self, engine, table, table_name):
        self.engine = engine          # database connection engine
        self.table = table            # data frame to store
        self.table_name = table_name  # table name

    def output(self):
        self.table.to_sql(name=self.table_name, con=self.engine, if_exists='append', index=False, index_label=False)

6. Crawler Scheduler

The crawler scheduler ties the modules above together, dividing the work sensibly so the job gets done efficiently.

The scheduler uses a thread pool to speed up execution. Below is the code for the crawler scheduler module:

from firstSpider.UrlManager import UrlManager
from firstSpider.HtmlDownloader import HtmlDownloader
from firstSpider.HtmlParser import HtmlParser
from firstSpider.DataOutput import DataOutput
from sqlalchemy import create_engine
import threadpool, time


class SpiderMan(object):
    '''Crawler robot'''

    def __init__(self, engine, table_name):
        self.engine = engine                # database connection engine
        self.table_name = table_name        # table name
        self.manager = UrlManager()         # URL manager
        self.downloader = HtmlDownloader()  # HTML downloader

    def spider(self, url):
        '''Crawl a single page'''
        # HTML downloader fetches the page
        html = self.downloader.download(url)
        f = open('stock.txt', 'w')
        f.write(html)
        f.close()
        # HTML parser extracts the page data
        parser = HtmlParser(html)
        if len(parser.get_header()) > 0:
            data = parser.get_dataframe()
            # data writer stores the data
            out = DataOutput(self.engine, data, self.table_name)
            out.output()
            print('Data from %s written to table %s' % (url, self.table_name))
        time.sleep(1)
        return(parser.get_datatime())

    def crawl(self, urls):
        '''Crawl every link in a column's link list'''
        self.manager.add_new_urls(urls)
        # Keep going while the URL manager still has new URLs
        pool = threadpool.ThreadPool(10)
        while(self.manager.has_new_url()):
            # Get a new URL from the URL manager
            new_url = self.manager.get_new_url()
            requests = threadpool.makeRequests(self.spider, (new_url,))
            pool.putRequest(requests[0])
        pool.wait()

Create one .py file for each of the modules above, put them all in the firstSpider folder, and run the main program below to crawl the stock data of the entire 证券之星 site:

from firstSpider.get_proxy_ip import proxy_ip
from firstSpider.get_catalog import get_catalog
from firstSpider.get_urls import get_urls
from firstSpider.SpiderMan import SpiderMan
from selenium import webdriver
from sqlalchemy import create_engine
import time

'''Download all of the day's data from 证券之星 by following the left-hand sub-navigation'''
if __name__ == "__main__":
    print('Fetching proxy IPs and checking that they work')
    ip_pool = proxy_ip('http://quote.stockstar.com', 8)
    ip_pool.get_ip()
    print('Proxy IP pool built')
    getcata = get_catalog()
    catalog = getcata.load_catalog()
    start = 0
    end = len(catalog)
    catalog = catalog[start:end]
    print('Initializing the browser')
    browser = webdriver.Chrome()
    engine = create_engine('mysql+pymysql://root:Jwd116875@localhost:3306/scott?charset=utf8')
    for index in range(start, end):
        table_name, url = getcata.index_info(catalog, index)
        stop_url = ['http://quote.stockstar.com/gold/globalcurrency.shtml']  # pages we want to skip
        if url not in stop_url:
            geturls = get_urls(browser, url)
            urls = geturls.get_urllist()
            print('Got the link list for %s' % table_name)
            Spider_man = SpiderMan(engine, table_name)
            Spider_man.crawl(urls)
            datatime = Spider_man.spider(urls[0])
            print('%s: column %s, incremental data dated %s crawled' % (index, table_name, datatime))

Small as it is, this simple crawler framework has all the essential parts, and the above is a full-site crawl built on it. There is still plenty of room to improve execution speed and request disguise, and I hope to keep learning and improving together with you.


Reposted from www.cnblogs.com/sjzh/p/7657882.html