I wrote this article while first learning web scraping: I picked the site 看书海 to crawl, and gradually worked new topics into it (multithreading, proxies).
The overall approach is a four-step plan:
1. Page crawler
The 连载书库 (serialized library) link is the catalog listing every novel on the site, more than 2,000 pages in all. The goal is to collect the URLs of all 2,000+ pages.
2. Book-link crawler
Using the 2,000+ page links from step 1, scrape the URL of every book.
3. All-chapters-per-book crawler
Using the book URLs from step 2, scrape the link of every single chapter of every book.
4. Single-chapter crawler (proxies + multithreading)
Using the chapter links gathered above, crawl them one by one.
1. Scraping every catalog page and storing it in the database
First create a database table; its structure was shown as a screenshot in the original post (not reproduced here). From the SQL used below, it needs a pageid column, a pageurl column, and a finished flag that step 2 fills in.
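A minimal sketch of equivalent DDL, reconstructed from the INSERT and UPDATE statements in the scripts below; the column types and lengths are my guesses, not from the original:

cursor.execute("""
    CREATE TABLE IF NOT EXISTS PageList (
        pageid   INT PRIMARY KEY,   -- page number sliced out of the href
        pageurl  VARCHAR(255),      -- URL of that catalog page
        finished TINYINT NULL       -- set to 1 by step 2 once the page has been processed
    )
""")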
import requests
from bs4 import BeautifulSoup
from lxml import etree  # used to build sel/result and to parse the URLs out of the HTML
import time
import datetime
import numpy
import mysql.connector

database = mysql.connector.connect(user='XXXXX', password='XXXXX', database='看书海')
cursor = database.cursor()

headers = {'content-type': 'application/json',
           'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'}

url = '...'  # starting page of the 连载书库 catalog (the actual URL was not shown in the original post)

def get_pageurl():
    global url
    while True:
        r = requests.get(url, headers=headers)
        html = r.text.encode(r.encoding).decode('GB18030')
        soup = BeautifulSoup(html, 'lxml')
        soup = soup.find("div", {"class": "pagelink", "id": "pagelink"})
        sel = etree.HTML(str(soup))
        result = set(sel.xpath('//a/@href'))
        for row in result:
            try:
                cursor.execute("INSERT INTO PageList(pageid,pageurl) VALUES (%s,%s)",
                               (row[row.find('_')+1:row.find('.html')], str(row)))
                database.commit()
                print(row[row.find('_')+1:row.find('.html')], 'done')
            except:
                print(row[row.find('_')+1:row.find('.html')], 'pass')
                continue
        # move url to the link behind the ">" arrow, which saves two redundant fetches
        url = sel.xpath('//a[@class="ngroup"]/@href')[0]
        time.sleep(numpy.random.randint(1, 5))
        print('done')

falltimes = 0
def start():
    global falltimes
    try:
        get_pageurl()
    except:
        falltimes += 1
        print('failed', falltimes, 'times')
        time.sleep(10)
        start()

start()
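The pageid is sliced out of each pagination href between the '_' and the '.html'. A quick illustration, with the filename pattern assumed (the real hrefs are not shown in the post):

href = 'index_17.html'  # hypothetical pagination href
pageid = href[href.find('_')+1:href.find('.html')]
print(pageid)  # -> '17'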
The results are as in the screenshot from the original post (not reproduced here).. please don't roast the pattern in the data...
2. Scraping the book links
import requests
from bs4 import BeautifulSoup
from lxml import etree  # used to build sel/result and to parse the URLs out of the HTML
import time
import datetime
import numpy
import traceback  # used to trace errors
import mysql.connector

database = mysql.connector.connect(user='root', password='XXXXX', database='看书海')
cursor = database.cursor()

headers = {'content-type': 'application/json',
           'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'}

def get_bookurl():
    cursor.execute("SELECT * FROM pagelist WHERE ISNULL(finished)")
    pagelist = cursor.fetchall()
    print('Database read complete!')
    print('Starting from page', pagelist[0][0])
    for row in pagelist:
        time_1 = datetime.datetime.now()
        url = row[1]
        pageid = row[0]
        r = requests.get(url, headers=headers)
        html = r.text.encode(r.encoding).decode('GB18030')
        soup = BeautifulSoup(html, 'lxml')
        soup = soup.find("div", {"id": "bookinfo"})
        sel = etree.HTML(str(soup))
        result = sel.xpath('//dt//a[@target="_blank"]/@href')
        for href in result:
            try:
                bookid = href[href.find('/', href.find('/', int(href.find('/', 10)+1)+1)+1)+1:
                               href.find('/', href.find('/', href.find('/', int(href.find('/', 10)+1)+1)+1)+1)]
                # store bookid and bookurl in the database; bookid is unique
                cursor.execute("INSERT INTO booklist(pageid,bookid,bookurl) VALUES (%s,%s,%s)",
                               (pageid, bookid, str(href)))
                database.commit()
            except:
                print(bookid, 'pass')
                continue
        time.sleep(numpy.random.randint(1, 3))
        print('Finished page', pageid, 'in', (datetime.datetime.now()-time_1).seconds, 's')
        cursor.execute("UPDATE pagelist SET finished = %s WHERE pageid = %s", (1, pageid))
        database.commit()

falltimes = 0
def start():
    global falltimes
    try:
        get_bookurl()
    except:
        traceback.print_exc()
        falltimes += 1
        print('failed', falltimes, 'times')
        time.sleep(100)
        start()

time_start = datetime.datetime.now()
start()
print('Total time:', datetime.datetime.now()-time_start)
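The nested find('/') arithmetic above digs the numeric book id out of each link as the third path segment after the host. An equivalent, easier-to-read sketch, assuming the links look like http://host/files/article/12345/ (the exact path layout is my assumption):

href = 'http://host/files/article/12345/'  # hypothetical book URL shape
bookid = href.split('/')[5]                # third path segment after the host
print(bookid)                              # -> '12345'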
As in the screenshot above (omitted here). Note that the introduction and author fields are only filled in during step three.
3. Scraping the chapter links
This is the last table, chapterlist.
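Its screenshot is also omitted here; a sketch of what chapterlist must contain, inferred from the INSERT and UPDATE statements in this step and in step 4 (types are my assumptions):

cursor.execute("""
    CREATE TABLE IF NOT EXISTS chapterlist (
        bookid      INT,                   -- book the chapter belongs to
        chapterid   CHAR(10) PRIMARY KEY,  -- zero-padded bookid + chapter number
        chapterurl  VARCHAR(255),
        chaptername VARCHAR(255) NULL,     -- filled in by step 4
        content     MEDIUMTEXT NULL,       -- filled in by step 4
        iserror     TINYINT NULL           -- flags pages that fail to decode
    )
""")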
import requests
from bs4 import BeautifulSoup
from lxml import etree  # used to build sel/result and to parse the URLs out of the HTML
import time
import datetime
import numpy
import traceback  # used to trace errors
import mysql.connector

headers = {'content-type': 'application/json',
           'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'}

# email alerts
from email.header import Header
from email.mime.text import MIMEText
from email.utils import parseaddr, formataddr
import smtplib

def _format_addr(s):
    name, addr = parseaddr(s)
    return formataddr((Header(name, 'utf-8').encode(), addr))

def send_mail(sendMessage):
    from_addr = 'XXXXX@163.com'
    password = 'XXXXX'
    to_addr = 'XXXXX@foxmail.com'
    smtp_server = 'smtp.163.com'
    sendTitle = 'GET CHAPTER crawl failed!'
    sendText = sendMessage
    msg = MIMEText(sendText, 'plain', 'utf-8')
    msg['From'] = _format_addr('Jack<%s>' % from_addr)
    msg['To'] = _format_addr(' <%s>' % to_addr)
    msg['Subject'] = Header(sendTitle, 'utf-8').encode()
    server = smtplib.SMTP(smtp_server, 25)
    server.login(from_addr, password)
    server.sendmail(from_addr, [to_addr], msg.as_string())
    server.quit()

# Walk booklist, fetch each book's page, and grab author/bookname/introduction.
# finished=1 means done; finished=2 means an unknown error while fetching the URL;
# finished=3 means a UnicodeDecodeError.
def get_book():
    database = mysql.connector.connect(user='root', password='XXXXX', database='看书海')
    cursor = database.cursor()
    cursor.execute("SELECT * FROM booklist WHERE ISNULL(finished) ORDER BY bookid;")
    booklist = cursor.fetchall()
    for book in booklist:
        time_1 = datetime.datetime.now()
        url = book[2]
        bookid = book[1]
        print(url, end=' ')
        try:
            r = requests.get(url, headers=headers, timeout=20)
            html = r.text.encode(r.encoding).decode('GB18030')
        except UnicodeDecodeError:
            print('UnicodeDecodeError !! continue')
            cursor.execute("UPDATE booklist SET finished = 3 WHERE bookid=%s" % str(bookid))
            database.commit()
            send_mail('UnicodeDecodeError')
            continue
        except requests.exceptions.ConnectTimeout:
            print('Time out')
            send_mail('ConnectTimeout. Sleep 500 seconds')
            time.sleep(500)
            continue  # not marked finished, so the book is retried on the next pass
        except:
            print('request failed, sleeping 300 seconds')
            cursor.execute("UPDATE booklist SET finished = 2 WHERE bookid=%s" % str(bookid))
            database.commit()
            send_mail(traceback.format_exc())
            time.sleep(300)
            continue
        soup = BeautifulSoup(html, 'lxml')
        # book title: strip the trailing "全文阅读" ("read full text") suffix
        title = soup.find("div", {"id": "title"}).string
        bookname = title[0:title.find('全文阅读')]
        # book introduction
        introction_list = []
        for row in soup.find("div", {"class": "intro"}).stripped_strings:
            introction_list.append(row)
        introction = ''.join(introction_list[1:])  # join into one string, skipping the leading "内容简介:" label
        # author
        author_list = []
        for row in soup.find("div", {"class": "rep"}).strings:
            author_list.append(row)
        author = author_list[1]
        # write the title/introduction/author to the database
        cursor.execute("UPDATE booklist SET bookname=%s,introduction=%s,bookauthor=%s WHERE bookid=%s",
                       (str(bookname), str(introction), str(author), bookid))
        database.commit()
        # chapter links
        sel = etree.HTML(str(soup.find("div", {"id": "book"})))
        result = sel.xpath('//@href')
        n = 1
        for row in result:
            chapterurl = url + row
            bookid_str = str(bookid).zfill(5)
            n_str = str(n).zfill(5)
            chapter_id = '%s%s' % (bookid_str, n_str)
            try:
                cursor.execute("INSERT INTO chapterlist(bookid,chapterid,chapterurl) VALUES (%s,%s,%s)",
                               (bookid, chapter_id, chapterurl))
            except mysql.connector.errors.IntegrityError:
                print(chapter_id, 'already exists')
                continue
            n = n + 1
        cursor.execute("UPDATE booklist SET finished = 1 WHERE bookid=%s" % str(bookid))
        database.commit()
        print(bookid, bookname, 'done! took', (datetime.datetime.now()-time_1).seconds, 'seconds;', n, 'chapters')
        time.sleep(numpy.random.randint(0, 5))
    database.close()

def start():
    try:
        get_book()
    except:
        sendMessage = 'Error trace:\n' + traceback.format_exc()
        print('Something went wrong. Restarting after 100 s')
        send_mail(sendMessage)
        time.sleep(100)
        start()

start()
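The chapterid concatenates the zero-padded book id and chapter number, which is what keeps it unique across every book. For example:

bookid, n = 123, 45
chapter_id = '%s%s' % (str(bookid).zfill(5), str(n).zfill(5))
print(chapter_id)  # -> '0012300045'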
Compared with the earlier steps, the main addition here is the email alert on errors, which makes it easy to tell whether the crawler is still running.
This takes roughly a day or two to run and yields about 20 million rows, i.e. roughly 20 million chapters.
(Next up: actually crawling all 20 million chapters!!)
4. Fetching the chapter text (proxies + multithreading)
Fetching the text is the big one: as mentioned, there are 20 million URLs to crawl. Sticking with the sleep approach used above would take years...
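Rough arithmetic: the earlier scripts sleep numpy.random.randint(1,5) seconds per request, about 2.5 s on average, so 20,000,000 × 2.5 s ≈ 5×10⁷ s ≈ 579 days of single-threaded crawling, before even counting the requests themselves.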
So I use a proxy to avoid getting IP-banned, plus multithreading to speed up the fetching (the threads are all just waiting on I/O anyway, so they may as well all wait together).
import mysql.connector
database = mysql.connector.connect(user='root', password='XXXXX', database='看书海')
cursor = database.cursor()

import datetime
import time
import threading
import requests
from bs4 import BeautifulSoup
from lxml import etree  # used to build sel/result and to parse the URLs out of the HTML
import traceback

headers = {'content-type': 'application/json',
           'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'}

# email alerts
from email.header import Header
from email.mime.text import MIMEText
from email.utils import parseaddr, formataddr
import smtplib

def _format_addr(s):
    name, addr = parseaddr(s)
    return formataddr((Header(name, 'utf-8').encode(), addr))

def send_mail(sendTitle, sendMessage):
    try:
        from_addr = 'XXXXX@163.com'
        password = 'XXXXX'
        to_addr = 'XXXXX@foxmail.com'
        smtp_server = 'smtp.163.com'
        sendText = sendMessage
        msg = MIMEText(sendText, 'plain', 'utf-8')
        msg['From'] = _format_addr('Jack<%s>' % from_addr)
        msg['To'] = _format_addr(' <%s>' % to_addr)
        msg['Subject'] = Header(sendTitle, 'utf-8').encode()
        server = smtplib.SMTP(smtp_server, 25)
        server.login(from_addr, password)
        server.sendmail(from_addr, [to_addr], msg.as_string())
        server.quit()
    except:
        print('Failed to send email')

# Build the proxies dict so requests can use it below; hence the global.
def get_proxies():
    global proxies
    # test page offered by the proxy provider
    targetUrl = "http://test.abuyun.com"
    # proxy server
    proxyHost = "http-pro.abuyun.com"
    proxyPort = "9010"
    # tunnel credentials, provided by the vendor (Abuyun)
    proxyUser = "XXXXX"
    proxyPass = "XXXXX"
    proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": proxyHost,
        "port": proxyPort,
        "user": proxyUser,
        "pass": proxyPass,
    }
    proxies = {
        "http": proxyMeta,
        "https": proxyMeta,
    }
The above is fairly basic setup code.
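Note that get_proxies() defines targetUrl, the provider's test page, but never actually uses it. A quick sanity check of the tunnel might look like this (my sketch, not in the original):

get_proxies()
r = requests.get("http://test.abuyun.com", headers=headers, proxies=proxies, timeout=10)
print(r.status_code)  # 200 means the tunnel credentials work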
def get_list():
    global value_database
    cursor.execute(
        "SELECT c.bookid, c.chapterid, c.chapterurl, b.bookname "
        "FROM chapterlist AS c "
        "LEFT JOIN booklist AS b ON b.bookid = c.bookid "
        "WHERE ISNULL(c.chaptername) AND ISNULL(c.iserror) LIMIT 4")
    value_database = cursor.fetchall()
    print('Database read complete')
    # LIMIT 4 caps the number of threads per batch. It is 4 not because of four cores,
    # but because the proxy only allows 4 requests per second.

error_number = 0
def get_content(chapterid, url):
    global allcontent_list, error_number
    try:  # use try..except to skip over bad data
        r = requests.get(url, headers=headers, timeout=10, proxies=proxies)
        html = r.text.encode(r.encoding).decode('gb18030')
    except requests.exceptions.ReadTimeout:
        print(chapterid, 'ReadTimeout', end=" ")
        error_number += 1
    except requests.exceptions.ConnectionError:
        print(chapterid, 'ConnectionError', end=" ")
        error_number += 1
    except UnicodeDecodeError:
        # Some pages simply fail to decode; that is the site's problem. Flag them with
        # iserror so the link is never picked up again.
        cursor.execute("UPDATE chapterlist SET iserror = 1 WHERE chapterid = %s", (chapterid,))
        print('decode error')
        error_number += 1
    except TypeError:
        print(chapterid, 'TypeError')
        error_number += 1
    except:
        sendMessage = str(chapterid) + ' error trace:\n' + traceback.format_exc()
        send_mail("看书海 multithread failure", sendMessage)
        print(chapterid, 'error', end=" ")
    else:
        soup = BeautifulSoup(html, 'lxml')
        chapter_content_html = soup.find("div", {"id": "con_L"})
        content = ' '
        for string in chapter_content_html.stripped_strings:
            content = content + string + '\n' + ' '
        charpter_name = soup.h1.string
        content_list = [str(charpter_name), str(content), int(chapterid)]
        allcontent_list.append(content_list)
get_list(): pulls the URLs that still need crawling from the database, which also guarantees nothing is fetched twice.
get_content(chapterid, url): fetches the chapter content for a chapterid/url pair; the results are collected and written to the database once the batch of threads finishes.
Setting up the threads
def multi():
    global allcontent_list
    allcontent_list = []
    threads = []
    for row in value_database:
        chapterid = row[1]
        url = row[2]
        t = threading.Thread(target=get_content, args=(chapterid, url))
        threads.append(t)
    for t in threads:
        t.setDaemon(True)
        t.start()
    for t in threads:
        t.join()
    # Multithreaded database writes caused errors, so each thread's result is collected
    # in a list and written out in one go after every thread has finished.
    for row in allcontent_list:
        cursor.execute("UPDATE chapterlist SET chaptername = %s, content = %s WHERE chapterid = %s",
                       (row[0], row[1], row[2]))
        database.commit()
    print('Database write complete')
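One rough edge worth noting: error_number is incremented from several threads without a lock (list.append is atomic under CPython's GIL, but += on an int is not), and the iserror UPDATE in get_content still touches the shared cursor from a worker thread. A sketch of the same batch using concurrent.futures instead; this is my suggestion, not what the original ran:

from concurrent.futures import ThreadPoolExecutor

def multi_pool():
    global allcontent_list
    allcontent_list = []
    # at most 4 workers, matching the proxy's 4-requests-per-second limit
    with ThreadPoolExecutor(max_workers=4) as pool:
        for row in value_database:
            pool.submit(get_content, row[1], row[2])  # (chapterid, chapterurl)
    # the with-block waits for every task, so this single-threaded DB write is safe
    for name, content, chapterid in allcontent_list:
        cursor.execute("UPDATE chapterlist SET chaptername = %s, content = %s WHERE chapterid = %s",
                       (name, content, chapterid))
    database.commit()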
n = 0  # batch counter
get_proxies()  # build the proxy settings
while True:
    t1 = datetime.datetime.now()
    get_list()
    multi()
    n += 1
    print('Finished batch', n, 'in', (datetime.datetime.now()-t1).seconds, 'seconds;',
          'failed', error_number, 'times so far')
    if n % 200 == 0:  # mail myself a progress report every 200 batches
        sendMessage = 'Finished ' + str(n) + ' batches, failed ' + str(error_number) + ' times'
        send_mail("看书海 multithread progress report", sendMessage)
The crawler above is written very crudely, but it can basically pull down all of the novels on 看书海.
The proxy is paid, 1 yuan per hour; I crawled for one hour and then stopped, so as not to hammer the site's servers.
There are plenty of places this could be improved. I don't know whether anyone will ever read this, but I'd be grateful for any pointers. Thanks!