scrapy具体介绍就不用说了,自己百度一下。或者参考以下文档
https://blog.csdn.net/u011054333/article/details/70165401
直接在cmd里运行
scrapy startproject huaidan
scrapy genspider huaidan huaidan4.com
（注意：spider 名与项目名相同时 genspider 会报错，可先换个名字生成，再把代码里的 name 改回 "huaidan"）
然后把以下代码保存为 spiders 文件夹下的 huaidan.py
1 # -*- coding: utf-8 -*- 2 import scrapy 3 from scrapy.http import Request 4 from urllib import parse 5 import re 6 7 class huaidan(scrapy.Spider): 8 name = "huaidan" 9 allowed_domains = ["www.huaidan4.com"] 10 start_urls = ["http://www.huaidan4.com/di-yi-fen-lei.html", 11 "http://www.huaidan4.com/di-er-juan.html", 12 "http://www.huaidan4.com"] 13 14 15 #提取下一页文章url交给scrpy进行下载 16 def parse(self, response): 17 #获取文章url 18 all_article=response.css('.container ul li a::attr(href)').extract() 19 all_url=[] 20 for article_url in all_article: 21 if article_url in all_url: 22 pass 23 else: 24 all_url.append(article_url) 25 yield Request(url=article_url,encoding='utf-8',callback=self.parse_detail) 26 27 28 29 30 #提取文章的具体字段 31 def parse_detail(self,response): 32 #获取文章标题 33 article_title = response.xpath('//*[@id="content"]/div[1]/div[1]/h2/text()').extract_first() 34 35 #获取创建时间 36 create_time = response.xpath('//*[@id="content"]/div[1]/div[1]/span/text()[2]').extract_first().strip() 37 38 #获取文章正文 39 article_text = response.css('.post_entry,p::text').extract_first() 40 #处理正文标点符号和无用的信息 41 article_text = re.sub('</?\w+[^>]*>','',article_text) 42 article_text = article_text.replace("\', \'","") 43 article_text = article_text.replace("\\u3000","").strip() 44 article_text = article_text.replace("\\xa0\\xa0\\xa0\\xa0","") 45 article_text = article_text.replace("(新书上传,求收藏,推荐!!!!!!!!!!!!!!!!!!!!)","") 46 article_text = article_text.replace("\\r\\n", "\n") 47 article_text = article_text.replace("免费小说", "") 48 article_text = article_text.replace("www.huaidan4.com", "") 49 article_text = article_text.replace("neirong_2();", "") 50 article_text = article_text.replace("dibutuijian();", "") 51 article_text = article_text.replace("◎欢迎参与讨论,请在这里发表您的看法、交流您的观点。", "") 52 article_text = article_text.replace("《坏蛋是怎样炼成的4》是继曹三少坏蛋是怎样炼成的3的又一作品,作者是曹三少,如果你喜欢坏蛋是怎样炼成的4,请收藏本站以便下次阅读。","") 53 article_text = re.sub('/?\s+', '', article_text) 54 55 #保存文件 56 
self.save_article(article_title,create_time,str(article_text)) 57 58 #保存文件的方法 59 def save_article(self,article_title,create_time,article_text): 60 biaoti = re.sub('\W+','-',article_title) 61 with open(biaoti+'.txt','w',encoding='utf-8') as file: 62 neirong = (article_title+'\n'+create_time+'\n'+article_text) 63 file.write(neirong) 64 file.close()