Scrapy笔记
scrapy使用
1、创建scrapy项目
scrapy startproject mySpider
2、编写第一个scrapy爬虫
#可以用于调试xpath或css选择器
scrapy shell "https://tieba.baidu.com/f?kw=%E6%9D%8E%E6%AF%85%E5%90%A7"  # URL 需加引号，防止 shell 解析 ? 和 = 等特殊字符
运行爬虫
scrapy crawl first_spider #first_spider是你在类中定义的name值
爬取李毅吧下的第一页帖子链接和文本:
import scrapy
class firstSpider(scrapy.Spider):
    """Print the link and title text of every thread on the first
    listing page of the Li Yi Tieba forum."""

    name = 'first_spider'

    def start_requests(self):
        # Single seed request; the downloaded page is handed to self.parse.
        start_url = "https://tieba.baidu.com/f?kw=%E6%9D%8E%E6%AF%85%E5%90%A7"
        yield scrapy.Request(start_url, callback=self.parse)

    def parse(self, response):
        # Anchor elements holding each thread's title on the listing page.
        thread_xpath = '//ul[@id="thread_list"]/li[@class=" j_thread_list clearfix"]//div[@class="threadlist_lz clearfix"]/div/a'
        for anchor in response.xpath(thread_xpath):
            href = anchor.xpath('@href').extract_first()
            text = anchor.xpath('text()').extract_first()
            print('text: %s, href: %s' % (text, href))
if __name__ == '__main__':
    # Spiders are launched with `scrapy crawl first_spider`,
    # not by executing this module directly.
    pass
爬取李毅吧下的下一页帖子链接和文本:
import scrapy
class firstSpider(scrapy.Spider):
    """Crawl every listing page of the Li Yi Tieba forum, appending each
    thread's title and link to a local text file, then follow the
    "next page" pager link until none remains."""

    name = 'second_spider'

    def start_requests(self):
        # Single seed request; each downloaded page goes to self.parse.
        start_url = "https://tieba.baidu.com/f?kw=%E6%9D%8E%E6%AF%85%E5%90%A7"
        yield scrapy.Request(start_url, callback=self.parse)

    def parse(self, response):
        # Anchor elements holding each thread's title on the listing page.
        xpath = '//ul[@id="thread_list"]/li[@class=" j_thread_list clearfix"]//div[@class="threadlist_lz clearfix"]/div/a'
        link_list = response.xpath(xpath)
        filename = '李毅吧贴子内容.txt'
        # FIX: open the file once per page instead of re-opening it for
        # every single link; mode 'a' still appends across pages/runs,
        # so the written output is unchanged.
        with open(filename, 'a', encoding='utf-8') as f:
            for link in link_list:
                href = link.xpath('@href').extract_first()
                text = link.xpath('text()').extract_first()
                line = 'text: %s, href: %s' % (text, href)
                print(line)
                f.write(line)
                f.write('\n')
        # Second-to-last pager anchor is "next page"; extract_first()
        # returns None on the last page, which stops the crawl.
        next_page = response.css('#frs_list_pager a:nth-last-child(2)::attr(href)').extract_first()
        if next_page is not None:
            # The pager href is protocol-relative ("//tieba.baidu.com/...");
            # response.urljoin resolves that (and plain relative paths)
            # instead of blindly prefixing "https:".
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)
if __name__ == '__main__':
    # Run with `scrapy crawl second_spider`; this guard is a no-op.
    pass
urls的简写形式
import scrapy
class firstSpider(scrapy.Spider):
    """Same first-page crawl as first_spider, but uses the `start_urls`
    shorthand instead of overriding start_requests()."""

    name = 'three_spider'

    # With start_urls, Scrapy builds the initial requests itself and
    # routes every response to parse() by default.
    start_urls = [
        "https://tieba.baidu.com/f?kw=%E6%9D%8E%E6%AF%85%E5%90%A7",
        "https://www.baidu.com"
    ]

    def parse(self, response):
        # Anchor elements holding each thread's title on the listing page.
        thread_xpath = '//ul[@id="thread_list"]/li[@class=" j_thread_list clearfix"]//div[@class="threadlist_lz clearfix"]/div/a'
        for anchor in response.xpath(thread_xpath):
            href = anchor.xpath('@href').extract_first()
            text = anchor.xpath('text()').extract_first()
            print('text: %s, href: %s' % (text, href))
if __name__ == '__main__':
    # Run with `scrapy crawl three_spider`; nothing to do when imported.
    pass