Getting started with Scrapy
1. Create a Scrapy project
scrapy startproject myspider
2. Generate a spider
scrapy genspider itcast itcast.cn
3. Extract the data
Flesh out the spider, extracting data with XPath and similar methods
4. Save the data
Save the data in a pipeline (the end-to-end command sequence is sketched below)
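Putting the four steps together, the workflow looks roughly like this (project and spider names taken from the commands above; the paths are the ones Scrapy's project template normally generates):

scrapy startproject myspider          # 1. create the project
cd myspider
scrapy genspider itcast itcast.cn     # 2. generate the spider at myspider/spiders/itcast.py
# 3. edit myspider/spiders/itcast.py and extract data with XPath
# 4. save the data in myspider/pipelines.py (enable it in settings.py), then run:
scrapy crawl itcast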
Explanation
class ItcastSpider(scrapy.Spider):
    name = 'itcast'                  # name of the spider
    allowed_domains = ['itcast.cn']  # domains the spider is allowed to crawl
    start_urls = ['https://www.itcast.cn/channel/teacher.shtml']  # the URLs requested first

    def parse(self, response):
        ret1 = response.xpath("//div[@class='maincon']//h2/text()").extract()
extract(): returns the text of every match as a list of strings.
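For comparison, this is roughly what the two common extraction calls return (run in the Scrapy shell against the same page; the XPath is the one used above):

response.xpath("//div[@class='maincon']//h2/text()").extract()        # a list with the text of every matched <h2>
response.xpath("//div[@class='maincon']//h2/text()").extract_first()  # only the first match, or None if nothing matched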
Commands
cd <directory>: change into the project directory.
tree: show the directory tree.
scrapy crawl itcast: run the spider.
dir: list the files and folders in the current directory.
Learning progress
The itcast.py file
①
import scrapy

class ItcastSpider(scrapy.Spider):
    name = 'itcast'
    allowed_domains = ['itcast.cn']
    start_urls = ['https://www.itcast.cn/channel/teacher.shtml']

    def parse(self, response):
        # handle the response for the start_urls addresses
        ret1 = response.xpath("//div[@class='maincon']//h2/text()").extract()
        print(ret1)
②
import scrapy

class ItcastSpider(scrapy.Spider):
    name = 'itcast'
    allowed_domains = ['itcast.cn']
    start_urls = ['https://www.itcast.cn/channel/teacher.shtml']

    def parse(self, response):
        # handle the response for the start_urls addresses
        #ret1 = response.xpath("//div[@class='maincon']//h2/text()").extract()
        #print(ret1)
        # group the results: each <li> holds one teacher's info
        li_list = response.xpath("//div[@class='maincon']//li")
        for li in li_list:
            item = {}
            item["name"] = li.xpath(".//h2/text()").extract()[0]
            item["title"] = li.xpath(".//span/text()").extract()[0]
            #item["入职时间"] = li.xpath(".//h3/text()").extract()
            print(item)
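So far the items are only printed. To hand them to a pipeline (step 4 above), parse has to yield them instead; a minimal sketch of the same loop, using extract_first() so a missing node gives None rather than an IndexError:

    def parse(self, response):
        li_list = response.xpath("//div[@class='maincon']//li")
        for li in li_list:
            item = {}
            item["name"] = li.xpath(".//h2/text()").extract_first()
            item["title"] = li.xpath(".//span/text()").extract_first()
            yield item  # passed to every pipeline enabled in ITEM_PIPELINES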
Complete project
Spider file
import scrapy
import json
import requests
import numpy as np
import pandas as pd
from pandas import DataFrame

# 2067111119 Zhu Bo
# The ranking page of the Inner Mongolia University of Science and Technology ACM site cannot be scraped
# from its page source, but I found that by requesting its public URL API, reading the JSON and converting
# it to Python objects, the functionality required by the assignment can still be implemented.
# LOG_LEVEL = "WARNING" is enabled in settings.py to filter out the logs and make debugging easier.
class PaimingSpider(scrapy.Spider):
    name = 'paiming'
    allowed_domains = ['imustacm.cn']
    start_urls = ['https://imustacm.cn/#/imustoj/rankList/1']

    def parse(self, response):
        url = "https://imustacm.cn/api/problem/listRank/1"
        api_text = requests.get(url=url).text
        json_res = json.loads(api_text)
        user_list = json_res['data']['us']
        # create an ndarray
        x = np.array(user_list, dtype=str)  # keep user_list (now a Python object) in x, ready for assignment 2
        # create a Series
        y = pd.Series(user_list)            # keep user_list in y, ready for assignment 3
        # create a DataFrame
        z = DataFrame(user_list)            # keep user_list in z, ready for assignment 4
        for user in user_list:
            item = {}
            item["user_id"] = user["user_id"]
            item["nick"] = user["nick"]
            item["professional"] = user["professional"]
            item["school"] = user["school"]
            if item["professional"] is None:
                item["professional"] = '未知'
            if item["school"] is None:
                item["school"] = '未知'
            yield item
# Running `scrapy crawl paiming` crawls every field of the ranking list and writes it all to a CSV file;
# after checking, this satisfies all the requirements of assignment 1.
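Two side notes on this spider, both assumptions about how it could be restructured rather than part of the assignment code. First, since the pipeline only dumps items to CSV, Scrapy's built-in feed export can do the same with scrapy crawl paiming -o paiming.csv. Second, calling requests inside parse bypasses Scrapy's own downloader and scheduler; a rough sketch that stays inside Scrapy would issue the API call as a scrapy.Request and parse the JSON in its callback:

    def parse(self, response):
        api_url = "https://imustacm.cn/api/problem/listRank/1"
        # let Scrapy download the API response instead of calling requests
        yield scrapy.Request(api_url, callback=self.parse_rank)

    def parse_rank(self, response):
        json_res = json.loads(response.text)
        for user in json_res['data']['us']:
            yield {
                "user_id": user["user_id"],
                "nick": user["nick"],
                "professional": user["professional"] or '未知',  # note: `or` also replaces empty strings, not just None
                "school": user["school"] or '未知',
            }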
Pipeline
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class ImustacmPipeline:
    def open_spider(self, spider):
        # write the CSV header once, when the spider starts
        with open('D:/内蒙古科技大学acm排名表.csv', 'w', encoding="utf-8") as a:
            a.write("学号" + "," + "姓名" + "," + "专业" + "," + "学校" + "\n")

    def process_item(self, item, spider):
        print(item)
        with open('D:/内蒙古科技大学acm排名表.csv', 'a', encoding="utf-8") as a:
            a.write(item["user_id"] + "," + item["nick"] + "," + item["professional"] + "," + item["school"] + "\n")
        return item
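Reopening the file for every item works but is wasteful; a common alternative (a rough sketch, not part of the assignment code) keeps the file handle on the pipeline object and uses the csv module, which also takes care of quoting:

import csv

class ImustacmCsvPipeline:
    def open_spider(self, spider):
        self.f = open('D:/内蒙古科技大学acm排名表.csv', 'w', encoding="utf-8", newline='')
        self.writer = csv.writer(self.f)
        self.writer.writerow(["学号", "姓名", "专业", "学校"])

    def process_item(self, item, spider):
        self.writer.writerow([item["user_id"], item["nick"], item["professional"], item["school"]])
        return item

    def close_spider(self, spider):
        self.f.close()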
Settings file
# Scrapy settings for imustacm project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'imustacm'
SPIDER_MODULES = ['imustacm.spiders']
NEWSPIDER_MODULE = 'imustacm.spiders'
LOG_LEVEL="WARNING"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'imustacm (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'imustacm.pipelines.ImustacmPipeline': 300,
}
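The value 300 is this pipeline's order: Scrapy runs enabled pipelines from the lowest number to the highest, with values conventionally kept in the 0-1000 range. With a second pipeline (the name below refers to the hypothetical CSV pipeline sketched earlier) the dict would look like:

ITEM_PIPELINES = {
    'imustacm.pipelines.ImustacmPipeline': 300,
    'imustacm.pipelines.ImustacmCsvPipeline': 400,  # runs after the 300 pipeline
}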
numpy
import numpy as np
import csv

# Assignment 2: load the CSV produced by the spider into a NumPy ndarray.
with open('内蒙古科技大学acm排名表.csv', "r", encoding="utf-8") as a:
    reader = csv.reader(a)
    rows = []
    for row in reader:
        rows.append(row)
x = np.array(rows, dtype=str)
# Slice the data: keep the first 20 students (row 0 is the CSV header).
y = x[1:21]
# Sort the rows by the first column in ascending order
# (np.lexsort treats the last key in the list as the primary key).
z = np.lexsort([y[:, 2], y[:, 1], y[:, 0]])
sorted_data = y[z, :]
print(sorted_data)
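Because the key order in np.lexsort is easy to get backwards, here is a tiny standalone check (made-up two-column data, not from the assignment) showing that the last key passed is the primary sort key:

import numpy as np

names = np.array(["b", "a", "a"])
scores = np.array(["2", "1", "3"])
order = np.lexsort([scores, names])  # sort by names first, break ties with scores
print(order)  # [1 2 0]: both "a" rows come first, ordered by score, then the "b" row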
series
import pandas as pd
import csv

# Assignment 3: load the CSV produced by the spider into a pandas Series.
with open('内蒙古科技大学acm排名表.csv', "r", encoding="utf-8") as a:
    reader = csv.reader(a)
    rows = []
    for row in reader:
        rows.append(row)
y = pd.Series(rows)  # each element of the Series is one CSV row (a list of fields)
# Slice the Series built from the assignment-1 result: keep the first 20 students (row 0 is the header).
x = y[1:21]
print(x)
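Reading the file with pandas directly gives one Series per column rather than a Series of row lists; a rough alternative (the column name comes from the header the pipeline writes):

import pandas as pd

df = pd.read_csv('内蒙古科技大学acm排名表.csv', encoding="utf-8")
names = df["姓名"]       # the 姓名 column as a Series
print(names.head(20))    # first 20 students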
DataFrame
import csv
from pandas import DataFrame

# Assignment 4: load the CSV produced by the spider into a pandas DataFrame.
with open('内蒙古科技大学acm排名表.csv', "r", encoding="utf-8") as a:
    reader = csv.reader(a)
    rows = []
    for row in reader:
        rows.append(row)
# A DataFrame differs from a Series in that it has two index arrays: a row index and a column index.
# Build the DataFrame from the first 20 students (row 0 of the CSV is the header).
z = DataFrame(rows[1:21], columns=["学号", "姓名", "专业", "学校"])
print(z)
# Group by major and use count() to tally how many of the first twenty students are in each 专业.
g = z.groupby(['专业']).count()
print(g)
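The same per-major tally can also be read off a single column; a short equivalent check (assuming no missing values in the sliced rows):

print(z["专业"].value_counts())  # number of students per 专业, sorted by count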