1. Scrapy
Scrape the various pieces of music information from a web page.
Open a command window in the directory where the project should live and create a new Scrapy project.
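Judging from the imports used later (from testwangyi.items import ...), the project in these notes is named testwangyi, so the command would be:

    scrapy startproject testwangyi

This generates the items, settings, middlewares and pipelines files, plus the spiders directory, that the following steps edit.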
Step 1: Define the item class in the items file.
import scrapy
from scrapy import Field

class TestwangyiItem(scrapy.Item):
    # Define the fields for your item here, one Field() per value to scrape:
    title = Field()
    number = Field()
    music_name = Field()
    img_src = Field()
    music_src = Field()
    music_singer = Field()
Step 2: Create a new file in the spiders directory and write the crawler.
from scrapy.spiders import CrawlSpider
from scrapy.selector import Selector
from testwangyi.items import TestwangyiItem
import re

class music(CrawlSpider):
    name = 'wangyi'
    start_urls = ['http://xyq.163.com/download/down_music.html']

    def parse(self, response):
        # parse() is the default callback: it receives the response for
        # every URL in start_urls.
        # print(response.text)
        item = TestwangyiItem()  # the item class defined in items.py
        reg = re.compile('.*下载.*')  # matches anchors whose text contains "下载" (download)
        selector = Selector(response)
        # Alternative ways to grab the <a> tags:
        # all_a = selector.xpath("//a[@download]").re(reg)
        # all_a = selector.xpath("//a[@download]").extract()  # list of strings
        # all_a = selector.xpath("//table//tr").extract()     # every <tr> in the tables
        all_a = selector.css('td.tTitle ::text').extract()
        for i in all_a:
            item['title'] = i
            yield item  # hand the populated item to the pipeline
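A selector can be tried out interactively before it goes into the spider; Scrapy's shell fetches the page and exposes the response object (the CSS selector below is the one the spider above uses):

    scrapy shell http://xyq.163.com/download/down_music.html
    >>> response.css('td.tTitle ::text').extract()[:5]   # first few matches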
Step 3: Configure the settings file (user agent, feed export, middleware, pipelines).
# User agent sent with every request
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3)' \
             ' AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5'

# Feed export: where to save the output file, and in which format
FEED_URI = u'file:///D:/pycharm2017.3.2/work/scrapy 0608/doubanTU/douban.csv'
FEED_FORMAT = 'CSV'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# DOWNLOAD_DELAY = 3  # delay (in seconds) between requests

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    'testwangyi.middlewares.TestwangyiSpiderMiddleware': 543,
}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'testwangyi.middlewares.TestwangyiDownloaderMiddleware': 543,
}

# Configure item pipelines (required for the pipeline in step 5 to run)
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'testwangyi.pipelines.TestwangyiPipeline': 300,
}
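Incidentally, the same feed export can be requested without touching settings.py; Scrapy's -o flag writes the scraped items to the given file and infers the format from the extension:

    scrapy crawl wangyi -o douban.csv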
Step 4: Set the proxy in the middlewares file.
In the second class (the downloader middleware), the proxy IP and the User-Agent header are added inside the process_response() method; a variant that does this in process_request() is sketched after the listing.
from scrapy import signals
import random

class TestwangyiSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
class TestwangyiDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Set a random User-Agent header (the list must be filled in
        # with your own values before running)
        user_agent_list = []
        request.headers.setdefault('User-Agent', random.choice(user_agent_list))
        # Set a random proxy IP. The original assigned to request.bindaddress,
        # which has no effect; Scrapy reads the proxy from request.meta.
        # Note that changes made here only influence the request if it is
        # scheduled again (e.g. a retry).
        ipdaili = []  # fill in with proxies, e.g. 'http://host:port'
        request.meta['proxy'] = random.choice(ipdaili)
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
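Because process_response() runs after the request has already been sent, the usual place to rotate headers and proxies is process_request(). A minimal sketch of that placement; the middleware name and the two lists are placeholders you would fill in and register in DOWNLOADER_MIDDLEWARES yourself:

    import random

    class RandomHeaderProxyMiddleware(object):
        # Hypothetical middleware: rotate the User-Agent and proxy
        # before each request is sent.
        USER_AGENT_LIST = []  # your own User-Agent strings
        PROXY_LIST = []       # your own proxies, e.g. 'http://host:port'

        def process_request(self, request, spider):
            if self.USER_AGENT_LIST:
                request.headers['User-Agent'] = random.choice(self.USER_AGENT_LIST)
            if self.PROXY_LIST:
                request.meta['proxy'] = random.choice(self.PROXY_LIST)
            return None  # let the request continue through the chain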
Step 5: Store the scraped content in the pipelines file.
import openpyxl

class TestwangyiPipeline(object):
    wb = openpyxl.Workbook()
    ws = wb.active        # activate the default worksheet
    ws.append(['标题'])    # header row ("title")

    def process_item(self, item, spider):
        # append() expects a list, so wrap the scraped value in one
        line = [item["title"]]
        self.ws.append(line)
        self.wb.save("music.xlsx")
        # print(item)
        return item
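Saving inside process_item() rewrites music.xlsx once per row. A hypothetical variant (class name invented here) that uses the pipeline's open_spider/close_spider hooks to write the file only once, when the spider shuts down:

    import openpyxl

    class ExcelOncePipeline(object):
        def open_spider(self, spider):
            self.wb = openpyxl.Workbook()
            self.ws = self.wb.active
            self.ws.append(['标题'])  # header row

        def process_item(self, item, spider):
            self.ws.append([item['title']])
            return item

        def close_spider(self, spider):
            self.wb.save('music.xlsx')  # write the file once, at shutdown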
Step 6: Create a main file to run the whole crawl.
from scrapy import cmdline
cmdline.execute('scrapy crawl wangyi'.split())
2. Selenium
Installation
Pick the chromedriver version that matches your Chrome at http://chromedriver.storage.googleapis.com/index.html; after downloading, copy the executable into the browser's installation directory (or any directory on PATH). Then install the Python package:
pip install selenium
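As a quick smoke test, and for the case where chromedriver is not on PATH, Selenium 3's Chrome driver accepts an explicit executable_path (the path below is a made-up example):

    from selenium import webdriver

    driver = webdriver.Chrome(executable_path=r'D:\tools\chromedriver.exe')  # made-up path
    driver.get('https://www.baidu.com')
    print(driver.title)  # prints the page title if the driver is wired up correctly
    driver.quit()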
1. Open a web page
from selenium import webdriver
url = 'https://www.baidu.com'
driver = webdriver.Chrome()
driver.get(url)
2. Open a page and extract some information
Commonly used locator methods (each find_elements_* call returns a list of matching elements); a short iframe sketch follows this list:
driver = webdriver.Chrome()
driver.find_elements_by_class_name(name)
driver.find_elements_by_tag_name(name)
driver.find_elements_by_id(id_)
driver.switch_to.frame(id_or_name)  # switch into an iframe by its id or name
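The frame switch deserves a tiny sketch of its own, since elements inside an iframe cannot be located until you switch into it ('login_frame' is an invented frame id for illustration):

    driver.switch_to.frame('login_frame')   # enter the iframe by id or name
    # ...find_element calls here operate inside the frame...
    driver.switch_to.default_content()      # switch back to the main document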
from selenium import webdriver

url = "https://movie.douban.com/chart?qq-pf-to=pcqq.group"
driver = webdriver.Chrome()
# driver.maximize_window()
driver.get(url)
tables = driver.find_elements_by_tag_name("table")
tables.pop(0)  # drop the first table, which is not a movie entry
for i, v in enumerate(tables):
    # name = v.find_elements_by_class_name('pl2')[0].find_elements_by_tag_name("a")[0].text
    name = v.find_elements_by_class_name('pl2')[0].find_elements_by_class_name('pl')[0].text
    num = v.find_elements_by_class_name('pl2')[0].find_elements_by_class_name('pl')[1].text
    # the original re-read .pl[0] here, duplicating the name line;
    # Douban's rating appears to live in the 'rating_nums' class instead
    score = v.find_elements_by_class_name('rating_nums')[0].text
    print(name)
    print(score)
    print(num)
3. Open Baidu, locate the search box, and run a search
from selenium import webdriver

url = "http://www.baidu.com"
driver = webdriver.Chrome()
driver.maximize_window()  # maximize the window
driver.get(url)
element = driver.find_element_by_id('kw')  # the search input box
element.send_keys("python")
driver.find_element_by_id("su").click()    # the search button
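The click triggers an asynchronous page update, so reading results immediately can race against the page load. A sketch using Selenium's explicit waits; 'content_left' is assumed here to be the id of Baidu's result container, so verify it against the live page:

    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, 'content_left'))  # assumed result container id
    )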
4. Open a page, locate elements by tag, and drag one onto another with ActionChains
from selenium import webdriver
from selenium.webdriver import ActionChains

driver = webdriver.Chrome()
driver.get('http://www.runoob.com/python3/python3-file-methods.html')
element = driver.find_element_by_tag_name("title")  # note: <title> sits in <head> and is not visible, so the drag may raise
target = driver.find_element_by_id("footer")        # drop target at the bottom of the page
action_chains = ActionChains(driver)
action_chains.drag_and_drop(element, target).perform()
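If the drag raises because the source element is not visible, scrolling the drop target into view with a line of JavaScript is a simpler way to get the same effect (reusing the target element located above):

    driver.execute_script("arguments[0].scrollIntoView();", target)  # scroll to the footer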