Basic crawler example framework
General page fetching for sites with no anti-crawler restrictions
import requests

def http_get_text(url):  # url: the page URL to fetch
    try:
        r = requests.get(url)
        r.raise_for_status()  # raises an HTTPError if r.status_code is not 200
        r.encoding = r.apparent_encoding
        return r.text[:1000]
    except Exception:
        return 'Fetch failed'
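A minimal usage sketch (the URL below is just a placeholder; any publicly reachable page works):

# Hypothetical usage of the helper defined above.
print(http_get_text('https://www.example.com'))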
Amazon rejects requests that identify themselves as crawlers
Customizing the request headers makes the crawler request look like a browser request
# With no custom headers, the request identifies itself as e.g. {'User-Agent': 'python-requests/2.22.0'}
import requests

def http_get_text(url):  # url: the page URL to fetch
    try:
        kv = {'user-agent': 'Mozilla/5.0'}  # present the crawler as a browser
        r = requests.get(url, headers=kv)   # customized HTTP headers
        print(r.request.headers)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text[:1000]
    except Exception:
        return 'Fetch failed'
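For example (the product URL below is illustrative only; without the custom User-Agent such pages typically return an error page to crawlers):

# Hypothetical Amazon product page used only for illustration.
print(http_get_text('https://www.amazon.cn/gp/product/B01M8L5Z3Y'))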
Submitting a search keyword to Baidu
Baidu's keyword submission interface:
http://www.baidu.com/s?wd=keyword (keyword = the search term)
import requests

def http_search(keyword):  # keyword: the search term
    try:
        kv = {'wd': keyword}
        url = 'http://www.baidu.com/s?'
        r = requests.get(url, params=kv)
        r.raise_for_status()
        # r.request.url holds the final request URL, e.g. http://www.baidu.com/s?wd=...
        r.encoding = r.apparent_encoding
        return r.text[:1000]
    except Exception:
        return 'Search failed'
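A usage sketch (the keyword is arbitrary):

# Hypothetical usage: submit 'Python' as the search keyword.
print(http_search('Python'))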
Fetching and saving images from the web
import requests
import os

def get_image(image_url):  # image_url: URL of the image to download
    try:
        root = 'F:/PythonCrawler/images/'       # folder where images are saved
        path = root + image_url.split('/')[-1]  # file name taken from the URL
        if not os.path.exists(root):            # create the folder if it does not exist
            os.mkdir(root)
        if not os.path.exists(path):            # download only if not already saved
            r = requests.get(image_url)
            r.raise_for_status()
            with open(path, 'wb') as f:         # write the binary image data to the file
                f.write(r.content)
            print('Image saved successfully')
    except Exception:
        return 'Image fetch failed'

get_image('https://ss1.bdstatic.com/70cFvXSh_Q1YnxGkpoWK1HF6hhy/it/u=1997646403,2041715424&fm=26&gp=0.jpg')  # fetch a sample picture of a python
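A note on the directory handling: os.mkdir only creates the last path component and fails if a parent is missing. os.makedirs with exist_ok=True is a common, more robust alternative (a sketch, reusing the assumed save folder from above):

import os
root = 'F:/PythonCrawler/images/'  # assumed save folder, as above
os.makedirs(root, exist_ok=True)   # creates missing parents; no error if the folder already exists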
Automatic lookup of an IP address's registered location
import requests

def get_ip_search(ipaddress):  # ipaddress: the IP address to look up
    try:
        url = 'http://m.ip138.com/ip.asp?ip='
        r = requests.get(url + ipaddress)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text[-500:]
    except Exception:
        return 'Lookup failed'

print(get_ip_search('202.204.80.112'))
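The query string can also be built by requests itself, as in the Baidu example above (a sketch against the same assumed endpoint):

import requests
# Sketch: let requests encode the query string instead of concatenating it by hand.
r = requests.get('http://m.ip138.com/ip.asp', params={'ip': '202.204.80.112'})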
bs4 + requests: targeted scraping of specific HTML content
import bs4
import requests

def get_html_text(url):  # download the page's HTML
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception:
        return ''

def download_list(ulist, html):  # parse the HTML into the ulist list
    soup = bs4.BeautifulSoup(html, 'html.parser')
    for tr in soup.find('tbody').children:
        if isinstance(tr, bs4.element.Tag):
            tds = tr('td')  # equivalent to tr.find_all('td')
            ulist.append([tds[0].string, tds[1].string, tds[3].string])

def print_list(ulist, num):
    f = '{0:^5}\t{1:{3}^10}\t{2:^10}'  # format template
    print('2016 Best Chinese Universities Ranking (zuihaodaxue.com)')
    print(f.format('Rank', 'University', 'Score', chr(12288)))  # pad with the fullwidth (CJK) space
    for i in range(num):
        print(f.format(ulist[i][0], ulist[i][1], ulist[i][2], chr(12288)))

def main():
    ulist = []
    url = 'http://www.zuihaodaxue.com/zuihaodaxuepaiming2016.html'
    html = get_html_text(url)
    download_list(ulist, html)
    print_list(ulist, 242)

if __name__ == '__main__':
    main()
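The chr(12288) fill deserves a note: CJK characters render wider than ASCII, so centering Chinese university names with the ASCII space misaligns the columns. chr(12288) is the fullwidth (CJK) space U+3000, which matches the width of Chinese characters. A minimal illustration with made-up rows (the names and scores are illustrative only):

f = '{0:^5}\t{1:{3}^10}\t{2:^10}'
print(f.format(1, '清华大学', 95.9, chr(12288)))  # fullwidth-space padding keeps the column aligned
print(f.format(2, '北京大学', 82.6, chr(12288)))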
Using the re standard library
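A minimal sketch of the re call used below: re.findall returns every non-overlapping match of a pattern in a string, here a stock code such as sh600000 or sz000001 (the raw string r'...' avoids escaping backslashes):

import re
# Matches Shanghai (sh) / Shenzhen (sz) stock codes: 'sh' or 'sz' followed by six digits.
codes = re.findall(r'[s][hz]\d{6}', 'see /sh600000.html and /sz000001.html')
print(codes)  # ['sh600000', 'sz000001']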
bs4 + requests + re: scraping stock data from Baidu Stocks (gupiao.baidu.com)
import re
import requests
from bs4 import BeautifulSoup

def get_html_text(url):  # download the HTML at url
    try:
        r = requests.get(url)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception:
        return ''

def get_stock_list(lst, stock_url):
    html = get_html_text(stock_url)
    soup = BeautifulSoup(html, 'html.parser')  # HTML of the stock list page
    a = soup.find_all('a')  # collect all <a> tags
    for i in a:
        try:
            href = i.attrs['href']
            lst.append(re.findall(r'[s][hz]\d{6}', href)[0])  # extract the stock code with re and store it in lst
        except Exception:
            continue

def get_stock_info(lst, stock_url, fpath):
    for stock in lst:
        url = stock_url + stock + '.html'  # URL of this stock's page
        html = get_html_text(url)          # download the HTML
        try:
            if html == '':
                continue
            info_dict = {}
            soup = BeautifulSoup(html, 'html.parser')
            stockInfo = soup.find('div', attrs={'class': 'stock-bets'})  # the div holding the quote data
            name = stockInfo.find_all(attrs={'class': 'bets-name'})[0]   # the stock's name
            info_dict.update({'Stock name': name.text.split()[0]})
            keylist = stockInfo.find_all('dt')    # field names
            valuelist = stockInfo.find_all('dd')  # field values
            if len(keylist) == 0:
                continue
            for i in range(len(keylist)):
                key = keylist[i].text
                val = valuelist[i].text
                info_dict[key] = val  # record the field
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(info_dict) + '\n')  # append one record per line
        except Exception:
            continue

def main():
    stock_list_url = 'http://quote.eastmoney.com/stock_list.html'  # page listing the stock codes
    stock_info_url = 'https://gupiao.baidu.com/stock/'             # common URL prefix on Baidu Stocks
    output_file = 'F:/PythonCrawler/baidustock.txt'                # output file
    stock_list = []  # list of stock codes
    get_stock_list(stock_list, stock_list_url)               # collect the stock codes
    get_stock_info(stock_list, stock_info_url, output_file)  # download and save each stock's data

if __name__ == '__main__':
    main()
The Scrapy crawler framework
Steps:
- Create a Scrapy project
- Generate a Scrapy spider inside the project
- Configure the generated spider
- Run the spider (see the command after demo.py below)
Creating the Scrapy project
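The project is created from the command line. The project name python123demo is an assumption here, chosen to match the demo domain; any name works:

scrapy startproject python123demo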
Generating the spider
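From inside the project directory, genspider creates the spider skeleton; the arguments below (spider name demo, start domain python123.io) match the demo.py shown next:

cd python123demo
scrapy genspider demo python123.io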
Configuring the spider
demo.py, configured as follows:
import scrapy

class DemoSpider(scrapy.Spider):
    name = "demo"
    # allowed_domains = ["python123.io"]

    def start_requests(self):
        urls = [
            'https://python123.io/ws/demo.html'  # URL to crawl
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)  # issue the request
        # start_requests is a generator: each yield produces one request, and the
        # function is frozen until the framework wakes it up for the next one

    def parse(self, response):
        fname = response.url.split('/')[-1]  # derive the file name from the URL
        with open(fname, 'wb') as f:         # write the response body to the file
            f.write(response.body)
        self.log('Saved file %s.' % fname)
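Running the spider
The last step from the list above; run it from the project directory:

scrapy crawl demo

When the crawl finishes, demo.html has been saved to the current directory by the parse callback.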