版权声明:作者:小白 https://blog.csdn.net/weixin_43687366/article/details/88877996
本篇爬虫主要就是爬取某个网页的数据,并将爬取的数据保存下来!
我这里用的是pycharm软件,python版本是3.7.2,其他的版本没有测试过!
下面直接放代码,内部有部分代码的注释!然后再对每个部分进行分析!
# _*_ coding:utf-8 _*_
"""
作者:king of kasa
内容:爬虫-小练习
加油!keep running!
email:[email protected]
csdn:飞火流星!
date:2019/3/28 16:22
desc:
"""
"""
获取URL
1、python100 的url
2、练习的url
"""
from bs4 import BeautifulSoup
import requests
# Index page listing all 100 Python exercises
url = 'http://www.runoob.com/python/python-100-examples.html'
# Pretend to be a browser so the site does not reject the request
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3724.8 Safari/537.36'
}
# Fetch and decode the index page (timeout so a stalled connection cannot hang forever)
r = requests.get(url, headers=headers, timeout=10).content.decode('utf-8')
# print(r)
# Parse the HTML document (using 'lxml' raises errors here, so stick to 'html.parser')
soup = BeautifulSoup(r, 'html.parser')
# print(type(soup))
# Every exercise is linked from an <a> inside the <ul> of the #content element;
# collect each link's href (relative URL) — 100 entries in total.
# NOTE: renamed from `list` to avoid shadowing the builtin.
links = [a.attrs['href'] for a in soup.find(id='content').ul.find_all('a')]
# print(links)
"""
2. Request each exercise page via the collected links and extract its content.
"""
# Open the output file ONCE instead of re-opening it on every iteration.
with open('100-py.csv', 'a+', encoding='utf-8') as file:
    for path in links:
        # Per-exercise record (renamed from `dict` to avoid shadowing the builtin)
        entry = {}
        # Request the detail page
        page = requests.get('http://www.runoob.com' + path, headers=headers, timeout=10).content.decode('utf-8')
        # Parse the detail page
        page_soup = BeautifulSoup(page, 'html.parser')
        # Look up the #content container once and reuse it
        content = page_soup.find(id='content')
        # Title
        entry['title'] = content.h1.text
        # Problem statement (second <p>)
        entry['tm'] = content.find_all('p')[1].text
        # Program analysis (third <p>)
        entry['cxfx'] = content.find_all('p')[2].text
        # Source code: most pages keep it in an element with class "hl-main",
        # but a few store it in a bare <pre> — check explicitly instead of
        # relying on a broad try/except around the AttributeError.
        code_node = page_soup.find(class_="hl-main")
        if code_node is None:
            code_node = page_soup.find('pre')
        entry['code'] = code_node.text
        # Append the record to the output file
        file.write(entry['title'] + '\n')
        file.write(entry['tm'] + '\n')
        file.write(entry['cxfx'] + '\n')
        file.write(entry['code'] + '\n')
        file.write('*' * 50 + '\n')
        file.write('\n')
本次练习主要分为两步:
-
获取指定页面中每个练习的url,我这里爬取的是Python 100例的网页;
-
根据获取的每个练习的url,请求并解析对应页面,提取练习内容
第一、导入需要的模块
from bs4 import BeautifulSoup
import requests
之前的博客我已经说过了BeautifulSoup
关于requests的讲解,网上资源很多,我看了这篇requests讲解,大家有兴趣的话可以看一下!
第二、url(这个根据自己的情况定)
url = 'http://www.runoob.com/python/python-100-examples.html'
第三、
- 伪装成浏览器
- 发送请求
- 解析html文档
- 查找每个练习的a链接href属性获取对应的链接地址
#伪装成浏览器
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3724.8 Safari/537.36'
}
#发送请求
r = requests.get(url,headers=headers).content.decode('utf-8')
# print(r)
#解析html文档
soup = BeautifulSoup(r,'html.parser') #这里用lxml会出错
# print(type(soup))
#查找每个练习的a链接href属性获取对应的链接地址
re_a = soup.find(id='content').ul.find_all('a') #返回的是100个a标签的列表
#创建一个列表保存url
list = []
for i in re_a:
list.append(i.attrs['href'])
# print(list)
伪装成浏览器,这里怎么获取呢,看下图:
右击鼠标,选择检查之后,在network中找到对应的headers,直接复制过来就行了!
扫描二维码关注公众号,回复:
5899173 查看本文章
双击第一个文件
下面接着就是查找网页中每个练习的a的链接href属性获取对应的链接地址,红色方框里的程序就是想要获取的
同样的方法右击鼠标查看
可以看到对应的文本存放的位置
#查找每个练习的a链接href属性获取对应的链接地址 re_a = soup.find(id='content').ul.find_all('a') #返回的是100个a标签的列表
如下图:
然后再将获取的url保存下来
#创建一个列表保存url list = [] for i in re_a: list.append(i.attrs['href']) # print(list)
D:\python北风\venv\Scripts\python.exe D:/python北风/爬虫/案例1.py
['/python/python-exercise-example1.html', '/python/python-exercise-example2.html', '/python/python-exercise-example3.html', '/python/python-exercise-example4.html', '/python/python-exercise-example5.html', '/python/python-exercise-example6.html', '/python/python-exercise-example7.html', '/python/python-exercise-example8.html', '/python/python-exercise-example9.html', '/python/python-exercise-example10.html', '/python/python-exercise-example11.html', '/python/python-exercise-example12.html', '/python/python-exercise-example13.html', '/python/python-exercise-example14.html', '/python/python-exercise-example15.html', '/python/python-exercise-example16.html', '/python/python-exercise-example17.html', '/python/python-exercise-example18.html', '/python/python-exercise-example19.html', '/python/python-exercise-example20.html', '/python/python-exercise-example21.html', '/python/python-exercise-example22.html', '/python/python-exercise-example23.html', '/python/python-exercise-example24.html', '/python/python-exercise-example25.html', '/python/python-exercise-example26.html', '/python/python-exercise-example27.html', '/python/python-exercise-example28.html', '/python/python-exercise-example29.html', '/python/python-exercise-example30.html', '/python/python-exercise-example31.html', '/python/python-exercise-example32.html', '/python/python-exercise-example33.html', '/python/python-exercise-example34.html', '/python/python-exercise-example35.html', '/python/python-exercise-example36.html', '/python/python-exercise-example37.html', '/python/python-exercise-example38.html', '/python/python-exercise-example39.html', '/python/python-exercise-example40.html', '/python/python-exercise-example41.html', '/python/python-exercise-example42.html', '/python/python-exercise-example43.html', '/python/python-exercise-example44.html', '/python/python-exercise-example45.html', '/python/python-exercise-example46.html', '/python/python-exercise-example47.html', 
'/python/python-exercise-example48.html', '/python/python-exercise-example49.html', '/python/python-exercise-example50.html', '/python/python-exercise-example51.html', '/python/python-exercise-example52.html', '/python/python-exercise-example53.html', '/python/python-exercise-example54.html', '/python/python-exercise-example55.html', '/python/python-exercise-example56.html', '/python/python-exercise-example57.html', '/python/python-exercise-example58.html', '/python/python-exercise-example59.html', '/python/python-exercise-example60.html', '/python/python-exercise-example61.html', '/python/python-exercise-example62.html', '/python/python-exercise-example63.html', '/python/python-exercise-example64.html', '/python/python-exercise-example65.html', '/python/python-exercise-example66.html', '/python/python-exercise-example67.html', '/python/python-exercise-example68.html', '/python/python-exercise-example69.html', '/python/python-exercise-example70.html', '/python/python-exercise-example71.html', '/python/python-exercise-example72.html', '/python/python-exercise-example73.html', '/python/python-exercise-example74.html', '/python/python-exercise-example75.html', '/python/python-exercise-example76.html', '/python/python-exercise-example77.html', '/python/python-exercise-example78.html', '/python/python-exercise-example79.html', '/python/python-exercise-example80.html', '/python/python-exercise-example81.html', '/python/python-exercise-example82.html', '/python/python-exercise-example83.html', '/python/python-exercise-example84.html', '/python/python-exercise-example85.html', '/python/python-exercise-example86.html', '/python/python-exercise-example87.html', '/python/python-exercise-example88.html', '/python/python-exercise-example89.html', '/python/python-exercise-example90.html', '/python/python-exercise-example91.html', '/python/python-exercise-example92.html', '/python/python-exercise-example93.html', '/python/python-exercise-example94.html', 
'/python/python-exercise-example95.html', '/python/python-exercise-example96.html', '/python/python-exercise-example97.html', '/python/python-exercise-example98.html', '/python/python-exercise-example99.html', '/python/python-exercise-example100.html']
Process finished with exit code 0
2、根据获取的每个练习的链接地址来请求每个练习获得页面内容
#遍历列表100次 data = [] #用于保存爬取的程序 for x in list: dict = {} # 请求详细页面 test = requests.get('http://www.runoob.com' + x, headers=headers).content.decode('utf-8') # print(test) # 解析html文档 soup_test = BeautifulSoup(test, 'html.parser') # print(type(soup_test)) # 查找练习内容 # 查找标题 dict['title'] = soup_test.find(id='content').h1.text # 查找题目 dict['tm'] = soup_test.find(id='content').find_all('p')[1].text # print(title) # 查找程序分析 dict['cxfx'] = soup_test.find(id='content').find_all('p')[2].text # print(cxfx) # 程序源代码 #这里的异常处理是因为有一部分练习实例存放的位置不一样,然后进行异常处理,不同的情况不一样,根据自己的情况而定! try: dict['code'] = soup_test.find(class_="hl-main").text except Exception as e: dict['code'] = soup_test.find('pre').text # print(code) # print(dict) #保存文件1 # data.append(dict) # # import pandas as pd # datas = pd.DataFrame(data) # datas.to_csv('py-100.csv') #保存文件2 with open('100-py.csv','a+',encoding='utf-8') as file: file.write(dict['title']+'\n') file.write(dict['tm']+'\n') file.write(dict['cxfx']+'\n') file.write(dict['code']+'\n') file.write('*'*50+'\n') file.write('\n')
所有完成后直接运行程序,就会将数据保存在100-py.csv文件中,然后打开看一下: