Python 标准库中提供了 urllib、urllib2、httplib 等模块以供 HTTP 请求使用,但是它们的 API 太粗糙了。它们是为另一个时代、另一个互联网所创建的,需要巨量的工作(甚至包括各种方法覆盖)来完成最简单的任务。
Requests 是使用 Apache2 Licensed 许可证的、基于 Python 开发的 HTTP 库,其在 Python 内置模块的基础上进行了高度的封装,从而使得 Pythoner 进行网络请求时变得美好了许多。使用 Requests 可以轻而易举地完成浏览器可有的任何操作。
入门实例
import requests
from bs4 import BeautifulSoup

# 1. Download the Autohome news listing page.
ret = requests.get(url='https://www.autohome.com.cn/news/')
# Use the encoding detected from the response body so Chinese text
# decodes correctly instead of falling back to ISO-8859-1.
ret.encoding = ret.apparent_encoding

# 2. Parse the HTML and pull out the wanted content with BeautifulSoup.
soup = BeautifulSoup(ret.text, 'html.parser')  # 'lxml' also works if installed
div = soup.find(name='div', id='auto-channel-lazyload-article')
li_list = div.find_all(name='li')
for li in li_list:
    h3 = li.find(name='h3')
    # Some <li> entries (e.g. ads) carry no <h3>; skip them.
    if not h3:
        continue
    p = li.find(name='p')
    a = li.find('a')
    # Title + link, then the summary paragraph, then a separator line.
    print(h3.text, a.get('href'))
    print(p.text)
    print('=' * 25)
输出如下:
=========================
内外焕然一新 全新长安CS35谍照曝光 //www.autohome.com.cn/news/201805/917066.html#pvareaid=102624
[汽车之家 国内谍照] 日前,我们从汽车拍客阿睿的微博处获取到了一组长安全新CS35的路试谍照。根据此前信息来看,新车将会在今年下半年正式上市销售。...
=========================
涉及7款新车型 北汽幻速公布产品规划 //www.autohome.com.cn/news/201805/917062.html#pvareaid=102624
[汽车之家 新闻] 日前,我们从北汽幻速官方获悉,其未来将推出全新“X”系列车型,并透露了其未来在新能源车型的布局以及将在未来推出一款全新MPV车型...
=========================
下载图片
import requests
from bs4 import BeautifulSoup

# 1. Download the Autohome news listing page.
ret = requests.get(url='https://www.autohome.com.cn/news/')
ret.encoding = ret.apparent_encoding

# 2. Parse the page and download each news item's thumbnail image.
soup = BeautifulSoup(ret.text, 'html.parser')  # 'lxml' also works if installed
div = soup.find(name='div', id='auto-channel-lazyload-article')
li_list = div.find_all(name='li')
for li in li_list:
    h3 = li.find(name='h3')
    # Some <li> entries (e.g. ads) carry no <h3>; skip them.
    if not h3:
        continue
    img = li.find('img')
    # Guard against items with no thumbnail at all.
    if not img:
        continue
    # Image URL — protocol-relative, e.g. //www3.autoimg.cn/...__1.jpg
    src = img.get('src')
    # Local file name: the part after the last '__' separator.
    # Use [-1] so a src without '__' still yields a usable name
    # instead of raising IndexError.
    file_name = src.rsplit('__', maxsplit=1)[-1]
    # Fetch the image bytes over HTTPS.
    ret_img = requests.get(
        url="https:" + src
    )
    # Write the raw bytes to disk.
    with open(file_name, 'wb') as f:
        f.write(ret_img.content)
1、网站反爬虫
# A bare request with no headers: the site's WAF rejects it and
# returns its firewall page instead of the content.
r1 = requests.get(url='https://dig.chouti.com/all/hot/recent/1')
print(r1.text)
输出如下:会提示网站防火墙
<html xmlns="http://www.w3.org/1999/xhtml"><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>网站防火墙</title>
<style>
p {
line-height:20px;
}
2、伪造request请求
# Same request, but carrying a browser User-Agent so the WAF
# treats it as a normal browser and serves the real page.
browser_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
}
r1 = requests.get(
    url='https://dig.chouti.com/all/hot/recent/1',
    headers=browser_headers,
)
print(r1.text)
添加headers,就可以爬取网站数据了
3、伪造登陆
# Log in with phone number + password; 'oneMonth' asks the site to
# keep the session alive for a month.
login_form = {
    'phone': 'XXXXX',
    'password': "XXXXX",
    'oneMonth': '1',
}
login_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
}
response_login = requests.post(
    url='https://dig.chouti.com/login',
    data=login_form,
    headers=login_headers,
)
print(response_login.text)
输出如下:
{"result":{"code":"9999", "message":"", "data":{"complateReg":"0","destJid":"cdu_49803354421"}}}
4、错误的伪造点赞
# WRONG approach: log in, then vote using the cookie returned by the
# /login response itself. As the surrounding text notes, this does not
# succeed — see the later section for the working cookie flow.
browser_ua = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'

response_login = requests.post(
    url='https://dig.chouti.com/login',
    data={'phone': '顺丰到付', 'password': "十分到", 'oneMonth': '1'},
    headers={'User-Agent': browser_ua},
)
# Cookie from the login response — this is the one that turns out
# not to be authorized for voting.
r1_cookie_dict = response_login.cookies.get_dict()

ret = requests.post(
    url="https://dig.chouti.com/link/vote?linksId=19329006",
    headers={'User-Agent': browser_ua},
    cookies=r1_cookie_dict,
)
print(ret.text)
以上是错误的,不能进行点赞
5、伪造点赞调试
点赞操作,网站cookie
Cookie: gpsd=9f1955c3dc27771fd2c1400a4d210b7d; JSESSIONID=aaaaCRNkvJO6KdAwZ6fmw; route=340ad5ec7bdbaaaa2d4d12be04eae5d2; gpid=9ed52f3ca2b64e49996d64cc9c86827a; _pk_ref.1.a2d5=%5B%22%22%2C%22%22%2C1525683488%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DuR2E6enPPD46TEft2mcKxrSqHCpayNgKIXfIAAEQKQuiypGVXfAdhuXBWfHubt-K%26wd%3D%26eqid%3Db497a0aa00025657000000035af01516%22%5D; _pk_ses.1.a2d5=*; puid=cdu_49803354421; puid=cb55d2189f99a8d3a19c7441d09affe5; _pk_id.1.a2d5=d0e24c0126d32bf5.1525683488.1.1525687100.1525683488.
# Debugging the vote: replay it with cookie values copied from the
# browser session above. Sending just 'gpsd' and 'gpid' succeeds,
# so those are the cookies the vote endpoint actually checks.
debug_cookies = {
    'gpsd': '9f1955c3dc27771fd2c1400a4d210b7d',
    'gpid': '9ed52f3ca2b64e49996d64cc9c86827a',
}
ret = requests.post(
    url="https://dig.chouti.com/link/vote?linksId=19329006",
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    },
    cookies=debug_cookies,
)
print(ret.text)
输出如下:
{"result":{"code":"9999", "message":"推荐成功", "data":{"jid":"cdu_49803354421","likedTime":"1525687086616000","lvCount":"10","nick":"似懂非懂发","uvCount":"508","voteTime":"小于1分钟前"}}}
6、通过代码进行点赞
import requests
from bs4 import BeautifulSoup

CHROME_66_UA = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
CHROME_63_UA = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'

# 1. Visit the front page anonymously first. The cookie issued here is
#    the one the site later authorizes at login time.
r1 = requests.get(
    url='https://dig.chouti.com/all/hot/recent/1',
    headers={'User-Agent': CHROME_66_UA},
)
r1_cookie_dict = r1.cookies.get_dict()

# 2. Log in with phone + password, carrying the anonymous cookie so the
#    server upgrades (authorizes) it.
response_login = requests.post(
    url='https://dig.chouti.com/login',
    data={
        'phone': '顺丰顺丰的',
        'password': "随风倒十分",
        'oneMonth': '1',
    },
    headers={'User-Agent': CHROME_63_UA},
    cookies=r1_cookie_dict,
)

# 3. Vote with the now-authorized anonymous cookie.
ret = requests.post(
    url="https://dig.chouti.com/link/vote?linksId=19329006",
    headers={'User-Agent': CHROME_63_UA},
    cookies=r1_cookie_dict,
)
print(ret.text)
输出如下:
{"result":{"code":"9999", "message":"推荐成功", "data":{"jid":"cdu_49803354421","likedTime":"1525687678292000","lvCount":"14","nick":"水水水水水","uvCount":"508","voteTime":"小于1分钟前"}}}
7、批量点赞
import requests
from bs4 import BeautifulSoup

# 1. Visit the front page anonymously first; the cookie issued here is
#    the one the site later authorizes at login.
r1 = requests.get(
    url='https://dig.chouti.com/all/hot/recent/1',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
    }
)
r1_cookie_dict = r1.cookies.get_dict()

# 2. Log in with phone + password, carrying the anonymous cookie so the
#    server authorizes it for subsequent votes.
response_login = requests.post(
    url='https://dig.chouti.com/login',
    data={
        'phone': '算法大是大非',
        'password': "士大夫似的",
        'oneMonth': '1'
    },
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    },
    cookies=r1_cookie_dict
)

# 3. Walk the hot-news listing page(s) and up-vote every item found.
for page_num in range(1, 2):
    response_index = requests.get(
        url='https://dig.chouti.com/all/hot/recent/%s' % page_num,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
        }
    )
    soup = BeautifulSoup(response_index.text, 'html.parser')
    div = soup.find(attrs={'id': 'content-list'})
    items = div.find_all(attrs={'class': 'item'})
    for item in items:
        tag = item.find(attrs={'class': 'part2'})
        # Each news item exposes its ID in the 'share-linkid' attribute.
        nid = tag.get('share-linkid')
        # Up-vote this news item with the authorized cookie.
        r1 = requests.post(
            url='https://dig.chouti.com/link/vote?linksId=%s' % nid,
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
            },
            cookies=r1_cookie_dict
        )
        print(r1.text)
输出如下:
E:\python\python_sdk\python.exe E:/python/py_dev/python/day132/2.登录抽屉.py
{"result":{"code":"9999", "message":"推荐成功", "data":{"jid":"cdu_49803354421","likedTime":"1525688272054000","lvCount":"13","nick":"士大夫","uvCount":"509","voteTime":"小于1分钟前"}}}
{"result":{"code":"9999", "message":"推荐成功", "data":{"jid":"cdu_49803354421","likedTime":"1525688273598000","lvCount":"10","nick":"士大夫","uvCount":"510","voteTime":"小于1分钟前"}}}
{"result":{"code":"30010", "message":"你已经推荐过了", "data":""}}
{"result":{"code":"30010", "message":"你已经推荐过了", "data":""}}
{"result":{"code":"30010", "message":"你已经推荐过了", "data":""}}
{"result":{"code":"9999", "message":"推荐成功", "data":{"jid":"cdu_49803354421","likedTime":"1525688280154000","lvCount":"18","nick":"顺丰顺丰的","uvCount":"511","voteTime":"小于1分钟前"}}}
{"result":{"code":"9999", "message":"推荐成功", "data":{"jid":"cdu_49803354421","likedTime":"1525688282204000","lvCount":"48","nick":"所发生的","uvCount":"512","voteTime":"小于1分钟前"}}}
{"result":{"code":"30010", "message":"你已经推荐过了", "data":""}}
{"result":{"code":"30010", "message":"你已经推荐过了", "data":""}}
{"result":{"code":"30010", "message":"你已经推荐过了", "data":""}}
{"result":{"code":"30010", "message":"你已经推荐过了", "data":""}}
{"result":{"code":"30010", "message":"你已经推荐过了", "data":""}}
{"result":{"code":"30010", "message":"你已经推荐过了", "data":""}}
{"result":{"code":"30010", "message":"你已经推荐过了", "data":""}}
{"result":{"code":"30010", "message":"你已经推荐过了", "data":""}}
{"result":{"code":"30010", "message":"你已经推荐过了", "data":""}}
{"result":{"code":"30010", "message":"你已经推荐过了", "data":""}}
{"result":{"code":"30010", "message":"你已经推荐过了", "data":""}}
{"result":{"code":"30010", "message":"你已经推荐过了", "data":""}}
{"result":{"code":"30010", "message":"你已经推荐过了", "data":""}}
{"result":{"code":"30010", "message":"你已经推荐过了", "data":""}}
{"result":{"code":"30010", "message":"你已经推荐过了", "data":""}}
{"result":{"code":"30010", "message":"你已经推荐过了", "data":""}}
{"result":{"code":"30010", "message":"你已经推荐过了", "data":""}}
{"result":{"code":"30010", "message":"你已经推荐过了", "data":""}}
Process finished with exit code 0
8、取消点赞
只需要改成如下代码即可
# Cancel the votes cast above. NOTE(review): `r1_cookie_dict` comes from
# the earlier anonymous-GET + login steps; this snippet replaces only the
# voting loop of the previous example.
for page_num in range(1, 5):
    response_index = requests.get(
        url='https://dig.chouti.com/all/hot/recent/%s' % page_num,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
        }
    )
    soup = BeautifulSoup(response_index.text, 'html.parser')
    div = soup.find(attrs={'id': 'content-list'})
    items = div.find_all(attrs={'class': 'item'})
    for item in items:
        tag = item.find(attrs={'class': 'part2'})
        # News ID lives in the 'share-linkid' attribute.
        nid = tag.get('share-linkid')
        # Cancelling uses a different endpoint than voting: the ID is
        # sent in the POST body ('linksId') instead of the query string.
        r1 = requests.post(
            url='https://dig.chouti.com/vote/cancel/vote.do',
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
            },
            data={
                'linksId': nid,
            },
            cookies=r1_cookie_dict
        )
        print(r1.text)
模拟登陆github
import requests
from bs4 import BeautifulSoup

# Step 1: fetch the login page to harvest the CSRF token
# ('authenticity_token' hidden input) and the pre-login session cookie.
r1 = requests.get(
    url='https://github.com/login'
)
login_page = BeautifulSoup(r1.text, 'html.parser')
token_input = login_page.find(name='input', attrs={'name': 'authenticity_token'})
token = token_input.get('value')
print(token)
r1_cookie_dict = r1.cookies.get_dict()

# Step 2: POST the credentials plus the CSRF token back to /session,
# carrying the same cookie so GitHub ties the token to our session.
session_form = {
    'commit': 'Sign in',
    'utf8': '✓',
    'authenticity_token': token,
    'login': '[email protected]',
    'password': 'xxx*+',
}
session_headers = {
    'Host': 'github.com',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
}
r2 = requests.post(
    url='https://github.com/session',
    data=session_form,
    headers=session_headers,
    cookies=r1_cookie_dict,
)
print(r2.text)