1. Understanding the HTTP protocol
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author:love_cat
# HTTP: HyperText Transfer Protocol. Stateless, and the cornerstone of the internet.
# The two most important words: request and response
# client ---request---> server, server ---response---> client
# -------Request--------
# > GET / HTTP/1.1               start line: method path protocol
# > Host: www.bilibili.com       headers
# > User-Agent: curl/7.43.0      (request headers,
# > Accept: */*                   key: value pairs)
# ------Response--------
# < HTTP/1.1 200 OK              start line: status code and reason phrase
# < Server: nginx
# < Date: ...                    headers
# < Content-Type: text/html      key: value pairs
# < Transfer-Encoding: chunked
# -----MessageBody------
# The HTML, which the browser can render
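To make those lines concrete, here is a minimal sketch that speaks raw HTTP/1.1 over a plain socket and prints the start line and headers the server sends back. It assumes www.bilibili.com still answers plain HTTP on port 80; any HTTP host would do.

import socket

# Open a TCP connection and send a hand-written HTTP request.
sock = socket.create_connection(('www.bilibili.com', 80))
request = (
    'GET / HTTP/1.1\r\n'
    'Host: www.bilibili.com\r\n'
    'User-Agent: curl/7.43.0\r\n'
    'Accept: */*\r\n'
    'Connection: close\r\n'
    '\r\n'
)
sock.sendall(request.encode('ascii'))
raw = b''
while True:
    chunk = sock.recv(4096)
    if not chunk:
        break
    raw += chunk
sock.close()
# Everything before the first blank line is the start line plus the headers.
print(raw.split(b'\r\n\r\n')[0].decode('iso-8859-1'))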
2. A simple program
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author:love_cat
import requests

def requests_get():
    response = requests.get('http://www.bilibili.com')
    # If the text comes back garbled, add: response.encoding = response.apparent_encoding
    print(response.text)
# ----- Run the function; output: -----
# <!DOCTYPE html><html lang="zh-Hans"><head><meta charset="utf-8"><title>哔哩哔哩 (゜-゜)つロ 干杯~-·······

def requests_get_para():
    param = {'para1': 'video', 'para2': 'av19420709'}
    response = requests.get('http://www.bilibili.com', params=param)
    print('>>>response headers:')
    print(response.headers)
    print('>>>status code:')
    print(response.status_code)
    print(response.reason)
# ----- Run the function; output: -----
# >>>response headers:
# {'Date': 'Wed, 16 May 2018 13:33:00 GMT', 'Content-Type': 'text/html; charset=utf-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'gear': '1', 'vikingrCache': '60000', 'Vikingr-Cache-TTL': '54949', 'Vary': 'Origin,Accept-Encoding', 'Content-Encoding': 'gzip', 'Expires': 'Wed, 16 May 2018 13:33:30 GMT', 'Cache-Control': 'max-age=30', 'X-Cache': 'HIT from ks-bj6-webcdn-03.hdslb.com'}
# >>>status code:
# 200
# OK
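To see where those params actually end up, print the final URL requests built. The sketch below uses https://httpbin.org/get, a public echo service chosen here purely for convenience:

import requests

# requests URL-encodes the params dict into the query string.
response = requests.get('https://httpbin.org/get',
                        params={'para1': 'video', 'para2': 'av19420709'})
print(response.url)  # https://httpbin.org/get?para1=video&para2=av19420709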
3. Request methods
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author:love_cat
# GET: retrieve a resource
# POST: create a resource
# PUT: update a resource
# DELETE: delete a resource
# HEAD: fetch only the response headers
# OPTIONS: ask which request methods are available
import requests

def get():
    response = requests.get('http://www.bilibili.com')
    print(response.request.headers)
    print(response.request.body)
    print(response.url)
# ---- Run the function; output: ----
# {'User-Agent': 'python-requests/2.18.4', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}
# None
# https://www.bilibili.com/
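HEAD and OPTIONS are listed above but never demonstrated. A minimal sketch, again assuming httpbin.org as a convenient test endpoint:

import requests

# HEAD returns the headers only; the body is empty.
res = requests.head('https://httpbin.org/get')
print(res.headers.get('Content-Type'))
print(res.text)  # empty string: no message body

# OPTIONS: httpbin reports the permitted methods in the Allow header.
res = requests.options('https://httpbin.org/get')
print(res.headers.get('Allow'))  # e.g. 'HEAD, OPTIONS, GET'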
def post():
    url = 'http://www.bilibili.com'
    response = requests.post(url,
                             data={'k1': 'v1', 'k2': 'v2'})
    print(response.request.headers)
    print(response.request.body)
    print(response.status_code)
    print(response.text)
# ---- Run the function; output: ----
# {'User-Agent': 'python-requests/2.18.4', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}
# None
# 200
# <!DOCTYPE html><html lang="zh-Hans"><head><meta charset="utf-8"><title>哔哩哔哩 (゜-゜)つロ 干杯~-bilibili</title><······
# Note: the body prints as None because the http URL redirects to https and requests
# re-issues the redirected request as a GET without a body; response.request is that final request.

def post_json():
    import json
    url = 'http://www.bilibili.com'
    d = json.dumps({'k1': 'v1', 'k2': 'v2'})
    response = requests.post(url,
                             data=d)
    print(response.request.headers)
    print(response.request.body)
    print(response.status_code)
    print(response.text)
# ---- Run the function; output: ----
# {'User-Agent': 'python-requests/2.18.4', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}
# None
# 200
# <!DOCTYPE html><html lang="zh-Hans"><head><meta charset="utf-8"><title>哔哩哔哩 (゜-゜)つロ 干杯~-bili
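When you actually want to send JSON, requests can do the serialization itself and set the Content-Type header for you via the json= parameter. A sketch against https://httpbin.org/post, an assumed echo endpoint:

import requests

# json= serializes the dict and sets Content-Type: application/json automatically.
response = requests.post('https://httpbin.org/post',
                         json={'k1': 'v1', 'k2': 'v2'})
print(response.request.headers['Content-Type'])  # application/json
print(response.request.body)                     # the serialized JSON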
4. Handling request exceptions
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author:love_cat
import requests
from requests import exceptions

def timeout_request():
    url = 'http://www.google.com'  # unreachable without a VPN, so this times out
    try:
        # timeout may be a tuple (connect timeout, read timeout): wait at most
        # 5 seconds to establish the connection, then at most 7 seconds for data.
        # It may also be a single number, which is used for both timeouts.
        response = requests.get(url, timeout=(5, 7))
    except exceptions.Timeout as e:
        print(e)
    else:
        print('request succeeded')
# ---- Run the function; output: ----
# HTTPConnectionPool(host='www.google.com', port=80): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x000000000AF46128>, 'Connection to www.google.com timed out. (connect timeout=5)'))

def timeout_request_ok():
    url = 'http://www.bilibili.com'  # bilibili responds well within 5 seconds, so no timeout
    try:
        response = requests.get(url, timeout=(5, 7))
    except exceptions.Timeout as e:
        print(e)
    else:
        print('request succeeded')
# ---- Run the function; output: ----
# request succeeded
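Timeout is only one of several exceptions requests can raise. A hedged sketch of a more defensive pattern, using exception classes that all exist in requests.exceptions:

import requests
from requests import exceptions

def safe_get(url):
    try:
        response = requests.get(url, timeout=(5, 7))
        response.raise_for_status()  # turn 4xx/5xx status codes into HTTPError
    except exceptions.Timeout:
        print('timed out')
    except exceptions.ConnectionError:
        print('could not connect')
    except exceptions.HTTPError as e:
        print('bad status:', e)
    else:
        return response

safe_get('https://httpbin.org/status/404')  # prints: bad status: 404 ...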
5. Customizing requests with Session
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author:love_cat
import requests

def session_requests():
    session = requests.Session()
    res = session.get('http://www.bilibili.com')
    print(session.cookies)
    print(res.text)
# ---- Run the function; output: ----
# <RequestsCookieJar[]>
# <!DOCTYPE html><html lang="zh-Hans"><head><meta charset="utf-8"><title>哔哩哔哩 (゜-゜)つロ 干杯~-bilibili······
# Simulating a login with requests generally takes three steps (see the sketch below):
# 1. session.get(url): visit the site's main page first to pick up its cookies; going straight to the login endpoint makes it obvious you are automating the login
# 2. session.post(): submit the username, password, and whatever other fields the login form (form_data) expects
# 3. session.post(): perform actions as the logged-in user
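A minimal sketch of that three-step flow. The URL and form field names here are hypothetical placeholders; a real site will have its own fields (and often a CSRF token):

import requests

session = requests.Session()

# Step 1: hit the home page so the session picks up the site's initial cookies.
session.get('http://example.com/')

# Step 2: submit the login form; the field names are made up for illustration.
session.post('http://example.com/login',
             data={'username': 'xxx', 'password': 'xxx'})

# Step 3: later requests through the session carry its cookies automatically.
profile = session.get('http://example.com/profile')
print(profile.status_code)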
6. Basic response API
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author:love_cat
import requests
# Status codes:
# 2xx: success
# 3xx: redirection
# 4xx: client error (bad request, resource not found, and so on)
# 5xx: server error
res = requests.get('http://api.github.com')
print(res.status_code)
print(res.reason)
print(res.headers)
print(res.history)  # http is automatically redirected to https
# ---- Run the program; output: ----
# 200
# OK
# {'Date': 'Wed, 16 May 2018 14:40:51 GMT', 'Content-Type': 'application/json; charset=utf-8', 'Transfer-Encoding': 'chunked', 'Server': 'GitHub.com', 'Status': '200 OK', 'X-RateLimit-Limit': '60', 'X-RateLimit-Remaining': '59', 'X-RateLimit-Reset': '1526485251', 'Cache-Control': 'public, max-age=60, s-maxage=60', 'Vary': 'Accept, Accept-Encoding', 'ETag': 'W/"7dc470913f1fe9bb6c7355b50a0737bc"', 'X-GitHub-Media-Type': 'github.v3; format=json', 'Access-Control-Expose-Headers': 'ETag, Link, Retry-After, X-GitHub-OTP, X-RateLimit-Limit, X-RateLimit-Remaining, X-RateLimit-Reset, X-OAuth-Scopes, X-Accepted-OAuth-Scopes, X-Poll-Interval', 'Access-Control-Allow-Origin': '*', 'Strict-Transport-Security': 'max-age=31536000; includeSubdomains; preload', 'X-Frame-Options': 'deny', 'X-Content-Type-Options': 'nosniff', 'X-XSS-Protection': '1; mode=block', 'Referrer-Policy': 'origin-when-cross-origin, strict-origin-when-cross-origin', 'Content-Security-Policy': "default-src 'none'", 'X-Runtime-rack': '0.010773', 'Content-Encoding': 'gzip', 'X-GitHub-Request-Id': '0579:6FCC:B556E:EE3C3:5AFC42F2'}
# [<Response [301]>]
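res.history holds the intermediate redirect responses. To inspect the redirect yourself, you can tell requests not to follow it; a sketch against the same endpoint (the exact Location value is an assumption):

import requests

# With allow_redirects=False the 301 itself comes back as the response.
res = requests.get('http://api.github.com', allow_redirects=False)
print(res.status_code)          # 301
print(res.headers['Location'])  # e.g. https://api.github.com/
print(res.history)              # [] since nothing was followed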
7. Downloading images
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author:love_cat
import requests
from bs4 import BeautifulSoup
import uuid

def get_pic(url):
    # You would normally forge the request headers here: requests.get(url, headers=...)
    # (see the sketch after the output below)
    response = requests.get(url)
    text = response.text
    soup = BeautifulSoup(text, features='html.parser')
    all_img = soup.find_all('img', attrs={'class': 'BDE_Image'})
    for img in all_img:
        pic_url = img.attrs.get('src')  # the image's URL
        res = requests.get(pic_url)     # fetch the image itself
        content = res.content           # the image's binary content
        file_name = str(uuid.uuid4()) + '.jpg'
        with open(file_name, 'wb') as f:
            f.write(content)            # write the image to disk
        print(f'{file_name} written successfully')

get_pic('https://tieba.baidu.com/p/5524106374?fr=ala0&pstaala=3&tpl=5&fid=2836434&red_tag=2727536315')
# ---- Run the program; output: ----
# 8488cddd-9475-4a72-b2f2-deb8c4edc18b.jpg written successfully
# b1db5189-33bb-48a3-800e-3e9d15784669.jpg written successfully
# b5d17542-ed73-44f7-91de-aa59159e34c8.jpg written successfully
# 50267154-b856-46bc-a50c-c0fb5528aaa9.jpg written successfully
# bd6dec6f-1d71-4204-9c44-29143d79bf4e.jpg written successfully
# 7fa7c6b4-ee51-43b8-8efc-06b02f4ca0e1.jpg written successfully
# 2cd0bf56-9f81-41c1-9cde-74a372f8069b.jpg written successfully
# d01fe550-a790-4315-8b96-15e2e27f1901.jpg written successfully
# 6139a78a-f4bb-4b4e-9d88-47b0b1653969.jpg written successfully
# 6a2d09a2-037f-4043-86c3-66ff139db2d2.jpg written successfully
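The headers forgery mentioned above usually just means supplying a browser-like User-Agent and a Referer; in practice you copy the values from your own browser. A sketch:

import requests

# Pretend to be a regular browser; many sites block the default
# 'python-requests/x.y.z' User-Agent.
headers = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/66.0.3359.139 Safari/537.36'),
    'Referer': 'https://tieba.baidu.com/',
}
response = requests.get('https://tieba.baidu.com/p/5524106374', headers=headers)
print(response.status_code)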
8. Event hooks
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author:love_cat
import requests

def callback(response, *args, **kwargs):
    print(response.headers)

def get():
    url = 'http://www.bilibili.com'
    # Register a callback: the key in hooks is 'response'. When a request completes,
    # requests calls callback and passes it the response object. Here it fires twice:
    # once for the redirect to https, once for the final page.
    requests.get(url, hooks={'response': callback})
# ---- Run the function; output: ----
# {'Server': 'Tengine', 'Date': 'Wed, 16 May 2018 16:03:47 GMT', 'Content-Type': 'text/html', 'Content-Length': '278', 'Connection': 'keep-alive', 'Location': 'https://www.bilibili.com/'}
# {'Date': 'Wed, 16 May 2018 16:03:47 GMT', 'Content-Type': 'text/html; charset=utf-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'gear': '1', 'vikingrCache': '60000', 'Vikingr-Cache-TTL': '43563', 'Vary': 'Origin,Accept-Encoding', 'Content-Encoding': 'gzip', 'Expires': 'Wed, 16 May 2018 16:04:17 GMT', 'Cache-Control': 'max-age=30', 'X-Cache': 'HIT from cn-sdqd-cu-w-03.hdslb.com'}
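Hooks can also be attached to a Session, so every request made through it triggers the callback. Session.hooks is part of the requests API; a sketch:

import requests

def callback(response, *args, **kwargs):
    print(response.status_code, response.url)

session = requests.Session()
# session.hooks['response'] is a list of callbacks applied to every request.
session.hooks['response'].append(callback)
session.get('http://www.bilibili.com')
session.get('http://api.github.com')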
9. Logging in to Jobbole (伯乐在线) automatically
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author:love_cat
import requests

post_dict = {
    'action': 'user_login',
    'user_login': 'xxx',
    'user_pass': 'xxx',
    'remember_me': 1,
    'redirect_url': 'http://www.jobbole.com/'
}
response = requests.post(
    url='http://www.jobbole.com/wp-admin/admin-ajax.php',
    data=post_dict,
)
print(response.text)
cookie_dict = response.cookies.get_dict()  # the cookies returned after a successful login (our login failed, so it is empty)
print(cookie_dict)
# ----- Run the program; output: -----
# {"jb_result":-1,"jb_msg":"\u7528\u6237\u540d\u6216\u5bc6\u7801\u9519\u8bef"}
# {}
# (jb_msg above decodes to 用户名或密码错误: wrong username or password)
requests.get(url='http://www.jobbole.com/',
             cookies=cookie_dict)  # if this GET carries the cookies from a successful login, the returned page contains your account info
10. Summary of the requests module
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author:love_cat
# -- Parameters
# -- url: the address to request
# -- params: parameters passed in the URL (GET)
#    requests.request(
#        method='get',
#        url='http://www.bilibili.com',
#        params={'k1': 'v1', 'k2': 'v2'}
#    )
#    # http://www.bilibili.com?k1=v1&k2=v2
# -- data: data passed in the request body; may be a dict, bytes, or a file object
#    requests.request(
#        method='post',
#        url='http://www.bilibili.com',
#        params={'k1': 'v1', 'k2': 'v2'},
#        # data={'user': 'satori', 'password': '123'}
#        data="user=satori&password=123"  # a dict is essentially converted to this
#    )
#    # requests builds the request headers and body automatically:
#    # header: content-type: application/x-www-form-urlencoded
#    # body:   user=satori&password=123
# -- json: data passed in the request body as JSON
#    requests.request(
#        method='post',
#        url='http://www.bilibili.com',
#        params={'k1': 'v1', 'k2': 'v2'},
#        json={'user': 'satori', 'password': '123'}
#    )  # the body becomes the string '{"user": "satori", "password": "123"}'
#    # requests builds the request headers and body automatically:
#    # header: content-type: application/json
#    # body:   '{"user": "satori", "password": "123"}'
#
# data and json can usually both do the job, but data cannot carry nested dicts while json can
# -- headers: the request headers
#    requests.request(
#        method='post',
#        url='http://www.bilibili.com',
#        params={'k1': 'v1', 'k2': 'v2'},
#        json={'user': 'satori', 'password': '123'},
#        headers={'Referer': 'http://www.bilibili.com',
#                 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'}
#    )
#    # Referer tells the site which page you came from. Without it, the site sees that you
#    # never visited any page before logging in and may flag the request as automated.
#    # User-Agent tells the site you are a browser; you can forge any browser's string.
# -- cookies: send your own cookies
#    requests.get(url='http://www.jobbole.com/',
#                 cookies=cookie_dict)
# -- timeout: the timeout
#    A tuple gives the connect timeout and the read timeout separately;
#    a single number is used for both.
# -- allow_redirects: whether to follow redirects
# -- proxies: proxies
#    requests.post(url='http://www.jobbole.com/',
#                  proxies={'http': 'http://1.1.1.1:8000'}
#    )
#    # The request is not sent to the url directly: it goes to the proxy, which forwards it for us
# -- stream: streaming
#    Normally a downloaded file is read entirely into memory and then written to disk.
#    If the file is larger than memory, stream mode reads and writes it piece by piece
#    (see the sketch after this summary).
# -- files: upload files
# -- verify: whether to verify the server's SSL certificate (verify=False skips verification)
# -- session: keep the client's history across requests
#    session = requests.Session()
#    session.get(url)        # visit any page first to obtain cookies
#    session.post(url, data) # log in, carrying the cookies from the previous request
#    session.post(url, data) # perform actions
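A minimal sketch of the stream mode described above. The URL is an arbitrary placeholder; stream= and iter_content are standard requests features:

import requests

# stream=True defers downloading the body; iter_content then yields it in
# chunks, so the whole file never has to fit in memory at once.
url = 'http://example.com/big_file.zip'  # placeholder URL
with requests.get(url, stream=True) as response:
    with open('big_file.zip', 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:
                f.write(chunk)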