Sending a basic GET request
import requests
r = requests.get(url)
r.content.decode()   # decode the raw bytes yourself; pass an encoding such as "gbk" if the page is not utf-8
or
r.text               # a property, not a method: the body decoded with the encoding requests guesses
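A minimal sketch of the difference between the two, using httpbin.org (the same test service used in the proxy example below) as a stand-in target; the explicit-decode form is mainly useful when requests guesses the charset wrong:
import requests

r = requests.get("http://httpbin.org/encoding/utf8")
print(r.status_code)                    # 200 if the request succeeded
print(r.encoding)                       # the encoding requests guessed from the response headers
print(r.text[:60])                      # str decoded with that guess
print(r.content.decode("utf-8")[:60])   # raw bytes decoded with an encoding you choose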
Sending a request with headers
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"}
r = requests.get(url, headers=headers)
Sending a request with params
p = {"wd": "python"}
r = requests.get(url, params=p)
or
url = "https://www.baidu.com/s?wd={}".format("python")
r = requests.get(url)
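Both forms produce the same request; a quick way to confirm this is to print the final URL that requests built. A small sketch (the Baidu search URL is just the running example, and the shortened User-Agent is a placeholder):
import requests

headers = {"User-Agent": "Mozilla/5.0"}   # placeholder UA; any real browser UA works
p = {"wd": "python"}
r = requests.get("https://www.baidu.com/s", params=p, headers=headers)
print(r.url)   # https://www.baidu.com/s?wd=python — the params were URL-encoded and appended for you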
Sending a POST request
url = "URL of the POST API"
data = {                     # form data sent in the request body
    "query": "人生",
    "from": "zh",
    "to": "en",
    "token": "3382b43f5bd30a8207f823d122f13b36",
    "sign": "548627.834594"
}
headers = {
    "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1"}
r = requests.post(url, data=data, headers=headers)
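For a self-contained test of the same pattern, httpbin.org/post echoes back whatever form data it receives as JSON, so you can see exactly what the server got. A sketch for illustration only; the field names are just sample data:
import requests

url = "http://httpbin.org/post"
data = {"query": "人生", "from": "zh", "to": "en"}
r = requests.post(url, data=data)
print(r.json()["form"])   # the form fields the server received, e.g. {'from': 'zh', 'query': '人生', 'to': 'en'}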
Using proxies
Why do crawlers use proxies?
To make the server think the requests are not all coming from the same client
To keep our real IP address from being exposed
import requests
url = "http://httpbin.org/ip"
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"}
proxies = {"http": "http://119.3.37.101:8058"}
r = requests.get(url, headers=headers, proxies=proxies)
print(r.text)
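Free proxies fail often, so in practice it helps to set a timeout and catch connection errors. A minimal sketch (the proxy address here is only a placeholder):
import requests

proxies = {"http": "http://1.2.3.4:8080"}   # placeholder proxy address
try:
    r = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=5)
    print(r.text)
except requests.exceptions.RequestException as e:   # covers ProxyError, ConnectTimeout, etc.
    print("proxy failed:", e)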
Using a session to keep the login state
import requests
url = "登陆界面的url"
data = {"username": "", "password": ""}
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"}
session = requests.Session()                    # a Session keeps cookies across requests
session.post(url, headers=headers, data=data)   # log in; the session stores the login cookies
r = session.get("URL of a page that is only reachable after logging in")   # later requests carry those cookies automatically
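This works because the Session object stores the cookies set by the login response and sends them with every later request. httpbin.org makes this visible without a real login; a small sketch where the cookie name and value are arbitrary:
import requests

session = requests.Session()
session.get("http://httpbin.org/cookies/set/logged_in/yes")   # server sets a cookie; the session keeps it
r = session.get("http://httpbin.org/cookies")                  # a later request automatically sends it back
print(r.json())   # {'cookies': {'logged_in': 'yes'}}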