import requests

url = 'https://www.taobao.com/'

def show_self_type(tx):  # print the value passed in and its type
    print(tx,type(tx))
# 1. Basic usage of requests:

response = requests.get(url)
print('-----------------Page source------------------')
show_self_type(response.text)  # <class 'str'>
print('-----------------Status code------------------')
show_self_type(response.status_code)
print('-----------------Cookies------------------')
show_self_type(response.cookies)
# 2. Passing query parameters with requests
data = {
    'name': 'germet',
    'age': 22
}
response = requests.get('http://httpbin.org/get', params=data)  # params= appends ?name=...&age=... to the URL; data= is for request bodies
print(response.text)
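# For comparison, a minimal sketch of sending the same dict as a form body
# with POST instead of query parameters (httpbin echoes it under "form"):
response = requests.post('http://httpbin.org/post', data=data)
print(response.text)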
# 3. Adding headers (the original also set "host": "image.baidu.com", which
# mismatches the taobao URL and would break the request, so it is dropped here)
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N)'
}
response = requests.get(url=url, headers=headers)
print(response.text)
# 4. Parsing a JSON response
response = requests.get(url='http://httpbin.org/get')  # httpbin returns a JSON body, so .json() parses it into a dict
show_self_type(response.json())
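# A small sketch of using the parsed dict; httpbin's /get response
# contains keys such as 'url', 'args' and 'headers':
parsed = response.json()
print(parsed['url'])
print(parsed['headers'])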

# 5. Saving an image with requests
response = requests.get(url='http://gw.alicdn.com/tps/i3/TB1yeWeIFXXXXX5XFXXuAZJYXXX-210-210.png_50x50.jpg')
with open('taobao.jpg', 'wb') as f:  # 'wb' = write binary (not append; 'ab' would append)
    f.write(response.content)
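# For large files, a streamed download avoids loading the whole body into
# memory; a minimal sketch using stream=True and iter_content (the output
# filename 'taobao_stream.jpg' is arbitrary):
response = requests.get('http://gw.alicdn.com/tps/i3/TB1yeWeIFXXXXX5XFXXuAZJYXXX-210-210.png_50x50.jpg', stream=True)
with open('taobao_stream.jpg', 'wb') as f:
    for chunk in response.iter_content(chunk_size=1024):
        f.write(chunk)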


# 6. Requests with different HTTP methods (pointed at httpbin, which exposes a
# matching endpoint for each method; taobao.com has no /post, /put, ... paths)
r1 = requests.get('http://httpbin.org/get')
r2 = requests.post('http://httpbin.org/post')
r3 = requests.put('http://httpbin.org/put')
r4 = requests.delete('http://httpbin.org/delete')
r5 = requests.head('http://httpbin.org/get')
r6 = requests.options('http://httpbin.org/get')
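# Each helper above is shorthand for requests.request(method, url); a sketch:
r = requests.request('GET', 'http://httpbin.org/get')
print(r.status_code)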
# 7. File upload (multipart/form-data; must be sent with POST, not GET)
files = {'file': open('taobao.jpg', 'rb')}
response = requests.post('https://httpbin.org/post', files=files)
print(response.text)
# 8. Persisting cookies across requests with a Session
s=requests.Session()
s.get('http://httpbin.org/cookies/set/number/123')
response=s.get('http://httpbin.org/cookies')
print(response.text)
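# Cookies can also be attached to a single request via the cookies= parameter;
# a minimal sketch (httpbin's /cookies endpoint echoes them back):
response = requests.get('http://httpbin.org/cookies', cookies={'number': '123'})
print(response.text)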
# 9. Certificate verification (verify=False skips SSL certificate checks)
yurl = 'https://www.12306.cn'
response = requests.get(yurl, verify=False)
print(response.status_code)
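# verify=False triggers an InsecureRequestWarning on every request; it can be
# silenced via urllib3, which requests depends on (a minimal sketch):
import urllib3
urllib3.disable_warnings()
response = requests.get(yurl, verify=False)
print(response.status_code)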
# 10. Proxy settings (these assume a proxy is actually listening on 127.0.0.1:9743)
proxy = {
    'http': 'http://127.0.0.1:9743',
    'https': 'https://127.0.0.1:9743'
}
response = requests.get('https://www.taobao.com', proxies=proxy)  # unauthenticated proxy
print(response.status_code)
# Proxy with username and password
proxy = {
    'http': 'http://user:[email protected]:9743'
}
response = requests.get('https://www.taobao.com', proxies=proxy)
# SOCKS proxy (requires: pip install requests[socks])
proxy = {
    'http': 'socks5://127.0.0.1:9743',
    'https': 'socks5://127.0.0.1:9743'
}
# 11. Timeout settings
from requests.exceptions import ReadTimeout
try:
    response = requests.get(url, timeout=0.2)
    print(response.status_code)  # inside the try, so it is skipped if the request timed out
except ReadTimeout:
    print('Timeout')
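# timeout also accepts a (connect, read) tuple to limit the two phases
# separately; a minimal sketch:
response = requests.get(url, timeout=(3, 7))  # 3s to connect, 7s to read
print(response.status_code)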
# 12. HTTP Basic authentication
from requests.auth import HTTPBasicAuth
r=requests.get('http://120.27.32.24:9001',auth=HTTPBasicAuth('user','123'))
print(r.status_code)
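# requests also accepts a plain (user, password) tuple as shorthand for
# HTTPBasicAuth; a sketch against the same host:
r = requests.get('http://120.27.32.24:9001', auth=('user', '123'))
print(r.status_code)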

# 13. Exception handling (catch specific exceptions before the RequestException
# base class, otherwise the more specific branches are unreachable)
from requests.exceptions import ReadTimeout, HTTPError, RequestException
try:
    response = requests.get(url, timeout=0.5)
    print(response.status_code)
except ReadTimeout:
    print('TimeOut')
except HTTPError:
    print('HTTPError')
except RequestException:
    print('RequestException')
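# HTTPError is only raised on demand: response.raise_for_status() raises it
# for 4xx/5xx status codes. A minimal sketch using httpbin's /status endpoint:
try:
    response = requests.get('http://httpbin.org/status/404')
    response.raise_for_status()
except HTTPError as e:
    print('HTTPError:', e)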
# 14. Status-code check:
response=requests.get(url)
if response.status_code==requests.codes.ok:
    print('OK')

Most of this just needs to be memorized rather than deeply understood. I still don't really know how to get around anti-scraping measures; all I can do so far is add headers and request data, and I'm not sure whether setting a proxy requires a VPN. Next up is a summary of bs4 and pyquery, and after that the Scrapy framework and distributed crawlers.
