PycharmProject下载:https://download.csdn.net/download/lly1122334/10419435
urllib是Python3的内置库,提供了一系列操作URL的功能
1_1 urllib 三大模块
# coding=utf-8
import socket
import urllib.request
import urllib.error
import urllib.parse
'''
urllib is Python's built-in HTTP library:
urllib.request     -- issuing requests
urllib.error       -- exception handling
urllib.parse       -- URL parsing/encoding
urllib.robotparser -- robots.txt parsing (rarely used)
'''
# urllib.request: issue a plain GET request
response = urllib.request.urlopen("http://www.baidu.com")
#print(response.read().decode("utf-8"))

# urllib.parse: url-encode a form dict; the body must be bytes for a POST.
# httpbin.org is a public service for testing HTTP requests.
data = bytes(urllib.parse.urlencode({'word': 'hello'}), encoding='utf8')
response = urllib.request.urlopen("http://httpbin.org/post", data=data)
#print(response.read())

# urllib.error: demonstrate timeout handling
response = urllib.request.urlopen("http://httpbin.org/get", timeout=1)  # timeout=1s: normally long enough to succeed
#print(response.read())
try:
    # timeout=0.1s: normally too short, so urlopen raises URLError
    response = urllib.request.urlopen("http://httpbin.org/get", timeout=0.1)
except urllib.error.URLError as e:
    # A timeout shows up as a URLError whose .reason is a socket.timeout
    if isinstance(e.reason, socket.timeout):
        print("TIME OUT")
    else:
        print("OK")
1_2 urllib 响应
# coding=utf-8
import urllib.request

# Inspect a response object: its concrete type, status code, and headers.
resp = urllib.request.urlopen("http://www.baidu.com")
# Response type (an http.client.HTTPResponse instance)
print(type(resp))
# Status code: 200 means the request succeeded
print(resp.status)
# Full list of response headers as (name, value) pairs
print(resp.getheaders())
1_3 urllib Request
# coding=utf-8
from urllib import request, parse

# Request: build a request object carrying custom headers, a body and a method.
url = 'http://httpbin.org/post'
headers = {
    'User-Agent': 'Mozilla/4.0(compatible;MSIE 5.5;Windows NT)',
    'Host': 'httpbin.org'
}
# Renamed from `dict` to avoid shadowing the builtin of the same name.
form = {
    'name': 'Germey'
}
# POST bodies must be bytes: url-encode the form, then encode to utf-8.
data = bytes(parse.urlencode(form), encoding='utf8')
req = request.Request(url=url, data=data, headers=headers, method='POST')
response = request.urlopen(req)
print(response.read().decode('utf-8'))
1_4 urllib Handler
# coding=utf-8
import urllib.request
import http.cookiejar

# Handler: proxies and cookies.

# Proxy: route requests through a proxy server.
proxy_handler = urllib.request.ProxyHandler({
    'http': 'http://127.0.0.1:80',    # change the port to your proxy's port
    'https': 'https://127.0.0.1:80'   # change the port to your proxy's port
})
opener = urllib.request.build_opener(proxy_handler)
#response = opener.open('http://www.baidu.com')
#print(response.read())

# Cookie: keep login/session state across requests.
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
for item in cookie:
    print(item.name + "=" + item.value)

# Save cookies to a local file (Mozilla format) so later runs can reuse them.
filename1 = "1_4_1 urllib Handler cookie1.txt"
cookie = http.cookiejar.MozillaCookieJar(filename1)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
# ignore_discard: save cookies even if they are marked to be discarded;
# ignore_expires: write cookies to the file even if it already exists.
cookie.save(ignore_discard=True, ignore_expires=True)

# Same thing, saved in LWP format.
filename2 = "1_4_2 urllib Handler cookie2.txt"
cookie = http.cookiejar.LWPCookieJar(filename2)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)

# Load cookies back from the LWP file and attach them to a new request.
cookie = http.cookiejar.LWPCookieJar()
cookie.load('1_4_2 urllib Handler cookie2.txt', ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
#print(response.read().decode('utf-8'))
1_5 urllib 异常处理
# coding=utf-8
from urllib import request, error
import socket

# Exception handling: catch HTTPError (a URLError subclass) before URLError.
try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')  # a page that does not exist
except error.HTTPError as e:
    # HTTPError additionally carries the status code and response headers
    print(e.reason, e.code, e.headers, sep='\n')
except error.URLError as e:
    print(e.reason)
else:
    print('Request Successfully')

# A timeout surfaces as a URLError whose .reason is a socket.timeout instance.
try:
    response = request.urlopen('http://www.baidu.com', timeout=0.01)
except error.URLError as e:
    print(type(e.reason))
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')
1_6 urllib URL解析
# coding=utf-8
from urllib.parse import urlparse, urlunparse, urljoin, urlencode

# Signature: urlparse(urlstring, scheme='', allow_fragments=True)

# --- Split a URL into its six components: urlparse ---
print('URL解析:urlparse')
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
print(type(result), result)
# scheme= only supplies a default: it is used when the URL itself has none...
result = urlparse('www.baidu.com/index.html;user?id=5#comment', scheme='https')
print(result)
# ...and is ignored when the URL already carries a scheme.
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment', scheme='https')
print(result)
# allow_fragments=False folds the #fragment into the preceding component.
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment', allow_fragments=False)
print(result)
result = urlparse('http://www.baidu.com/index.html#comment', allow_fragments=False)
print(result)

# --- Reassemble six components into a URL: urlunparse ---
print('\nURL反解析(拼接):urlunparse')
data = ['http', 'www.baidu.com', 'index.html', 'user', 'a=6', 'comment']
print(urlunparse(data))  # http://www.baidu.com/index.html;user?a=6#comment

# --- Resolve a (possibly relative) URL against a base: urljoin ---
print('\nURL填充(后者填充前者):urljoin')
print(urljoin('http://www.baidu.com', 'FAQ.html'))                                           # http://www.baidu.com/FAQ.html
print(urljoin('http://www.baidu.com', 'https://XerCis.com/FAQ.html'))                        # https://XerCis.com/FAQ.html
print(urljoin('http://www.baidu.com/about.html', 'https://XerCis.com/FAQ.html'))             # https://XerCis.com/FAQ.html
print(urljoin('http://www.baidu.com/about.html', 'https://XerCis.com/FAQ.html?question=2'))  # https://XerCis.com/FAQ.html?question=2
print(urljoin('http://www.baidu.com/about.html?wd=abc', 'https://XerCis.com/index.php'))     # https://XerCis.com/index.php
print(urljoin('http://www.baidu.com', '?category=2#comment'))                                # http://www.baidu.com?category=2#comment
print(urljoin('www.baidu.com', '?category=2#comment'))                                       # www.baidu.com?category=2#comment
print(urljoin('www.baidu.com#comment', '?category=2'))                                       # www.baidu.com?category=2

# --- Turn a dict into a query string: urlencode ---
print('\nURL字典转请求参数:urlencode')
params = {
    'name': 'germey',
    'age': 22
}
base_url = 'http://www.baidu.com?'
url = base_url + urlencode(params)
print(url)  # http://www.baidu.com?name=germey&age=22