# info(): fetch the article page and print the metadata object returned by
# the response's info() method.
# The response is opened in a `with` block so the connection is closed as soon
# as the metadata has been printed.
with urllib.request.urlopen(
    "https://blog.csdn.net/qq_40666620/article/details/102834104"
) as data:
    print(data.info())
状态码:可以用来查找失效的链接等。
#getcode:print(data.getcode())
获取当前爬取的 URL 地址
#geturl:print(data.geturl())
timeout超时设置
# Timeout demo: request the same page 100 times with a 0.1 second timeout and
# print either "success" or the error raised for each attempt.
for _ in range(100):
    try:
        with urllib.request.urlopen(
            "https://blog.csdn.net/qq_40666620/article/details/102834104",
            timeout=0.1,
        ) as response:
            data = response.read()
    except Exception as error:
        # Deliberately broad: the demo only reports whatever went wrong
        # and then moves on to the next attempt.
        print(error)
    else:
        print("success")
自动模拟 HTTP 请求
import re
# HTTP request types: POST and GET.
# GET: search Baidu for a keyword and print every title matched on the first
# ten result pages.
keyword = "python"
# Percent-encode the keyword so it is safe to embed in the query string.
keyword = urllib.request.quote(keyword)
url = "http://www.baidu.com/s?wd=" + keyword
target = 'title":"(.*?)"'
# Compile once up front instead of once per fetched page.
title_pattern = re.compile(target)
for pn in range(0, 10):
    # Original author's note: the offset is 9 * pn because Baidu currently
    # shows 9 results per page, so "pn" is no longer a page number.
    with urllib.request.urlopen(url + "&pn=" + str(9 * pn)) as response:
        data = response.read().decode("utf-8")
    result = title_pattern.findall(data)
    for title in result:
        print(title)