# Simulate a browser: ("Header-Name", "value") tuple form, as expected by
# an opener's .addheaders list.
headers = (
    "User-Agent",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
)
常用的“User-Agent":
# Pool of common User-Agent strings; picking one at random per run makes the
# crawler appear to be a (varying) real browser.
ua_list = [
    "Mozilla/5.0 (Windows NT 6.1; ) Apple.... ",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0)... ",
    "Mozilla/5.0 (Macintosh; U; PPC Mac OS X.... ",
    "Mozilla/5.0 (Macintosh; Intel Mac OS... ",
]
user_agent = random.choice(ua_list)
两种让爬虫模拟成浏览器的方法:
方法1:使用build_opener()修改报头
由于urlopen()不支持一些HTTP的高级功能,所以,我们如果要修改报头,可以使用urllib.request.build_opener()进行
比如:
url = "http://blog.csdn.net/weiwei_pig/article/details/51178226" header = (“User-Agent”,“Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.84 Safari/537.36”) opener = urllib.request.build_opener()#创建build_opener操作对象 opener.addheaders = [header]#添加报头信息 data = opener.open(url).read()#接收返回信息,并读取 #此时已经模仿为浏览器去打开,我们保存爬到的信息 fhandle = open(“F:/python/part4/3.html”,“wb”) a = fhandle.write(data)#print(a)查看写入的字节数 fhandle.close()方法2:使用add_header()添加报头
除了上述方法,还可以使用urllib.request.Request()下的add_header()实现浏览器的模拟:
# Method 2: use Request.add_header() to set the User-Agent.
# NOTE: the original used curly typographic quotes (“ ”), which are a
# SyntaxError in Python; fixed to straight quotes.
import urllib.request

url = "http://blog.csdn.net/weiwei_pig/article/details/51178226"
req = urllib.request.Request(url)  # build a Request object for the target URL
req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.84 Safari/537.36")
data = urllib.request.urlopen(req).read()  # fetch the page as bytes
data = data.decode("utf-8")  # decode the raw bytes as UTF-8 text
print(data)