思路:用循环依次爬取贴吧的前n页(n由用户输入)的HTML源码,并把每一页分别保存到一个文件里
一共三个函数:
- get_html函数用来爬取页面,并返回页面的字节内容
- save_html函数用来把爬取来的页面代码保存在文件中
- main作为主函数,读取用户输入,循环调用上面两个函数
代码如下:
from urllib.request import urlopen,Request
from urllib.parse import urlencode
def get_html(url):
    """Fetch *url* and return the raw response body.

    The request is sent with a desktop-browser ``User-Agent`` header. The
    body is also echoed to stdout as text for inspection.

    Args:
        url: Address to fetch, in any scheme ``urlopen`` accepts.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
    }
    request = Request(url, headers=headers)
    # The context manager closes the response even if read() raises.
    with urlopen(request) as response:
        info = response.read()
    # The echo is diagnostic only: bytes that are not valid UTF-8 are shown
    # as replacement characters instead of aborting the download.
    print(info.decode(errors="replace"))
    return info
def save_html(filename, html_bytes):
    """Write *html_bytes* to *filename*, replacing any existing file.

    Args:
        filename: Path of the file to create or overwrite.
        html_bytes: Raw bytes to store; written unchanged in binary mode.
    """
    with open(filename, mode="wb") as out_file:
        out_file.write(html_bytes)
def main():
    """Prompt for a keyword and a page count, then download each page.

    Page ``i`` (1-based) is requested with ``pn=(i - 1) * 50`` and
    ``kw=<keyword>``, and saved under the relative name ``第i页.html``.

    Raises:
        ValueError: If the page count entered is not an integer.
    """
    content = input("要下载的内容:")
    num = input('下载的页数:')
    # The fixed "ie=utf-8" pair needs an "&" before the encoded arguments.
    # Without it the query string reads "ie=utf-8pn=0&kw=...", so "pn" is
    # swallowed into the value of "ie" and never sent as its own parameter.
    base_url = "http://tieba.baidu.com/f?ie=utf-8&{}"
    # Convert once, before any download starts, so bad input fails early.
    page_count = int(num)
    for pn in range(page_count):
        args = {
            "pn": pn * 50,
            "kw": content,
        }
        filename = "第{}页.html".format(pn + 1)
        url = base_url.format(urlencode(args))
        print("正在下载" + filename)
        html_bytes = get_html(url)
        save_html(filename, html_bytes)
# Run the downloader only when executed as a script, not when imported.
if __name__ == "__main__":
    main()