Goal:
Crawl all the PDFs linked from this page: https://github.com/THUNLP-MT/MT-Reading-List#syntax_based_models
Download all PDFs from a website:
#file-name: pdf_download.py
__author__ = 'rxread'
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def download_file(url, index):
    local_filename = index + "-" + url.split('/')[-1]
    # NOTE the stream=True parameter: the response body is read in
    # chunks instead of being loaded into memory all at once.
    r = requests.get(url, stream=True)
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
    return local_filename

root_link = "https://github.com/THUNLP-MT/MT-Reading-List#syntax_based_models"
r = requests.get(root_link)
if r.status_code == 200:
    soup = BeautifulSoup(r.text, "html.parser")
    index = 1
    for link in soup.find_all('a'):
        href = link.get('href')
        if href is None:  # some <a> tags carry no href at all
            continue
        # urljoin resolves relative hrefs against the page URL and leaves
        # absolute URLs (arXiv, ACL Anthology, ...) untouched; plain string
        # concatenation with root_link would mangle both kinds of link.
        new_link = urljoin(root_link, href)
        if new_link.endswith(".pdf"):
            file_path = download_file(new_link, str(index))
            print("downloading: " + new_link + " -> " + file_path)
            index += 1
    print("all downloads finished")
else:
    print("errors occurred.")
Download all the links on a website:
from urllib.request import urlopen  # fetch the page
from bs4 import BeautifulSoup       # parse the HTML

html = urlopen('https://github.com/THUNLP-MT/MT-Reading-List#syntax_based_models')
soup = BeautifulSoup(html, 'html.parser')
for link in soup.find_all('a'):
    print(link.get('href'))  # may print None for <a> tags without an href
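The loop above prints every href as-is, including page-internal anchors such as #syntax_based_models, relative paths, and the occasional None. Here is a short sketch of how one might narrow that output to absolute PDF URLs; the filtering step is my own addition, not something the referenced blogs do.

from urllib.parse import urljoin
from urllib.request import urlopen
from bs4 import BeautifulSoup

page = 'https://github.com/THUNLP-MT/MT-Reading-List#syntax_based_models'
soup = BeautifulSoup(urlopen(page), 'html.parser')
# href=True makes find_all skip <a> tags without an href attribute.
pdf_links = [urljoin(page, a['href'])
             for a in soup.find_all('a', href=True)
             if a['href'].endswith('.pdf')]
for url in pdf_links:
    print(url)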
Reference blogs:
https://blog.csdn.net/bull521/article/details/83448781
https://blog.csdn.net/qq_35193302/article/details/83510213
http://blog.zanlabs.com/2014/11/11/python-webpage-crawling/
https://blog.csdn.net/baidu_28479651/article/details/76158051