import re
from urllib.request import urlopen
# Common prefix of the blog's article-list URLs; the crawl loop appends a
# 1-based page number to it to form each page's address.
baseurl = 'https://blog.csdn.net/gf_lvah/article/list/'
# Number of article-list pages (the crawl loop walks pages 1 through 7).
pages = 7
def get_page(url: str) -> str:
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Args:
        url: Address to open with ``urllib.request.urlopen``.

    Returns:
        The complete response body as a ``str``.

    Raises:
        urllib.error.URLError: If the resource cannot be opened.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the underlying connection even when
    # reading or decoding fails, so repeated calls do not leak sockets.
    with urlopen(url) as response:
        return response.read().decode('utf-8')
def parse_content(content: str) -> list:
    """Extract article links and titles from an article-list page.

    The pattern looks for an ``<a href="..." target="_blank">`` anchor whose
    first child is a ``<span class="article-type type-1">`` badge, and
    captures the anchor's ``href`` plus the text between the closing
    ``</span>`` and the closing ``</a>``.

    Args:
        content: HTML text of the page to scan.

    Returns:
        A list of ``(href, title)`` tuples, one per matching anchor, in the
        order they appear in *content*; an empty list when nothing matches.
    """
    # Group 1 is the link target, group 2 the title text; the surrounding
    # ``\s+`` runs absorb the line breaks and indentation inside the anchor.
    pattern = r' <a href="(.*?)" target="_blank">\s+<span class="article-type type-1">\s+.*?</span>\s+(.*?)\s+</a>'
    return re.findall(pattern, content)
# Accumulates the (url, title) tuples found on every list page, in crawl order.
List = []
# Walk the list pages by their 1-based page number, driven by the `pages`
# constant rather than a second hard-coded copy of the count.
for page_number in range(1, pages + 1):
    url = baseurl + str(page_number)
    print(url)
    content = get_page(url)
    print("爬取第%d页" % page_number)
    # Parse once and reuse the result for both the log output and the list.
    entries = parse_content(content)
    print(entries)
    List.extend(entries)

# Explicit UTF-8: the titles and the line template contain Chinese text, and
# the platform's default encoding may be unable to represent it.
with open('csdn.txt', 'w', encoding='utf-8') as f:
    # Posts are numbered in descending order: the first scraped entry gets
    # len(List) and the last one gets 1.
    for number, (url, name) in zip(range(len(List), 0, -1), List):
        print(url, name)
        name = name.strip()
        # One Markdown link per post, separated by a blank line.
        f.write('[ 第%s篇博文 : %s ](%s)\n\n' % (number, name, url))
# Batch-generate a list of blog posts
# You may also like
# Reposted from blog.csdn.net/gf_lvah/article/details/81171759
# Today's recommendations
# Weekly ranking