Fetching a Blog Post List with BeautifulSoup

Installation

pip install beautifulsoup4

It is also recommended to install lxml and use it as Beautiful Soup's parser; it is faster than the default html.parser from the standard library.

On Windows, search http://www.lfd.uci.edu/~gohlke/pythonlibs/ for a prebuilt wheel, download it, and install it:

pip3 install "lxml-3.6.0-cp35-cp35m-win_amd64.whl"
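
Which parser Beautiful Soup uses is selected by the second argument to the BeautifulSoup constructor. Below is a minimal sketch that prefers lxml and falls back to the standard-library html.parser when lxml is missing; the html string is throwaway markup just for illustration:

from bs4 import BeautifulSoup

html = '<p>hello</p>'  # throwaway markup just to exercise the parser

try:
	import lxml  # only checking that the package is importable
	soup = BeautifulSoup(html, 'lxml')
except ImportError:
	soup = BeautifulSoup(html, 'html.parser')

print(soup.p.string)  # hello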

Core Methods

http://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/

find(name, attrs, recursive, text, **kwargs)

find_all(name, attrs, recursive, text, **kwargs)

name matches the tag name, e.g. 'div'.

attrs matches the tag's attributes, e.g. id='myid'. Because class is a reserved word in Python, you cannot write class='xxx'; pass the class name directly as the second positional argument (or use class_='xxx'). You can also pass True or False to filter on whether a tag has the attribute at all.

A tag's children can be reached level by level with the . operator, e.g. tag.div.a.

A tag's text is available through .string, and its attributes through get('attr_name').
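
A short self-contained demo of these points; the markup is invented for illustration and mimics a pagination block like the one parsed below:

from bs4 import BeautifulSoup

html = '''
<div class="pagination">
	<a>1</a>
	<a>2</a>
	<a class="next_page" href="?page=2">next</a>
</div>
'''

soup = BeautifulSoup(html, 'lxml')

# the class name is passed positionally; class_='pagination' would also work
div = soup.find('div', 'pagination')

# False matches only tags that have no class attribute at all
page_links = div.find_all('a', False)
print([a.string for a in page_links])            # ['1', '2']

# walk down with the . operator, read text via .string and attributes via get()
print(div.a.string)                              # 1  (first <a> under the div)
print(div.find('a', 'next_page').get('href'))    # ?page=2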

Code

import requests
from bs4 import BeautifulSoup

user_root_blog = 'http://xuanzhui.iteye.com/'

# pretend to be a regular browser
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.93 Safari/537.36'}

page_str = requests.get(user_root_blog, headers=headers).text

# work out how many pages there are in total
soup = BeautifulSoup(page_str, 'lxml')
# root node of the pagination section
page_div = soup.find('div', 'pagination')

total_page = 1

if page_div:
	# anchors without a class attribute are the page-number links
	page_tags = page_div.find_all('a', False)
	page_arr = [int(page.string) for page in page_tags if page.string and page.string.isdigit()]

	if page_arr:
		total_page = max(page_arr)

print('total page:', total_page)


# parse one page and return a (url, title) pair for each blog post
def parse_to_get_blogs(page_str):
	soup = BeautifulSoup(page_str, 'lxml')

	# all nodes containing article titles
	title_tags = soup.find_all('div', 'blog_title')

	if not title_tags:
		# return an empty list rather than None, so callers can extend() safely
		return []

	# hrefs start with '/', so drop the trailing slash from the root URL
	url_pref = user_root_blog[:-1]
	return [(url_pref + tag.h3.a.get('href'), tag.h3.a.string) for tag in title_tags]


blogs = parse_to_get_blogs(page_str)

# if there are no posts at all, stop here
if not blogs:
	print('no valid titles')
	exit(0)

for i in range(2, total_page + 1):
	url = user_root_blog + '?page=' + str(i)
	print('parsing ', url)
	page_str = requests.get(url, headers=headers).text
	blogs.extend(parse_to_get_blogs(page_str))

# write records as 'url\ntitle\n\n'; utf-8 so non-ASCII titles survive on Windows
with open('blogs.txt', 'w', encoding='utf-8') as f:
	for tmp in blogs:
		f.write(tmp[0])
		f.write('\n')
		f.write(tmp[1])
		f.write('\n\n')
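
One caveat: requests.get as used above sets no timeout and never checks the HTTP status, so a flaky connection can hang the script or feed an error page into the parser. A hedged variant of the fetch step (fetch_page is a helper name introduced here, not part of the original post):

def fetch_page(url):
	# hypothetical helper: fail loudly on HTTP errors and cap the wait time
	resp = requests.get(url, headers=headers, timeout=10)
	resp.raise_for_status()
	return resp.text

# e.g. page_str = fetch_page(user_root_blog)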

Reposted from xuanzhui.iteye.com/blog/2285507