安装
pip install beautifulsoup4
建议安装lxml,作为Beautiful Soup推荐使用的第三方解析器(并非内置,需要单独安装)
对于windows,到 http://www.lfd.uci.edu/~gohlke/pythonlibs/ 搜索下载并安装
pip3 install "lxml-3.6.0-cp35-cp35m-win_amd64.whl"
核心方法
http://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/
find(name, attrs, recursive, text, **kwargs)
find_all(name, attrs, recursive, text, **kwargs)
name对应tag名字,比如'div'
attrs对应tag的属性,比如id='myid';由于class是Python关键字,不能写class='xxx',应直接把类名作为第二个位置参数传入,或使用class_='xxx';也可以设置True或False来过滤tag是否拥有该属性
tag可以通过 . 运算符依次获取下一级的tag,e.g. tag.div.a
tag的内容可以通过string获取,属性可以通过get('attr_name')获取
代码
"""Crawl every blog title and URL from an iteye blog and save them to blogs.txt."""
import sys

import requests
from bs4 import BeautifulSoup

USER_ROOT_BLOG = 'http://xuanzhui.iteye.com/'

# Pretend to be a regular browser so the server does not reject the request.
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/45.0.2454.93 Safari/537.36'
}


def _parse_total_pages(page_str):
    """Return the total number of listing pages parsed from *page_str*.

    Returns 1 when the page has no pagination div (single-page blogs).
    """
    soup = BeautifulSoup(page_str, 'lxml')
    # Root node of the pagination widget ('pagination' matches the class).
    page_div = soup.find('div', 'pagination')
    if not page_div:
        return 1
    # attrs=False keeps only <a> tags without a class attribute.
    page_tags = page_div.find_all('a', False)
    # Skip anchors whose .string is None (nested markup) or non-numeric
    # labels such as "next page" — only numeric page labels count.
    nums = [int(tag.string) for tag in page_tags
            if tag.string and tag.string.isdigit()]
    return max(nums) if nums else 1


def parse_to_get_blogs(page_str):
    """Extract (absolute_url, title) pairs from one listing page.

    Returns an empty list when the page contains no blog entries, so the
    caller can always extend() the result.  (Returning None here — as the
    original did — would crash list.extend in the page loop.)
    """
    soup = BeautifulSoup(page_str, 'lxml')
    # All nodes holding an article title ('blog_title' matches the class).
    title_tags = soup.find_all('div', 'blog_title')
    if not title_tags:
        return []
    # Drop the trailing '/' because the hrefs already start with '/'.
    url_pref = USER_ROOT_BLOG[:-1]
    return [(url_pref + tag.h3.a.get('href'), tag.h3.a.string)
            for tag in title_tags]


def main():
    page_str = requests.get(USER_ROOT_BLOG, headers=HEADERS).text

    total_page = _parse_total_pages(page_str)
    print('total page:', total_page)

    blogs = parse_to_get_blogs(page_str)
    # No articles at all — nothing to write.
    if not blogs:
        print("no valid titles")
        sys.exit(0)

    # Page 1 is already parsed; fetch the remaining listing pages.
    for i in range(2, total_page + 1):
        url = USER_ROOT_BLOG + '?page=' + str(i)
        print('parsing ', url)
        page_str = requests.get(url, headers=HEADERS).text
        blogs.extend(parse_to_get_blogs(page_str))

    # Explicit utf-8 so non-ASCII titles do not raise UnicodeEncodeError on
    # platforms whose default encoding is not UTF-8 (e.g. Windows cp936).
    with open('blogs.txt', 'w', encoding='utf-8') as f:
        for link, title in blogs:
            f.write(link)
            f.write('\n')
            # A title tag with nested markup yields .string == None;
            # write an empty string instead of crashing.
            f.write(title or '')
            f.write('\n\n')


if __name__ == '__main__':
    main()