目标:爬取糗事百科的段子
代码:
# -*- coding: utf-8 -*-
"""Fetch one page of the Qiushibaike "hot" listing and print its entries.

The page is downloaded with a browser-like User-Agent header, decoded as
UTF-8, and scanned with a regular expression.  Every match yields three
captured strings, which are printed on one line separated by spaces.

The code runs unchanged on Python 2.6+ and Python 3.
"""
__author__ = 'beauty'

import re
import sys

try:  # Python 2
    from urllib2 import Request, URLError, urlopen
except ImportError:  # Python 3 split urllib2 into urllib.request / urllib.error
    from urllib.error import URLError
    from urllib.request import Request, urlopen

# Encoding applied to text before it is written to the console, to avoid
# garbled (mojibake) output.  sys.getfilesystemencoding() may return None on
# Python 2, and str.encode(None) raises TypeError, so fall back to UTF-8.
output_encoding = sys.getfilesystemencoding() or 'utf-8'

BASE_URL = 'http://www.qiushibaike.com/hot/page/'
page = 1
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}

# Three capture groups per entry:
#   1. the text inside the <h2> of the "author clearfix" block,
#   2. the body of the following <div class="content">,
#   3. the text of the following <i class="number"> (presumably the vote
#      count -- verify against the live markup).
# re.S lets "." span newlines, since one entry covers many lines of HTML.
ITEM_PATTERN = re.compile(
    '<div class="author clearfix">.*?href.*?<img src.*?title=.*?'
    '<h2>(.*?)</h2>.*?<div class="content">(.*?)</div>.*?'
    '<i class="number">(.*?)</i>',
    re.S,
)


def build_url(page_number):
    """Return the listing URL for the given page number."""
    return BASE_URL + str(page_number)


url = build_url(page)


def fetch_page(page_url, request_headers):
    """Download ``page_url`` and return its body decoded as UTF-8 text.

    Raises:
        URLError: if the request fails (HTTPError is a subclass).
    """
    request = Request(page_url, headers=request_headers)
    response = urlopen(request)
    try:
        return response.read().decode('utf-8')
    finally:
        # Python 2 responses are not context managers, so close explicitly.
        response.close()


def extract_items(content):
    """Return a list of 3-tuples of captured strings found in ``content``.

    An empty list is returned when nothing matches.
    """
    return ITEM_PATTERN.findall(content)


def encode_item(item, encoding=None):
    """Join the fields of ``item`` with spaces and encode them to bytes.

    Characters that ``encoding`` cannot represent are replaced with ``?``
    instead of raising UnicodeEncodeError.  ``encoding`` defaults to the
    module-level ``output_encoding``.
    """
    return u' '.join(item).encode(encoding or output_encoding, 'replace')


def main():
    """Fetch the configured page and print every extracted entry."""
    try:
        content = fetch_page(url, headers)
    except URLError as e:
        # HTTPError carries both attributes; a plain URLError only "reason".
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        return

    # Already-encoded bytes go to the binary layer on Python 3; on Python 2
    # sys.stdout accepts byte strings directly.
    out = getattr(sys.stdout, 'buffer', sys.stdout)
    for item in extract_items(content):
        out.write(encode_item(item) + b'\n')
    out.flush()


if __name__ == '__main__':
    main()

# Result of running the script in PyCharm: