#!/usr/bin/env python
# coding:utf-8
'''
Created on 2016-02-13
@author: xingjiarong

Scrape the page-view counts of a personal CSDN blog — written as practice.
'''
import urllib2
import re
from bs4 import BeautifulSoup
import sys
import chardet
# Python 2 hack: force the process-wide default string encoding to UTF-8 so
# implicit str/unicode conversions of the Chinese page content do not raise.
reload(sys)
sys.setdefaultencoding('utf-8')
type = sys.getfilesystemencoding()  # NOTE(review): shadows the builtin `type`; value looks unused below
# Current blog list page number
now_page = 1
# Number of the last list page
last_page = 2
all_url = []  # collected article URLs
all_id = []   # collected article ids (as digit strings)
account = 'hehedadaq'
baseUrl = 'http://blog.csdn.net/'+account
class Blog(object):
    """One CSDN blog article, identified by its numeric article id.

    On construction the article page is downloaded and the title and
    page-view count are extracted from it.
    """

    def __init__(self, num, id):
        # `id` shadows the builtin but is kept to preserve the interface.
        self.id = id
        self.num = num
        print("self.num :", self.num)
        self.page = self.get_page()            # raw HTML of the article page
        self.title = self.get_title()          # UTF-8 encoded title bytes
        self.page_view = self.get_page_view()  # read count as a string

    def get_page(self):
        """Download the article's detail page and return its HTML."""
        self.url = baseUrl + '/article/details/' + str(self.id)
        print(self.url)
        return getPage(self.url)

    def get_title(self):
        """Extract and return the article title as UTF-8 encoded bytes."""
        soup = BeautifulSoup(self.page, 'html.parser', from_encoding='utf-8')
        node = soup.find('h1', class_='title-article')
        title = node.get_text().encode('utf-8')
        print(chardet.detect(title))
        # The author's console is GBK; re-encode only for display.
        print(title.decode('utf-8').encode('GBK'))
        # BUG FIX: the original `return title` referenced a name defined only
        # in commented-out code, raising NameError on every call; return the
        # extracted title instead.
        return title

    def get_page_view(self):
        """Extract and return the article's read count as a string."""
        pattern = re.compile(r'<span.*?class="read-count">阅读数:(.*?)</span>')  # the read-count number
        page_view = pattern.findall(self.page)
        # findall returns e.g. ['123']; str()[2:-2] strips "['" and "']".
        page_view = str(page_view)[2:-2]
        print("the %d blog page_view is %s" % (self.num, page_view))
        return page_view
def getPage(url):
    """Fetch *url* and return the raw response body.

    A browser-like User-Agent header is sent because CSDN rejects
    requests that identify themselves as plain scripts.
    """
    print("url:", url)
    request = urllib2.Request(
        url,
        headers={'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'},
    )
    response = urllib2.urlopen(request)
    return response.read()
# Walk every list page, collecting article ids, titles and view counts.
while now_page <= last_page:
    print('-----------------------------the %d page ---------------------------------' % (now_page,))
    # Fetch the raw HTML of the current list page.
    myUrl = baseUrl + '/article/list/' + str(now_page)
    myPage = getPage(myUrl)
    # Each article entry carries a data-articleid="..." attribute.
    idList = re.findall('data-articleid=".*?"', myPage, re.S)
    for id in idList:
        pattern = re.compile(r'\d+')  # pull the numeric article id out of the attribute
        url_id = pattern.findall(id)
        # str(map(int, ...)) yields e.g. '[123]' on Python 2; [1:-1] strips the brackets.
        url_id = str(map(int, url_id))
        url_id = url_id[1:-1]
        all_id.append(url_id)
        all_url.append(baseUrl + '/' + url_id)
    # Titles of the articles listed on this page.
    title = re.findall('<span class="link_title"><a href=".*?">(.*?)</a></span>', myPage, re.S)
    titleList = []
    for items in title:
        titleList.append(str(items).lstrip().rstrip())
    # BUG FIX: the view-count extraction was commented out in the original,
    # so the `viewList[n]` reference in the print loop below raised NameError.
    view = re.findall('<span class="link_view".*?><a href=".*?" title="阅读次数">阅读</a>\((.*?)\)</span>', myPage, re.S)
    viewList = []
    for items in view:
        viewList.append(str(items).lstrip().rstrip())
    # Report what was found on this list page.
    for n in range(len(titleList)):
        print('page_view:%s title:%s' % (viewList[n].zfill(4), titleList[n]))
    # Advance to the next list page.
    now_page = now_page + 1

# Build one Blog object per collected article id, bound as blog_1, blog_2, ...
i = 1
for id in all_id:
    print("id:", id)
    locals()['blog_' + str(i)] = Blog(i, id)
    i += 1
# NOTE(review): the lines below are webpage artifacts accidentally pasted from
# the CSDN article page ("draft" heading, "you may like", reprint notice,
# recommendation widgets). They are not Python and caused a SyntaxError;
# kept here as comments so no content is lost.
# 爬虫demo_草稿
# 猜你喜欢
# 转载自blog.csdn.net/hehedadaq/article/details/81749140
# 今日推荐
# 周排行