Back up the old blog posts with Python.

1. Crawl the blog's entry list pages and save each post's link to the database.
# -*- coding:utf-8 -*-
import urllib.request
from urllib import request
from bs4 import BeautifulSoup
import sqlite3

domain = "TTTT"  #### change this to your blog's subdomain
url = "http://" + domain + ".blog.sohu.com/entry/"
urlFile = urllib.request.urlopen(url)
data = urlFile.read()
urlFile.close()
data = data.decode('utf-8', errors='ignore')
print("get page success")

# Pull the blog id (_ebi) out of the page's inline JavaScript
pre = "var _ebi = '"
index1 = data.find(pre) + len(pre)
index2 = data.find("'", index1)
ebi = data[index1:index2]
print("ebi:" + ebi)

# Pull the total entry count, then work out the number of list pages (20 entries per page)
pre = "var totalCount = "
index1 = data.find(pre) + len(pre)
index2 = data.find(';', index1)
print("totalcount:" + data[index1:index2])
totalCount = int(data[index1:index2])
if totalCount % 20 > 0:
    totalPage = str(int(totalCount / 20 + 1))
else:
    totalPage = str(int(totalCount / 20))
print("totalpage:" + totalPage)

def getBlogList(pageId):
    url = "http://" + domain + ".blog.sohu.com/action/v_frag-ebi_" + ebi + "-pg_" + pageId + "/entry/"
    print("get url:" + url)
    # 1. Fetch the list page HTML
    with request.urlopen(url) as f:
        html_doc = f.read()
    html_doc = html_doc.decode('utf-8', errors='ignore')
    # 2. Parse the page and pick out each entry's title and link; the markup looks like:
    # <h2 class="news_entry">
    #   <a href="/n/535728/" target="_blank">传Windows 10 Mobile Build 11088下月初发布</a>
    # </h2>
    soup = BeautifulSoup(html_doc, "html.parser")
    news_array = soup.find_all('div', {'class': 'newBlog-list-title'})
    for news in news_array:
        if news.a:
            print(news.a.get("href"))  # the entry link
            save(news.a.get("href"))
            # print(news.a.string)  # the entry title

def save(link, title=None):
    if title is None:
        title = ""
    conn = sqlite3.connect('blog.db')
    cursor = conn.cursor()
    # Create the blog table if it does not exist yet:
    cursor.execute('create table IF NOT EXISTS blog (id INTEGER PRIMARY KEY, title varchar(100), link varchar(100), content text, postdate varchar(100), status Integer)')
    cursor.execute('select * from blog where link=?', (link,))
    values = cursor.fetchall()
    if len(values) > 0:  # the link was saved before
        print('link already exists: ' + link)
    else:
        cursor.execute('insert into blog (title, link, status) values (?, ?, 0)', (title, link))
        conn.commit()
        print("save success. " + link)
    # Close the cursor, commit and close the connection:
    cursor.close()
    conn.commit()
    conn.close()

# Crawl every list page and remember the pages that failed
errorLink = []
for x in range(1, int(totalPage) + 1):
    try:
        getBlogList(str(x))
    except Exception as e:
        print('except:', e)
        errorLink.append(x)
print("errorLink:" + str(errorLink))
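As a quick check after this step, you can query blog.db to see which links were saved. The snippet below is only a small sketch; it assumes the blog table created by the script above already exists in the working directory.

# -*- coding:utf-8 -*-
# Small sanity check of what the list crawl saved (assumes the blog.db created by the script above).
import sqlite3

conn = sqlite3.connect('blog.db')
cursor = conn.cursor()
# status=0 means the link is saved but its content has not been fetched yet
cursor.execute('select count(*) from blog where status=0')
print('entries waiting for content:', cursor.fetchone()[0])
cursor.execute('select id, link from blog order by id')
for row in cursor.fetchall():
    print(row)
cursor.close()
conn.close()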
2. Crawl each content page and save the post content to the database.
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import urllib.request
from urllib import request
# Import the SQLite driver:
import sqlite3

def updateContent():
    conn = sqlite3.connect('blog.db')
    cursor = conn.cursor()
    # status=0 marks entries whose content has not been fetched yet
    cursor.execute('select * from blog where status=0')
    values = cursor.fetchall()
    for line in values:
        id = line[0]
        link = line[2]
        soup = getContent(link)
        try:
            title = soup.find('div', {'class': 'item-body'}).h2.span.get_text()
            postdate = soup.find('span', {'class': 'date'}).get_text()
            content = str(soup.find('div', {'class': 'item-content'}))
            # Strip the wrapping div's opening tag and everything from the trailing clear div onward
            end = "<div class=\"clear\"></div>"
            content = content[45:content.find(end)]
            print(link)
            cursor.execute('update blog set title=?,content=?,status=1,postdate=? where id=?', (title, content, postdate, id))
            conn.commit()
        except Exception as e:
            print('except:', e)
    cursor.close()
    conn.commit()
    conn.close()

# Fetch the page for a given link and return the parsed soup
def getContent(link):
    # 1. Fetch the page HTML
    html_doc = ""
    # Build the request headers; at minimum they should contain these two fields,
    # which were worked out from a captured request.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:44.0) Gecko/20100101 Firefox/44.0',
               'Referer': link}
    # Request the page with a browser-like User-Agent so the server does not reject the crawler
    try:
        # with request.urlopen(link) as f:
        #     html_doc = f.read()
        req = urllib.request.Request(link, None, headers)
        html_doc = urllib.request.urlopen(req).read()
    except Exception as e:
        print('except:', e)
    # 2. Parse the page content
    soup = BeautifulSoup(html_doc, "html.parser")
    return soup

# Crawl every entry that has no content yet and fill the content in
updateContent()
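If you also want the backup on disk rather than only in SQLite, a small export step can write each crawled post to an HTML file. This is only a sketch and assumes the table layout created by the first script (title, link, content, postdate, status), with status=1 marking posts whose content has been fetched; the backup directory name is arbitrary.

# -*- coding:utf-8 -*-
# Sketch: export the crawled posts (status=1) from blog.db to individual HTML files.
# Assumes the blog table layout created by the first script.
import os
import sqlite3

os.makedirs('backup', exist_ok=True)
conn = sqlite3.connect('blog.db')
cursor = conn.cursor()
cursor.execute('select id, title, content, postdate from blog where status=1')
for id, title, content, postdate in cursor.fetchall():
    path = os.path.join('backup', str(id) + '.html')
    with open(path, 'w', encoding='utf-8') as f:
        f.write('<h2>' + (title or '') + '</h2>\n')
        f.write('<p>' + (postdate or '') + '</p>\n')
        f.write(content or '')
    print('exported:', path)
cursor.close()
conn.close()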