# -*- coding: UTF-8 -*- # Python: 2.7.8 # Platform: Windows # Program: Get Novels From Internet # Author: dxl # Description: Get Novels # Version: 1.0 # History: import urllib2,os,codecs from bs4 import BeautifulSoup #跟网址 http://lknovel.lightnovel.cn/main/vollist/66.html url='' #存储路径 title_path='' #抽取正则 reg = '(?<=a href=")http:[^\s].+(?=")' #request消息头 heads = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Charset':'GB2312,utf-8;q=0.7,*;q=0.7', 'Accept-Language':'zh-cn,zh;q=0.5', 'Cache-Control':'max-age=0', 'Connection':'keep-alive', 'Host':'John', 'Keep-Alive':'115', 'Referer':url, 'User-Agent':'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.14) Gecko/20110221 Ubuntu/10.10 (maverick) Firefox/3.6.14'} #获取网页信息 def getHtml(url): opener = urllib2.build_opener(urllib2.HTTPCookieProcessor()) urllib2.install_opener(opener) req = urllib2.Request(url) opener.addheaders = heads.items() respHtml = opener.open(req).read() return respHtml; #获取小说列表 def getTextList(url): urls=[] respHtml = getHtml(url) soup = BeautifulSoup(respHtml) list = soup.find_all('dd') for x in list: urls.append(x.find_all('a')[1].get('href')) return urls #获取文本信息到本地 def loadText(urls): for url in urls : load(url) #下载资源 def load(url): soup = BeautifulSoup(getHtml(url)) title=soup.find_all("h1",class_='ft-24') title_name= title[0].find_all('strong')[0].get_text().replace('\r\n','').replace('\t','') title_path='D:/pms_branches/MyPython/src/com/dxl/%s'%title_name if not os.path.exists(title_path): os.mkdir(title_path) list= soup.find_all("li",class_='span3') for x in list: title_list_name= x.find_all('a')[0].find_all('span')[0].get_text().replace('\r\n','').replace('\t','').replace('?','').replace('<','').replace('>','').replace('|','').replace('*','').replace('"','').replace(':','') title_list_path= title_path+'\%s'%title_list_name if not os.path.exists(title_list_path): soup_html=BeautifulSoup(getHtml(x.find_all('a')[0].get('href'))) 
text_lists=soup_html.find_all("div",class_='lk-view-line') with codecs.open(title_list_path,'wb','GB18030') as fp: for text in text_lists: fp.write(text.get_text()) print '完成%s'%title_list_name if __name__=="__main__": # url=raw_input("""输入目标网址\n 按回车键结束\n""") url='http://lknovel.lightnovel.cn/main/vollist/573.html' urls=getTextList(url) loadText(urls)
第一次用 Python 写爬虫
刚开始接触 Python,看到网上很多人都建议写写爬虫来练手,于是周末加班的时候边查边学地写了个抓取轻之国度小说的脚本。代码会有很多不足之处,仅仅是尝试用 Python 写代码的一次练习~~
猜你喜欢
转载自ssdfz001.iteye.com/blog/2228685
今日推荐
周排行