入门爬虫（函数封装）-Python

~~以面向过程的思维，~~第一个入门爬虫。运用模块：requests,os,re。

这是我的第一个爬虫，仅为体验爬虫的乐趣，所以直接指定了要访问的 url，没有采用 bs4 模块，~~没有函数封装~~，也没有写成面向对象的形式。日后会对本脚本进行改进，这里仅仅是分享第一个简单爬虫的乐趣 XD。

~~思路为：1.解析url（requests模块） 2.得到源码text 3.正则过滤text（re模块） 4.下载信息 5.清洗信息 6.信息储存~~

import requests
import re
import os

def url2text(url, timeout=10):
    """Fetch ``url`` and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.
            ``requests`` applies no timeout by default, so without this a
            stalled server would block the script indefinitely.

    Returns:
        The page text, decoded as UTF-8 regardless of the charset the
        server declared.

    Raises:
        requests.exceptions.RequestException: on connection failures or
            when the timeout expires.
    """
    response = requests.get(url, timeout=timeout)
    # Force UTF-8 instead of trusting the encoding guessed from the headers.
    response.encoding = 'utf-8'
    return response.text
	
def save_path(path,title):
    """Create the folder ``title`` inside ``path`` and switch into it.

    The two parts are combined with ``os.path.join`` so the result is the
    same whether or not ``path`` ends with a separator (plain string
    concatenation turned ``/novels`` + ``book`` into ``/novelsbook``).
    Missing parent directories are created as well.

    Args:
        path: Base directory chosen by the user; may be empty for "here".
        title: Name of the sub-folder to create (the novel title).

    Side effects:
        Changes the process working directory to the resulting folder, so
        files opened later with relative names land inside it.
    """
    target_dir = os.path.join(path, title)
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(target_dir, exist_ok=True)
    os.chdir(target_dir)

def chap_item_get(url_list):
    """Turn scraped ``(href, title)`` pairs into absolute chapter links.

    Pairs whose href contains ``'book'`` are dropped; every remaining href
    is prefixed with the site root and its title has leading whitespace
    removed.

    Args:
        url_list: Iterable of ``(href, title)`` 2-tuples.

    Returns:
        ``[links, names]`` - two parallel lists holding the absolute URLs
        and the cleaned titles, in input order.
    """
    links = []
    names = []
    for href, label in url_list:
        label = label.lstrip()
        if 'book' in href:
            # Not a chapter link; leave it out of both lists.
            continue
        links.append('http://www.8wenku.com%s' % href)
        names.append(label)
    return [links, names]
	
def chap_download(item_list):
    """Download the story chapters listed in ``item_list`` as text files.

    Args:
        item_list: ``[urls, titles]`` - two parallel lists (the shape
            returned by ``chap_item_get``). Entries are paired by position.

    For each pair the title is printed. If the title contains one of the
    ``wanted`` markers, the page is fetched with ``url2text``, the chapter
    body is extracted and ``<title>.txt`` is written (UTF-8) in the current
    working directory: the title first, then the text with ``<br />`` tags
    removed. Other entries are reported as skipped.

    Raises:
        IndexError: if a fetched page does not contain the expected content
            marker. The page is fetched and parsed before the output file
            is opened, so a failure leaves no empty file and no open handle.
    """
    # Title fragments that identify real story content: '章' (chapter),
    # '过场' (interlude), '幕间' (intermission), '后记' (afterword). The Latin
    # fragments presumably catch variously cased/spelled "Prologue",
    # "Epilogue" and "Chapter" headings - TODO confirm against the site.
    wanted = ('章', 'rolo', 'pilo', 'PILO', 'pillo', '过场', '幕间', '后记', 'hap')

    for chp_url, chp_tit in zip(item_list[0], item_list[1]):
        print(chp_tit)
        if not any(mark in chp_tit for mark in wanted):
            print('%s 跳过！！' %chp_tit)
            print('='*60)
            continue

        # Do the network and parsing work first; only touch the disk once
        # there is something to write.
        chp_text = url2text(chp_url)
        chp_cont = re.findall(r'为你一网打尽！<br><br />(.*?)</div>',chp_text,re.S)[0]
        chp_cont = chp_cont.replace('<br />','')

        with open('%s.txt' %chp_tit,'w',encoding='utf-8') as fb:
            fb.write(chp_tit)
            fb.write(chp_cont)
        print('%s 爬取成功' %chp_tit)
        print('='*60)
		
# Page of the novel to download (hard-coded for this demo script).
url = 'http://www.8wenku.com/book/1498'
# Save location typed in by the user; passed to download_novel() below.
path = input('请输入保存路径：')

def download_novel(path,url):
    """Download the novel whose page is at ``url`` into a folder under ``path``.

    Steps, in order: fetch the page, read the novel title and all
    ``target="_blank"`` links from it, prepare the output folder via
    ``save_path`` (which also switches the working directory), then filter
    the links with ``chap_item_get`` and hand them to ``chap_download``.

    Args:
        path: Base save location supplied by the user.
        url: Address of the novel's page.

    Raises:
        IndexError: if the page has no ``<h2 class="tit">`` title element.
    """
    page = url2text(url)

    # The first <h2 class="tit"> heading carries the title inside 《…》.
    book_title = re.findall(r'<h2 class="tit">《(.*?)》</h2>', page)[0]
    # Every (href, link text) pair on the page; filtered further below.
    links = re.findall(r'<a target="_blank" href="(.*?)">(.*?)</a>', page)

    save_path(path, book_title)

    chapters = chap_item_get(links)
    chap_download(chapters)
		
# Run the download with the module-level path and url defined above.
download_novel(path,url)

入门爬虫（函数封装）-Python

猜你喜欢