原来写过一篇scrapy的介绍,说了下scrapy的环境如何配置,该篇博客地址是:win10 python安装及环境配置、scrapy框架安装及PyCharm集成
本篇将通过一个实际的例子来记录scrapy的使用。
大家都对三国很熟,下面我们从三国在线(http://www.e3ol.com/biography-index.html)获取三国人物数据。完整代码如下:所抓取的网址返回的是JSON格式的数据,代码会解析该JSON数据,按JSON的键创建数据表,并保存人物信息。
import scrapy
import json
import pymysql
import re
from sgyyScrapy.items import SgyyscrapyItem
# Browser-style User-Agent string used for outgoing HTTP requests.
_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'

# Default request headers; the only entry is the User-Agent above.
headers = {
    'User-Agent': _USER_AGENT,
}
class sgyyScrapy(scrapy.Spider):
    """Scrape character records from e3ol.com and store them in MySQL.

    The spider requests the site's ``inc_ajax.asp`` endpoint once for every
    ``a2`` / ``pageno`` combination, turns the loosely formatted JSON reply
    (object keys are not quoted) into strict JSON, and writes every entry of
    the reply's ``soul`` list as one row of the ``sgyy_person`` table.  The
    table is dropped and re-created from the keys of the first record that
    is processed successfully.
    """

    name = "sgyyScrapy"
    # Scrapy reads ``allowed_domains`` (the previous spelling was silently
    # ignored) and expects bare domain names rather than URLs.
    allowed_domains = ["e3ol.com"]
    start_urls = []
    # Becomes True once ``sgyy_person`` has been (re)created by this spider.
    isCreateTable = False

    # Number of ``a2`` values and of pages per ``a2`` value that are requested.
    # According to the original author's note, ``a2`` selects characters by
    # the faction they served -- NOTE(review): confirm against the site.
    _A2_COUNT = 14
    _PAGES_PER_A2 = 50
    _URL_TEMPLATE = (
        'http://www.e3ol.com/biography/inc_ajax.asp'
        '?types=index&a2=%s&pageno=%s'
    )

    # Column names are interpolated into SQL text, so only plain word
    # characters are accepted.
    _COLUMN_NAME = re.compile(r'\w+')

    # Matches, in priority order: a double-quoted string, a single-quoted
    # string (both with backslash escapes), or a bare ``key:`` (group 1).
    _STRING_OR_BARE_KEY = re.compile(
        r'"(?:\\.|[^"\\])*"'
        r"|'(?:\\.|[^'\\])*'"
        r'|(\w+):',
        re.DOTALL,
    )

    def start_requests(self):
        """Yield one request per ``a2`` / ``pageno`` combination.

        Requests are produced lazily and in the same order as before
        (``a2`` outer, ``pageno`` inner).  The class-level ``start_urls``
        list is no longer mutated, so repeated calls or several spider
        instances cannot accumulate duplicate URLs.
        """
        for a2 in range(1, self._A2_COUNT + 1):
            for page in range(1, self._PAGES_PER_A2 + 1):
                yield scrapy.Request(
                    self._URL_TEMPLATE % (a2, page),
                    headers=headers,
                    callback=self.parse,
                )

    def parse(self, response):
        """Store every record of one response page in ``sgyy_person``.

        Args:
            response: the Scrapy text response for one endpoint page.

        Raises:
            json.JSONDecodeError: if the body cannot be turned into JSON.
            KeyError: if the decoded object has no ``soul`` entry.
        """
        # Parse first so that a malformed page never opens a DB connection.
        records = self._extract_records(response.text)
        if records:
            # NOTE(review): credentials are hard-coded; consider moving them
            # into the Scrapy settings.
            db = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                                 password="zhl", database="sgyy",
                                 charset='utf8')
            try:
                cursor = db.cursor()
                try:
                    for record in records:
                        self._store_record(db, cursor, record)
                finally:
                    cursor.close()
            finally:
                # Always release the connection, even if a record fails in
                # an unexpected way.
                db.close()
        self.logger.debug("结束")

    def _extract_records(self, body_text):
        """Return the ``soul`` list decoded from a raw response body."""
        # The first and last characters of the body wrap the object and are
        # dropped.  All spaces are removed as the original code did.
        # NOTE(review): this also removes spaces inside values.
        loose_json = body_text[1:-1].replace(" ", "")
        # The endpoint leaves object keys unquoted and may use single
        # quotes; normalise both so that ``json.loads`` accepts the text.
        # NOTE(review): the blanket quote replacement also rewrites
        # apostrophes inside double-quoted values.
        strict_json = self.quote_keys_for_json(loose_json).replace("'", '"')
        self.logger.debug("%s", strict_json)
        # ``json.loads`` decodes ``\uXXXX`` escapes itself.  A separate
        # ``unicode_escape`` round trip would corrupt real non-ASCII text
        # and unescape quotes before the JSON parser sees them.
        return json.loads(strict_json)['soul']

    def _store_record(self, db, cursor, record):
        """Insert one record, creating the table first when necessary."""
        columns = list(record)
        if not columns:
            return
        invalid = [c for c in columns if not self._COLUMN_NAME.fullmatch(c)]
        if invalid:
            self.logger.warning(
                "skipping record with unsupported column name(s): %r", invalid)
            return
        quoted_columns = ["`%s`" % column for column in columns]

        if not self.isCreateTable:
            create_sql = "create table sgyy_person({})".format(
                ", ".join(column + " varchar(1000)" for column in quoted_columns))
            self.logger.debug("%s", create_sql)
            try:
                cursor.execute("DROP TABLE IF EXISTS sgyy_person")
                cursor.execute(create_sql)
            except pymysql.MySQLError:
                # DDL statements commit implicitly in MySQL, so the rollback
                # only discards pending row changes.
                self.logger.exception("发生错误,回滚事务")
                db.rollback()
                return
            # Only remember the table as created once that really succeeded.
            self.isCreateTable = True

        insert_sql = "insert into sgyy_person({}) values ({})".format(
            ", ".join(quoted_columns),
            ", ".join(["%s"] * len(columns)),
        )
        # Values are stored as text, exactly as the former ``'%s'``
        # interpolation did, but are now escaped by the driver.
        values = [str(record[column]) for column in columns]
        self.logger.debug("%s %r", insert_sql, values)
        try:
            cursor.execute(insert_sql, values)
            db.commit()
        except pymysql.MySQLError:
            self.logger.exception("发生错误,回滚事务")
            db.rollback()

    def quote_keys_for_json(self, json_str):
        """Wrap every unquoted object key of *json_str* in double quotes.

        Quoted string literals (with backslash escapes) are copied through
        untouched, so text such as ``"http://..."`` inside a value is never
        mistaken for a key.  For lenient parsing of arbitrary non-strict
        JSON, a dedicated parser such as
        https://github.com/dmeranda/demjson is an option, at the cost of
        being slower than the standard library.

        Args:
            json_str: JSON-like text whose keys may lack quotes.

        Returns:
            The text with every bare ``key:`` rewritten as ``"key":``.
        """
        def quote_bare_key(match):
            key = match.group(1)
            if key is None:
                # A string literal was matched: keep it unchanged.
                return match.group(0)
            return '"%s":' % key

        return self._STRING_OR_BARE_KEY.sub(quote_bare_key, json_str)