前两篇文章都在说在py中用BeautifulSoup爬取本地网页的事情,本来准备去真实网页试一下的,但是老林说不如把你之前学的mysql数据库温习一下,顺道学着把你现在爬到的网页存取到mysql数据库之中~
由此 本文的主题就出现了:
如何在python3中将网页爬虫数据存储到mysql数据库
先小小插播一下:为何标题强调python3!
因为py2与py3连接数据库时用的不是一个库!
PyMySQL 是在 Python3.x 版本中用于连接 MySQL 服务器的一个库,
Python2中则使用MySQLdb。
from bs4 import BeautifulSoup
import pymysql
#本地网页爬取数据
#即上一篇文章所学知识
def getData(html_path='D:/Study/Data Analysis/week1/1_2/1_2answer_of_homework/1_2_homework_required/index.html'):
    """Parse a local homework HTML page and extract one record per listing.

    Args:
        html_path: Path to the local HTML file to parse. Defaults to the
            original hard-coded homework page so existing callers keep working.

    Returns:
        list[dict]: One dict per listing with keys 'address' (the img src),
        'price', 'title', 'amount' (first stripped string of the ratings
        paragraph) and 'stars' (count of filled star icons).
    """
    datalist = []
    # Explicit encoding so reading the page does not depend on the
    # platform default codec.
    with open(html_path, 'r', encoding='utf-8') as wb_data:
        soup = BeautifulSoup(wb_data, 'lxml')
        # CSS selectors target the Bootstrap card layout of the page.
        addresses = soup.select('body > div > div > div.col-md-9 > div > div > div > img')
        prices = soup.select('body > div > div > div.col-md-9 > div > div > div > div.caption > h4.pull-right')
        titles = soup.select('body > div > div > div.col-md-9 > div > div > div > div.caption > h4 > a')
        amounts = soup.select('body > div > div > div.col-md-9 > div > div > div > div.ratings > p.pull-right')
        star_tags = soup.select('body > div > div > div.col-md-9 > div > div > div > div.ratings > p:nth-of-type(2)')
    print("getData--end")
    print('start-print-data')
    # Distinct loop-variable names: the original reused the list names as
    # loop variables, shadowing (and losing) the lists inside the loop.
    for address, price, title, amount, stars in zip(addresses, prices, titles, amounts, star_tags):
        data = {
            'address': address.get('src'),
            'price': price.get_text(),
            'title': title.get_text(),
            'amount': list(amount.stripped_strings)[0],
            # Rating = number of filled glyphicon stars in the paragraph.
            'stars': len(stars.find_all("span", class_='glyphicon glyphicon-star')),
        }
        print(data)
        datalist.append(data)
    print('end-print-data')
    return datalist
#数据库中创建新表用以存储
def mysql_create():
    """(Re)create the `schoolsheet` table in the local `school` database.

    Drops any existing `schoolsheet` table first, so previously stored rows
    are lost. Connection parameters are hard-coded for a local dev server.

    Raises:
        pymysql.MySQLError: If connecting or executing the DDL fails.
    """
    mysql_host = 'localhost'
    mysql_db = 'school'
    mysql_user = 'root'
    mysql_password = '123'
    mysql_port = 3306
    # charset='utf8' so Chinese text round-trips without mojibake.
    db = pymysql.connect(host=mysql_host, port=mysql_port, user=mysql_user,
                         password=mysql_password, db=mysql_db, charset='utf8')
    # NOTE(review): PRIMARY KEY on `price` means two rows with the same
    # price cannot coexist — looks questionable, but kept as-is since the
    # insert code depends on this schema.
    sql_create = """CREATE TABLE schoolsheet(
    price VARCHAR(10),
    title VARCHAR(50),
    amount VARCHAR(265),
    stars VARCHAR(265),
    address VARCHAR(265),
    PRIMARY KEY (`price`),
    UNIQUE KEY `title`(`title`))ENGINE=InnoDB AUTO_INCREMENT=12 DEFAULT CHARSET=utf8"""
    try:
        cursor = db.cursor()
        cursor.execute("DROP TABLE IF EXISTS schoolsheet")
        cursor.execute(sql_create)  # execute the DDL
        db.commit()
        cursor.close()
    finally:
        # Always release the connection, even if the DDL fails — the
        # original leaked the connection on any execute() error.
        db.close()
#存放爬取数据到数据库中
def IntoMysql(datalist):
    """Insert scraped records into the `schoolsheet` table.

    Args:
        datalist: List of dicts as produced by getData(), each with keys
            'address', 'price', 'title', 'amount', 'stars'.

    Raises:
        pymysql.MySQLError: If connecting or inserting fails.
    """
    mysql_host = 'localhost'
    mysql_db = 'school'
    mysql_user = 'root'
    mysql_password = '123'
    mysql_port = 3306
    # charset='utf8' so Chinese text round-trips without mojibake.
    db = pymysql.connect(host=mysql_host, port=mysql_port, user=mysql_user,
                         password=mysql_password, db=mysql_db, charset='utf8')
    print('open connect!')
    try:
        cursor = db.cursor()
        print('start-insert-data')
        # Parameterized placeholders: the original interpolated values with
        # %-formatting, which breaks on quotes in titles and is
        # SQL-injection-prone. The driver now escapes every value.
        sql = ("INSERT INTO schoolsheet(price,title,amount,stars,address)"
               "VALUES (%s,%s,%s,%s,%s)")
        for datarow in datalist:
            cursor.execute(sql, (datarow['price'], datarow['title'],
                                 datarow['amount'], datarow['stars'],
                                 datarow['address']))
        db.commit()  # one commit for the whole batch, as in the original
        cursor.close()
    finally:
        # Always release the connection, even if an insert fails.
        db.close()
# Script entry point: scrape the local page, rebuild the table, store rows.
# Guarded so importing this module no longer triggers scraping and DB writes.
if __name__ == "__main__":
    datalist = getData()
    mysql_create()
    IntoMysql(datalist)
一切领悟都在代码之中,还不太熟的我准备再多敲一敲领会一下,没有太多讲解,没准等我领会完再来更,毕竟不懂就不瞎BB了!!!
萌星一枚,大神轻喷!!!