一、需求分析:
抓取百度手机助手中的软件应用信息,导出到 Excel 并写入 MySQL。抓取字段包括:
1. app_name:应用名称
2. app_pic:应用logo
3. app_score:应用评分
4. app_topic:应用主题
5. app_type:应用分类
6. app_download_num:应用下载量
7. app_size:应用大小
8. app_version:应用版本
9. app_xiaobian:应用小编寄语
10. app_jieshao:应用介绍
11. create_time:抓取时间
抓取网站:http://shouji.baidu.com/software/
二、效果展示:
三、建表语句
-- Target table of the crawler: one row per scraped app detail page.
-- Every scraped attribute is stored as text (varchar/mediumtext); note that
-- app_score is a varchar even though the script computes it as a number.
CREATE TABLE `t_baidu_info` (
-- Surrogate primary key, assigned by MySQL.
`id` int(11) NOT NULL AUTO_INCREMENT,
-- The only scraped column declared NOT NULL.
`app_name` varchar(64) NOT NULL COMMENT '应用名称',
`app_pic` mediumtext COMMENT '应用logo',
`app_score` varchar(64) DEFAULT NULL COMMENT '应用评分',
`app_topic` varchar(64) DEFAULT NULL COMMENT '应用主题',
`app_type` varchar(64) DEFAULT NULL COMMENT '应用分类',
`app_download_num` varchar(64) DEFAULT NULL COMMENT '应用下载量',
`app_size` varchar(64) DEFAULT NULL COMMENT '应用大小',
`app_version` varchar(64) DEFAULT NULL COMMENT '应用版本',
`app_xiaobian` mediumtext COMMENT '小编介绍评语',
`app_jieshao` mediumtext COMMENT '应用介绍',
-- Timestamp taken by the script when the page was scraped.
`create_time` datetime DEFAULT NULL COMMENT '创建时间',
PRIMARY KEY (`id`),
-- Secondary indexes for lookups by app name and by category.
KEY `Index 2` (`app_name`),
KEY `Index 3` (`app_type`)
-- NOTE(review): MySQL's `utf8` is the 3-byte variant; consider utf8mb4 (table
-- and connection together) if descriptions can contain 4-byte characters.
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8 COMMENT='百度手机助手爬虫表';
四、Python 爬虫代码(Python 2)
# encoding: utf-8
# Python 2 script: it relies on reload(), sys.setdefaultencoding() and the
# print statement, none of which exist in Python 3.
# True division for "/": the score computation further down divides an int
# percentage by 100 and needs a fractional result.
from __future__ import division
import time
import sys
# Python 2's site module removes sys.setdefaultencoding at startup;
# reloading sys makes it available again.
reload(sys)
# Start of the wall-clock timer reported at the very end of the script.
time1=time.time()
# Make implicit str <-> unicode conversions use UTF-8 instead of ASCII so the
# str() calls on scraped Chinese text do not raise UnicodeEncodeError.
sys.setdefaultencoding('utf-8')
import requests
import re
from lxml import etree
import pandas as pd
# Column buffers for the scraped data: one list per output field.  The i-th
# element of every list is meant to describe the same app; the lists are later
# combined into a single DataFrame.
(app_name, app_pic, app_score, app_topic, app_type,
 app_download_num, app_size, app_version, app_xiaobian,
 app_jieshao, create_time) = ([] for _unused in range(11))
# Entry point of the crawl: every category listing lives below this URL.
base_url = "http://shouji.baidu.com/software/"

# Category ids to crawl.  Only one is enabled; the full set is kept for
# reference:
# category_num = [501, 502, 503, 504, 505, 506, 507, 508, 509, 510]
category_num = [501]

# Listing pages to crawl per category.  Full set:
# page_num = [1, 2, 3, 4, 5, 6, 7, 8]
page_num = [1]

# URL of every (category, page) listing, in crawl order.
categoryPageURL_list = []
for category_id in category_num:
    for page_no in page_num:
        listing_url = base_url + str(category_id) + '/list_' + str(page_no) + '.html'
        print(listing_url)
        categoryPageURL_list.append(listing_url)
# Collect the detail-page URL of every app linked from the category listings.
appDetailPageURL_list = []

# Compiled once because the pattern is identical for every listing page.
# re.S lets "." match newlines as well, so a link tag may span several lines.
_APP_LINK_PATTERN = re.compile('<a class="app-box" href="(.*?)" target="_blank">', re.S)

for url_1 in categoryPageURL_list:
    try:
        # requests waits forever by default; the timeout keeps one stalled
        # connection from hanging the whole crawl.
        content = requests.get(url_1, timeout=30).content
    except requests.RequestException as ex:
        # A listing page that cannot be fetched is skipped so the remaining
        # pages are still crawled (same best-effort policy as the detail loop).
        print("%s : %s" % (url_1, ex))
        continue
    for result in _APP_LINK_PATTERN.findall(content):
        # The captured href is appended to the site root unchanged.
        appDetailPageURL = 'http://shouji.baidu.com/' + result
        print(appDetailPageURL)
        appDetailPageURL_list.append(appDetailPageURL)
################### Scrape every app detail page #################
# Imported once here instead of on every loop iteration.
import datetime


def _first_or_none(matches):
    """Return the first element of *matches*, or None when it is empty."""
    return matches[0] if matches else None


def _without_label(text, label):
    """Return *text* converted to str with *label* removed; None stays None."""
    if text is None:
        return None
    return str(text).replace(label, '')


# Each page is parsed completely into local variables first and only then
# appended to the column lists.  A page that fails half-way therefore adds
# nothing, and every page adds exactly one value per column, so the lists
# always stay the same length and can be turned into a DataFrame.
for url_2 in appDetailPageURL_list:
    try:
        # The timeout keeps one stalled connection from hanging the crawl.
        page_html = requests.get(url_2, timeout=30).content
        page_tree = etree.HTML(page_html)

        # Scrape timestamp.
        scraped_at = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        # App name (required): the first <span> inside the app-name heading.
        # A missing heading/span raises IndexError and the page is skipped.
        name_block = re.findall('<h1 class="app-name">(.*?)</h1>', page_html, re.S)
        name = re.findall('<span>(.*?)</span>', str(name_block[0]), re.S)[0]

        # App logo (required): src of the first <img> tag on the page.
        logo = re.findall('<img src="(.*?)".*?/>', page_html, re.S)[0]

        # App score (required): the star bar's width percentage mapped onto a
        # 0-5 scale.  float() also accepts fractional widths such as "87.5%".
        star_width = re.findall('<span class="star-xbig"><span class="star-percent" style="width:(.*?)"></span></span>', page_html, re.S)[0]
        score = float(str(star_width).replace("%", '')) / 100 * 5

        # Category and topic: 5th and 3rd breadcrumb links.  None if absent.
        category = _first_or_none(page_tree.xpath('//*[@id="doc"]/div[1]/div/span[5]/a/text()'))
        topic = _first_or_none(page_tree.xpath('//*[@id="doc"]/div[1]/div/span[3]/a/text()'))

        # Download count, version and size: "label: value" spans; only the
        # value is kept.  None if the span is absent.
        downloads = _without_label(
            _first_or_none(page_tree.xpath('//*[@id="doc"]/div[2]/div/div[1]/div/div[2]/div[2]/span[3]/text()')),
            "下载次数: ")
        version = _without_label(
            _first_or_none(page_tree.xpath('//*[@id="doc"]/div[2]/div/div[1]/div/div[2]/div[2]/span[2]/text()')),
            '版本: ')
        size = _without_label(
            _first_or_none(page_tree.xpath('//*[@id="doc"]/div[2]/div/div[1]/div/div[2]/div[2]/span[1]/text()')),
            '大小: ')

        # Editor's note: placeholder text when the page has none.
        editor_note = _first_or_none(page_tree.xpath('//*[@id="doc"]/div[2]/div/div[2]/div[1]/div[1]/span[2]/text()'))
        if editor_note is None:
            editor_note = '无小编评语'

        # App introduction: placeholder text when the page has none.
        intro = _first_or_none(re.findall('<p class="content content_hover">(.*?)<span class="occupied"></span></p>', page_html, re.S))
        if intro is None:
            intro = "无应用介绍"

        # Progress output.  Kept inside the try block so that a console
        # encoding error skips the page instead of aborting the crawl.
        for value in (scraped_at, name, logo, score, category, topic,
                      downloads, version, size, editor_note, intro):
            if value is not None:
                print(value)
    except Exception as ex:
        # Per-page boundary: report the failing URL and continue with the
        # next page.  Nothing has been appended for this page.
        print("%s : %s: %s" % (url_2, type(ex).__name__, ex))
    else:
        # Commit the complete row: exactly one value per column.
        create_time.append(scraped_at)
        app_name.append(name)
        app_pic.append(logo)
        app_score.append(score)
        app_type.append(category)
        app_topic.append(topic)
        app_download_num.append(downloads)
        app_version.append(version)
        app_size.append(size)
        app_xiaobian.append(editor_note)
        app_jieshao.append(intro)
# Combine the column buffers into one table.
_column_data = {
    "app_name": app_name,
    "app_pic": app_pic,
    "app_score": app_score,
    "app_topic": app_topic,
    "app_type": app_type,
    "app_download_num": app_download_num,
    "app_size": app_size,
    "app_version": app_version,
    "app_xiaobian": app_xiaobian,
    "app_jieshao": app_jieshao,
    "create_time": create_time,
}

# Alphabetical column order, pinned explicitly: the MySQL export below
# addresses columns by position, so the order must not depend on how the
# installed pandas version orders dict keys.
_column_order = sorted(_column_data)

# Sanity output: all buffers must have the same length, otherwise the
# DataFrame constructor raises ValueError.
print(" ".join("%s=%d" % (column, len(_column_data[column])) for column in _column_order))

data = pd.DataFrame(_column_data, columns=_column_order)
print(data)

############### Write to Excel ##############
# The backslash before "百" was unescaped in the original literal; "\\" yields
# the same path without relying on an invalid escape sequence.
# NOTE(review): the `encoding` keyword is only accepted by older pandas
# releases; drop it when upgrading pandas.
data.to_excel(u"C:\\Users\\Administrator\\Desktop\\风控模型--赖德发\\百度手机助手爬虫\\t_baidu_info.xlsx", header=True, encoding='gbk', index=False)
############################ Connect to the database ##########################
import pymysql

# Columns in exactly the order the INSERT statement lists them.  Values are
# looked up by column name, so the DataFrame's column order does not matter.
_INSERT_COLUMNS = (
    'app_name', 'app_pic', 'app_score', 'app_topic', 'app_type',
    'app_download_num', 'app_size', 'app_version', 'app_xiaobian',
    'app_jieshao', 'create_time',
)
# Columns whose values are converted with str() before binding.
_STRINGIFIED_COLUMNS = frozenset(('app_name', 'app_pic', 'app_score'))

# The charset argument prevents garbled Chinese text.
# NOTE(review): credentials are hard-coded; consider reading them from the
# environment or a config file.
dbconn = pymysql.connect(
    host="127.0.0.1",
    database="cgjr",
    user="root",
    password="12345",
    port=3306,
    charset='utf8',
)

############################## Write to MySQL #################################
try:
    with dbconn.cursor() as cursor:
        # Parameterized insert: one row per scraped app.
        sql = 'INSERT INTO t_baidu_info (app_name, app_pic, app_score, app_topic, app_type,app_download_num,app_size,app_version,app_xiaobian,app_jieshao,create_time) VALUES (%s, %s, %s, %s, %s,%s,%s,%s,%s,%s,%s)'
        for i, (_row_label, record) in enumerate(data.iterrows()):
            print("正在插入数据:" + str(i))
            params = tuple(
                str(record[column]) if column in _STRINGIFIED_COLUMNS else record[column]
                for column in _INSERT_COLUMNS
            )
            cursor.execute(sql, params)
    # Autocommit is not enabled, so the inserts must be committed explicitly.
    dbconn.commit()
except pymysql.Error as e:
    # args is normally (errno, message), but that is not guaranteed for every
    # driver error, so fall back to the plain exception text.
    if len(e.args) >= 2:
        print("Error %s: %s" % (e.args[0], e.args[1]))
    else:
        print("Error: %s" % (e,))
    sys.exit(1)
finally:
    # Always release the connection; uncommitted inserts are discarded.
    dbconn.close()

# Only reached on success: the error path exits above.
print('数据已插入,插入数据库成功!')
time2 = time.time()
print(u'总共耗时:' + str(time2 - time1) + 's')