可代写python爬虫,收费可协商,用途需提前说明。
下面爬虫爬到的数据有100天左右,100家众筹的完整数据,需要的或者有写爬虫需求的同学可发邮件至[email protected]
获取,暂无收费,合作意向的同学在标题说明合作内容,心理价格。看到后我会很快回复(除了在外旅游的时候),不必着急。
下列代码也请勿用于商业!!!仅供经济管理商学同学交流学习,技术同学相互学习使用!!代码虽简单,爬虫 需谨慎,请勿用分布式大规模爬虫爬取其他网站数据!
高校的同学做了一些相关的学术研究,给她写了一个在服务器上运行的爬虫
自动运行设置起来其实很快也不麻烦,
比如说利用 crontab 或者是Linux自带的其他定时运行设置。
这个自己搜一下即可。
下面放代码。写得比较早了。注意:代码实际使用的已经是 Python 3 语法(print 带括号、urllib.parse 等),可直接在 Python 3 下运行;如果要在 Python 2.7 下运行,反而需要自行调整 print 和 urllib 的用法。
京东公司信息爬取:
# -*- coding: utf-8 -*-__author__ = 'EasouChen'
# 导入以下模块
# selenium用于结合phantomjs
from selenium import webdriver
import traceback
import datetime
import time
from lxml import etree
# 底下这行用于自定义头部文件
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import pymysql
# 多进程池,用于多进程
from multiprocessing import Pool
# 使用该函数将中文转换成url参数
from urllib.parse import quote
# 这三行用于解决mysql报ascii无法decode的问题,意思是将所有字符格式default为'utf-8'
# import sys
# reload(sys)
# sys.setdefaultencoding('utf-8')
# 定义函数,参数为页数
def get_goods(key, page_num):
    """Scrape company information from one page of JD crowdfunding results.

    Opens the listing page, follows every project's detail link and extracts
    the project title, the company's name / introduction / telephone and the
    highest reward-tier price, then inserts one row per new project into the
    `jdCompany` table (MySQL db `jd`).  Projects whose link already exists in
    the table are skipped.

    :param key: URL-quoted search keyword; kept for interface compatibility
                (the JD listing URL below does not actually use it).
    :param page_num: 1-based listing page number.
    :return: None (results go to the database and stdout).
    """
    # Database connection.  NOTE(review): credentials are hard-coded.
    conn = pymysql.connect(host='127.0.0.1', user='root', passwd='root', db='jd', use_unicode=True, charset="utf8")
    # Headless Chrome (replaces the original PhantomJS + custom UA setup).
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(chrome_options=chrome_options)
    # status=2 selects projects that are currently crowdfunding.
    driver.get('https://z.jd.com/bigger/search.html?from=zchome&status=2&page=%s' % (page_num))
    # Scroll to the bottom so lazily-loaded items are rendered, then wait.
    js = "window.scrollTo(0,document.body.scrollHeight);"
    time.sleep(2)
    driver.execute_script(js)
    time.sleep(4)
    # Parse the rendered page with lxml.
    htmls = etree.HTML(driver.page_source)
    # Project entries on the listing page.
    goods_list = htmls.xpath("//div[@class='l-info']//div[@class='l-result']"
                             "/ul[@class='infos clearfix']/li[@class='info type_now']")
    count = 1
    cursor = conn.cursor()
    try:
        for item in goods_list:
            try:
                # Detail-page link of this project.
                link = "https://z.jd.com" + item.xpath("./div[@class='i-tits no-color-choose'or'i-tits ']/a/@href")[0]
                print(link)
                driver.get(link)
                info = etree.HTML(driver.page_source)
                # Project title; '(' / ')' mapped to '<' / '>' per the original
                # storage convention.
                title = info.xpath("//div[@class='project-introduce']/h1[@class='p-title']")[0]
                title = title.xpath("string(.)").replace('(', '<').replace(')', '>')
                print(title)
                # Company introduction.
                intro = info.xpath("//ul[@class='contact-box']/li[@class='clearfix contact-li'][2]/div[@class='val']")[0]
                intro = intro.xpath("string(.)")
                # Contact telephone; tele_type "1" marks a 400 hotline.
                tele = info.xpath("//ul[@class='contact-box']/li[@class='clearfix contact-li'][3]/div[@class='val']")[0]
                tele = tele.xpath("string(.)")
                tele_type = "1" if "400" in tele else "0"
                # Highest reward-tier price ("0" when no price node is found).
                # (The original shadowed the builtin `max` here.)
                prices = [int(n.xpath("string(.)")) for n in info.xpath("//div[@class='t-price ']/span")]
                high_money = str(max(prices)) if prices else "0"
                # Company name; company_type "1" marks a registered company.
                company = info.xpath("//ul[@class='contact-box']/li[@class='clearfix contact-li'][1]/div[@class='val']")[0]
                company = company.xpath("string(.)")
                company_type = "1" if "公司" in company else "0"
                # Number of projects this promoter started / backed.
                start = info.xpath("//div[@class='promoters-num']/div[@class='fl start']/span[@class='num']")[0]
                start = start.xpath("string(.)")
                donate = info.xpath("//div[@class='promoters-num']/div[@class='fl']/span[@class='num']")[0]
                donate = donate.xpath("string(.)")
                print(company)
                print(intro)
                print(tele)
                # Skip projects already stored.  Parameterized queries replace
                # the original string-built SQL (injection / quoting bugs with
                # scraped text containing quotes).
                cursor.execute("select * from jdCompany where link=%s", (link,))
                if not cursor.fetchall():
                    print('开始存储')
                    # `start` is a MySQL reserved word, hence the backticks.
                    cursor.execute(
                        "insert into jdCompany(title,link,company,companyType,intro,tele,teletype,"
                        "high_money,`start`,donate,catchdate) "
                        "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                        (title, link, company, company_type, intro, tele, tele_type, high_money,
                         start, donate, datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
                    conn.commit()
                    print(title, '存储成功')
                else:
                    print("商品已存在")
                print('-------------------------------------------------------')
                count += 1
            except Exception as e:
                # Keep crawling the remaining projects on parse/DB errors.
                print(e)
                print(traceback.format_exc())
    finally:
        # Always release the DB connection and the browser, even on errors.
        cursor.close()
        conn.close()
        driver.quit()
    print('第' + str(page_num) + '页', '共' + str(count) + '条记录')
# Script entry point.
if __name__ == '__main__':
    # Search keyword converted to a URL parameter (empty here; the JD
    # crowdfunding listing URL does not actually use it).
    key = quote('')
    # Process pool; runs 1 worker process (the original comment claimed 4).
    po_li = Pool(1)
    # Submit one scraping task per listing page (pages 1-20).
    for x in range(1, 21):
        print('开始第' + str(x) + '页的进程')
        t = po_li.apply_async(get_goods, (key, x,))
    # Stop accepting tasks and wait for all of them to finish.
    po_li.close()
    po_li.join()
京东商品信息爬取:
# -*- coding: utf-8 -*-__author__ = 'EasouChen'
# 导入以下模块
# selenium用于结合phantomjs
from selenium import webdriver
import time
import traceback
import datetime
from lxml import etree
# 底下这行用于自定义头部文件
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import pymysql
# 多进程池,用于多进程
from multiprocessing import Pool
# 使用该函数将中文转换成url参数
from urllib.parse import quote
# 这三行用于解决mysql报ascii无法decode的问题,意思是将所有字符格式default为'utf-8'
# import sys
# reload(sys)
# sys.setdefaultencoding('utf-8')
# 定义函数,参数为页数
def get_goods(key, page_num, dbname):
    """Scrape product/progress info from one page of JD crowdfunding results.

    For every project on the listing page, collects raised amount, completion
    rate and remaining time from the listing entry, then opens the detail page
    for title, supporter count, deadline, follower count and like count, and
    inserts one row per new project into table `dbname` (MySQL db `jd`).

    :param key: URL-quoted search keyword; kept for interface compatibility
                (the JD listing URL below does not actually use it).
    :param page_num: 1-based listing page number.
    :param dbname: name of the pre-created snapshot table to write into.
    :return: None (results go to the database and stdout).
    """
    # Database connection.  NOTE(review): credentials are hard-coded.
    conn = pymysql.connect(host='127.0.0.1', user='root', passwd='root', db='jd', use_unicode=True, charset="utf8")
    # Headless Chrome (replaces the original PhantomJS + custom UA setup).
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(chrome_options=chrome_options)
    # status=2 selects projects that are currently crowdfunding.
    driver.get('https://z.jd.com/bigger/search.html?from=zchome&status=2&page=%s' % (page_num))
    # Scroll to the bottom so lazily-loaded items are rendered, then wait.
    js = "window.scrollTo(0,document.body.scrollHeight);"
    time.sleep(1)
    driver.execute_script(js)
    time.sleep(1)
    # Parse the rendered page with lxml.
    htmls = etree.HTML(driver.page_source)
    # Project entries on the listing page.
    goods_list = htmls.xpath("//div[@class='l-info']//div[@class='l-result']"
                             "/ul[@class='infos clearfix']/li[@class='info type_now']")
    count = 1
    cursor = conn.cursor()
    try:
        for item in goods_list:
            try:
                # Detail-page link of this project.
                link = "https://z.jd.com" + item.xpath("./div[@class='i-tits no-color-choose'or'i-tits ']/a/@href")[0]
                # Listing-entry fields: raised amount, completion rate, time
                # left.  '(' / ')' mapped to '<' / '>' per the original
                # storage convention.
                price = item.xpath("./div[@class='p-outter']/div[@class='p-items']"
                                   "/ul[@class='p-i-infos clearfix']/li[@class='fore2']/p[@class='p-percent']")[0]
                price = price.xpath("string(.)").replace('(', '<').replace(')', '>')
                rate = item.xpath("./div[@class='p-outter']/div[@class='p-items']"
                                  "/ul[@class='p-i-infos clearfix']/li[@class='fore1']/p[@class='p-percent']")[0]
                rate = rate.xpath("string(.)").replace('(', '<').replace(')', '>')
                left_time = item.xpath("./div[@class='p-outter']/div[@class='p-items']"
                                       "/ul[@class='p-i-infos clearfix']/li[@class='fore3']/p[@class='p-percent']")[0]
                left_time = left_time.xpath("string(.)")
                left_time = left_time.replace('\r', '').replace('\n', '').replace('\t', '')
                left_time = left_time.replace('(', '<').replace(')', '>')
                driver.get(link)
                info = etree.HTML(driver.page_source)
                # Project title.
                title = info.xpath("//div[@class='project-introduce']/h1[@class='p-title']")[0]
                title = title.xpath("string(.)").replace('(', '<').replace(')', '>')
                print(title)
                # Supporter count.
                support = info.xpath("//div[@class='project-introduce']//p[@class='p-progress']/span[@class='fr']")[0]
                support = support.xpath("string(.)").replace('(', '<').replace(')', '>')
                # Deadline.  NOTE(review): scraped but, as in the original,
                # never stored in the table.
                deadline = info.xpath("//div[@class='project-introduce']//p[@class='p-target']/span[@class='f_red']")[0]
                deadline = deadline.xpath("string(.)")
                deadline = deadline.replace('\r', '').replace('\n', '').replace('\t', '').replace(' ', '')
                deadline = deadline.replace('(', '<').replace(')', '>')
                # Follower ("关注") count.
                attention = info.xpath("//div[@class='project-introduce']//p[@class='p-btns']/"
                                       "a[@id='a_focus']/span[@class='num']")[0]
                attention = attention.xpath("string(.)").replace('(', '<').replace(')', '>')
                # Like ("点赞") count.
                prais = info.xpath("//div[@class='project-introduce']//p[@class='p-btns']/"
                                   "a[@id='a_prais']/span[@class='num']")[0]
                prais = prais.xpath("string(.)").replace('(', '<').replace(')', '>')
                # Fixed status: the listing was filtered to active projects.
                status = "众筹中"
                # Skip projects already stored.  Values are parameterized to
                # avoid the original string-built SQL (injection / quoting
                # bugs); only the internally-generated table name is formatted.
                cursor.execute("select * from %s where link=%%s" % dbname, (link,))
                if not cursor.fetchall():
                    cursor.execute(
                        "insert into %s(title,price,rate,link,left_time,attention,support,status,prais,catchdate) "
                        "values(%%s,%%s,%%s,%%s,%%s,%%s,%%s,%%s,%%s,%%s)" % dbname,
                        (title, price, rate, link, left_time, attention, support, status, prais,
                         datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
                    conn.commit()
                else:
                    print("商品已存在")
                count += 1
            except Exception as e:
                # Keep crawling the remaining projects on parse/DB errors.
                print(e)
                print(traceback.format_exc())
    finally:
        # Always release the DB connection and the browser, even on errors.
        cursor.close()
        conn.close()
        driver.quit()
# Script entry point.
if __name__ == '__main__':
    # Create one snapshot table per run, named with the current date-hour
    # (e.g. jdzhongchou2019040912), so each crawl is kept separately.
    conn = pymysql.connect(host='127.0.0.1', user='root', passwd='root', db='jd', use_unicode=True, charset="utf8")
    dbname = "jdzhongchou" + datetime.datetime.now().strftime('%Y%m%d%H');
    print(dbname)
    create_database = "create table %s(id INT NOT NULL AUTO_INCREMENT PRIMARY KEY," \
                      "title VARCHAR(100) NOT NULL ,price VARCHAR(20) NOT NULL,rate VARCHAR(10) ,link VARCHAR(300) NOT NULL," \
                      "left_time VARCHAR(10) NOT NULL,attention VARCHAR(30),support VARCHAR(30) NOT NULL," \
                      "status VARCHAR(10) NOT NULL,prais VARCHAR(10) NOT NULL,catchdate VARCHAR(20) NOT NULL)engine=InnoDB default " \
                      "charset=utf8;" % dbname
    conn.query(create_database)
    conn.commit()
    conn.close()
    # Search keyword converted to a URL parameter (unused by the JD URL).
    key = quote('')
    # Process pool; runs 2 worker processes (the original comment claimed 4).
    po_li = Pool(2)
    # Submit one scraping task per listing page (pages 1-20).
    for x in range(1, 21):
        print('开始第' + str(x) + '页的进程')
        t = po_li.apply_async(get_goods, (key, x,dbname,))
    # Stop accepting tasks and wait for all of them to finish.
    po_li.close()
    po_li.join()
苏宁公司信息:
# -*- coding: utf-8 -*-__author__ = 'EasouChen'
# 导入以下模块
# selenium用于结合phantomjs
from selenium import webdriver
import traceback
import datetime
import time
from lxml import etree
# 底下这行用于自定义头部文件
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import pymysql
# 多进程池,用于多进程
from multiprocessing import Pool
# 使用该函数将中文转换成url参数
from urllib.parse import quote
# 这三行用于解决mysql报ascii无法decode的问题,意思是将所有字符格式default为'utf-8'
# import sys
# reload(sys)
# sys.setdefaultencoding('utf-8')
# 定义函数,参数为页数
def get_goods(key, page_num):
    """Scrape company information from one page of Suning crowdfunding results.

    Opens the listing page (category t=02), follows every project's detail
    link and extracts the project title, company name / introduction /
    telephone and the highest reward price, then inserts one row per new
    project into the `snCompany` table (MySQL db `sn`).  Projects whose link
    already exists in the table are skipped.

    :param key: URL-quoted search keyword; kept for interface compatibility
                (the listing URL below does not actually use it).
    :param page_num: 1-based listing page number.
    :return: None (results go to the database and stdout).
    """
    # Database connection.  NOTE(review): credentials are hard-coded.
    conn = pymysql.connect(host='127.0.0.1', user='root', passwd='root', db='sn', use_unicode=True, charset="utf8")
    # Headless Chrome (replaces the original PhantomJS + custom UA setup).
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(chrome_options=chrome_options)
    driver.get('https://zc.suning.com/project/browseList.htm?c=&t=02&s=&keyWords=&pageNumber=%s' % (page_num))
    # Scroll to the bottom so lazily-loaded items are rendered, then wait.
    js = "window.scrollTo(0,document.body.scrollHeight);"
    time.sleep(2)
    driver.execute_script(js)
    time.sleep(4)
    # Parse the rendered page with lxml.
    htmls = etree.HTML(driver.page_source)
    # Project entries on the listing page.
    goods_list = htmls.xpath("//div //*[@class='item-list'] /ul/li")
    count = 1
    cursor = conn.cursor()
    try:
        for item in goods_list:
            try:
                # Project title; '(' / ')' mapped to '<' / '>' per the
                # original storage convention.
                title1 = item.xpath("./div[@class='item-info']/p/a")[0]
                title = title1.xpath("string(.)")
                print(title)
                title = title.replace('(', '<').replace(')', '>')
                # Detail-page link of this project.
                link = "https://zc.suning.com/" + item.xpath("./div[@class='item-info']/p/a/@href")[0]
                print(link)
                driver.get(link)
                info = etree.HTML(driver.page_source)
                # Company introduction (2nd <p> of the organizer box).
                intro = info.xpath("//div[@class='item-organizer box']/p[2]")[0]
                intro = intro.xpath("string(.)")
                # Contact telephone; tele_type "1" marks a 400 hotline.
                tele = info.xpath("//div[@class='item-organizer box']/p[3]")[0]
                tele = tele.xpath("string(.)")
                tele_type = "1" if "400" in tele else "0"
                # Highest reward price (last price node on the detail page).
                high_money = info.xpath("//span/strong[@class='price']")[-1]
                high_money = high_money.xpath("string(.)")
                # Company name: prefer the @title attribute, fall back to text.
                # (The original had a discarded `company.encode('utf-8')`
                # no-op here; removed.)
                if not info.xpath("//div[@class='item-organizer box']/p[1]/@title"):
                    company = info.xpath("//div[@class='item-organizer box']/p[1]/text()")[0]
                else:
                    company = info.xpath("//div[@class='item-organizer box']/p[1]/@title")[0]
                # company_type "1" marks a registered company.
                company_type = "1" if "公司" in company else "0"
                print(company)
                print(intro)
                print(tele)
                # Skip projects already stored.  Parameterized queries replace
                # the original string-built SQL (injection / quoting bugs with
                # scraped text containing quotes).
                cursor.execute("select * from snCompany where link=%s", (link,))
                if not cursor.fetchall():
                    print('开始存储')
                    cursor.execute(
                        "insert into snCompany(title,link,company,companyType,intro,tele,teletype,high_money,catchdate) "
                        "values(%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                        (title, link, company, company_type, intro, tele, tele_type, high_money,
                         datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
                    conn.commit()
                    print(title, '存储成功')
                else:
                    print("商品已存在")
                print('-------------------------------------------------------')
                count += 1
            except Exception as e:
                # Keep crawling the remaining projects on parse/DB errors.
                print(e)
                print(traceback.format_exc())
    finally:
        # Always release the DB connection and the browser, even on errors.
        cursor.close()
        conn.close()
        driver.quit()
    print('第' + str(page_num) + '页', '共' + str(count) + '条记录')
# Script entry point.
if __name__ == '__main__':
    # BUGFIX: the original connected to db 'jd' and created a timestamped
    # table (snCompany<YYYYMMDDHH>), but get_goods() above reads and writes
    # the fixed table `snCompany` in db 'sn', so inserts could never land in
    # the table created here.  Create exactly the table the scraper uses;
    # IF NOT EXISTS makes reruns safe.
    conn = pymysql.connect(host='127.0.0.1', user='root', passwd='root', db='sn', use_unicode=True, charset="utf8")
    create_table = "create table if not exists snCompany(id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,title VARCHAR(100) NOT NULL ," \
                   "link VARCHAR(300) NOT NULL,company VARCHAR(40) NOT NULL,companyType INT NOT NULL,intro VARCHAR(100) NOT NULL," \
                   "tele VARCHAR(20) ,teletype INT ,high_money VARCHAR(10) NOT NULL,catchdate VARCHAR(20) NOT NULL)" \
                   "engine=InnoDB default charset=utf8;"
    conn.query(create_table)
    conn.commit()
    conn.close()
    # Search keyword converted to a URL parameter (unused by the Suning URL).
    key = quote('')
    # Process pool; runs 1 worker process.
    po_li = Pool(1)
    # Submit one scraping task per listing page (pages 1-6).
    for x in range(1, 7):
        print('开始第' + str(x) + '页的进程')
        t = po_li.apply_async(get_goods, (key, x,))
    # Stop accepting tasks and wait for all of them to finish.
    po_li.close()
    po_li.join()
苏宁商品信息:
# -*- coding: utf-8 -*-__author__ = 'EasouChen'
# 导入以下模块
# selenium用于结合phantomjs
from selenium import webdriver
import traceback
import datetime
import time
from lxml import etree
# 底下这行用于自定义头部文件
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import pymysql
# 多进程池,用于多进程
from multiprocessing import Pool
# 使用该函数将中文转换成url参数
from urllib.parse import quote
# 这三行用于解决mysql报ascii无法decode的问题,意思是将所有字符格式default为'utf-8'
# import sys
# reload(sys)
# sys.setdefaultencoding('utf-8')
# 定义函数,参数为页数
def get_goods(key, page_num, dbname):
    """Scrape product/progress info from one page of Suning crowdfunding results.

    Collects title, raised amount, completion rate, link, remaining time,
    follower / supporter counts and status straight from the listing entries
    (no detail-page visit needed) and inserts one row per new project into
    table `dbname` (MySQL db `sn`).

    :param key: URL-quoted search keyword; kept for interface compatibility
                (the listing URL below does not actually use it).
    :param page_num: 1-based listing page number.
    :param dbname: name of the pre-created snapshot table to write into.
    :return: None (results go to the database and stdout).
    """
    # Database connection.  NOTE(review): credentials are hard-coded.
    conn = pymysql.connect(host='127.0.0.1', user='root', passwd='root', db='sn', use_unicode=True, charset="utf8")
    # Headless Chrome (replaces the original PhantomJS + custom UA setup).
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(chrome_options=chrome_options)
    driver.get('https://zc.suning.com/project/browseList.htm?c=&t=02&s=&keyWords=&pageNumber=%s' % (page_num))
    # Scroll to the bottom so lazily-loaded items are rendered, then wait.
    js = "window.scrollTo(0,document.body.scrollHeight);"
    time.sleep(2)
    driver.execute_script(js)
    time.sleep(4)
    # Parse the rendered page with lxml.
    htmls = etree.HTML(driver.page_source)
    # Project entries on the listing page.
    goods_list = htmls.xpath("//div //*[@class='item-list'] /ul/li")
    count = 1
    cursor = conn.cursor()
    try:
        for item in goods_list:
            try:
                # '(' / ')' are mapped to '<' / '>' in every stored field,
                # per the original storage convention.
                title1 = item.xpath("./div[@class='item-info']/p/a")[0]
                title = title1.xpath("string(.)")
                print(title)
                title = title.replace('(', '<').replace(')', '>')
                # Raised amount.
                price = item.xpath("./div[@class='item-info']/div[@class='item-num'][2]/span[2]/strong")[0]
                price = price.xpath("string(.)").replace('(', '<').replace(')', '>')
                # Completion rate.
                rate = item.xpath("./div[@class='item-info']/div[@class='item-num']"
                                  "/span[@class='fr item-finish']/strong")[0]
                rate = rate.xpath("string(.)").replace('(', '<').replace(')', '>')
                # Detail-page link of this project.
                link = "https://zc.suning.com/" + item.xpath("./div[@class='item-info']/p/a/@href")[0]
                # Remaining time.
                left_time = item.xpath("./div[@class='item-info']/div[@class='item-num']/span[@class='fr']/b")[0]
                left_time = left_time.xpath("string(.)")
                left_time = left_time.replace('\r', '').replace('\n', '').replace('\t', '')
                left_time = left_time.replace('(', '<').replace(')', '>')
                # Follower count.
                attention = item.xpath("./div[@class='item-info']/div[@class='item-num']/span[2]/b")[0]
                attention = attention.xpath("string(.)").replace('(', '<').replace(')', '>')
                # Supporter count.
                support = item.xpath("./div[@class='item-info']/div[@class='item-num']/span[@class='ml30']/b")[0]
                support = support.xpath("string(.)").replace('(', '<').replace(')', '>')
                # Current status label.
                status = item.xpath("./div[@class='item-info']/div[@class='item-status']")[0]
                status = status.xpath("string(.)")
                status = status.replace('\r', '').replace('\n', '').replace('\t', '')
                status = status.replace('(', '<').replace(')', '>')
                print('\n商品' + str(count) + ':')
                print(price)
                print(rate)
                print(link)
                print(left_time)
                print(attention)
                print(support)
                print(status)
                # Skip projects already stored.  Values are parameterized to
                # avoid the original string-built SQL (injection / quoting
                # bugs); only the internally-generated table name is formatted.
                cursor.execute("select * from %s where link=%%s" % dbname, (link,))
                if not cursor.fetchall():
                    print('开始存储')
                    cursor.execute(
                        "insert into %s(title,price,rate,link,left_time,attention,support,status,catchdate) "
                        "values(%%s,%%s,%%s,%%s,%%s,%%s,%%s,%%s,%%s)" % dbname,
                        (title, price, rate, link, left_time, attention, support, status,
                         datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
                    conn.commit()
                    print(title, '存储成功')
                else:
                    print("商品已存在")
                print('-------------------------------------------------------')
                count += 1
            except Exception as e:
                # Keep crawling the remaining projects on parse/DB errors.
                print(e)
                print(traceback.format_exc())
    finally:
        # Always release the DB connection and the browser, even on errors.
        cursor.close()
        conn.close()
        driver.quit()
    print('第' + str(page_num) + '页', '共' + str(count) + '条记录')
# Script entry point.
if __name__ == '__main__':
    # Create one snapshot table per run, named with the current date-hour
    # (e.g. snzhongchou2019040912), then fan the page scrapes out to workers.
    conn = pymysql.connect(host='127.0.0.1', user='root', passwd='root', db='sn', use_unicode=True, charset="utf8")
    dbname = "snzhongchou" + datetime.datetime.now().strftime('%Y%m%d%H');
    print(dbname)
    create_database = "create table %s(id INT NOT NULL AUTO_INCREMENT PRIMARY KEY," \
                      "title VARCHAR(100) NOT NULL ,price VARCHAR(20) NOT NULL,rate VARCHAR(10) ,link VARCHAR(300) NOT NULL," \
                      "left_time VARCHAR(10) NOT NULL,attention VARCHAR(30),support VARCHAR(30) NOT NULL," \
                      "status VARCHAR(10) NOT NULL,catchdate VARCHAR(20) NOT NULL)engine=InnoDB default charset=utf8;" % dbname
    conn.query(create_database)
    conn.commit()
    conn.close()
    # Search keyword converted to a URL parameter (unused by the Suning URL).
    key = quote('')
    # Process pool; runs 2 worker processes (the original comment claimed 4).
    po_li = Pool(2)
    # Submit one scraping task per listing page (pages 1-6).
    for x in range(1, 7):
        print('开始第' + str(x) + '页的进程')
        t = po_li.apply_async(get_goods, (key, x,dbname,))
    # Stop accepting tasks and wait for all of them to finish.
    po_li.close()
    po_li.join()
还有一些整理成excel格式的工具文档:
txtCalCulateSet.py
import xlrd # 写入文件
import xlutils.copy
import os
import re
def txt2excel(path, title, i):
    # Parse one crawled project log (`path/title`) and write its final
    # snapshot into row `i` of an existing .xls workbook.
    # NOTE(review): the workbook path is hard-coded, and the function reopens
    # and re-saves the whole workbook on every call — slow, but it preserves
    # rows written by earlier calls.  Indentation below is reconstructed from
    # a whitespace-mangled paste — confirm against the original script.
    fopen = open(path + '/' + title, 'r', encoding='utf-8')
    lines = fopen.readlines()
    # Open the existing workbook and obtain a writable copy via xlutils.
    file = xlrd.open_workbook("D:\\database\\dealEndOK20190409\\123xx10.xls")
    ws = xlutils.copy.copy(file)
    sheet = ws.get_sheet(0)
    # (Earlier filtering experiments, kept commented out below.)
    ############################
    # print(lines[1])
    # if " 1% "|" 2% "|" 3% "|" 4% "|" 5% "|" 6% "|" 7% "|" 8% "|" 9% "|" 10% " in lines[1]:
    # if (lines[1].find(" 0%")>=0 or lines[1].find(" 1%")>=0 or lines[1].find(" 2%")>=0 or lines[1].find(" 3%")>=0) or lines[1].find(" 4%")>=0 or lines[1].find(" 5%")>=0 or lines[1].find(" 6%")>=0 or lines[1].find(" 7%")>=0 or lines[1].find(" 8%")>=0 or lines[1].find(" 9%")>=0 or lines[1].find(" 10%")>=0 and len(lines)>120:#:
    # print(lines[1].find(" 0%")>=0)
    # print(lines[1].find(" 1%")>=0)
    # print(lines[1].find(" 2%")>=0)
    # print(lines[1].find(" 3%")>=0)
    # print(lines[1].find(" 4%")>=0)
    # print(lines[1].find(" 5%")>=0)
    # print(lines[1].find(" 6%")>=0)
    # print(lines[1].find(" 7%")>=0)
    # print(lines[1].find(" 8%")>=0)
    # print(lines[1].find(" 9%")>=0)
    # print(lines[1].find(" 10%")>=0)
    # print(lines[1].find(" 11%")>=0)
    count=0
    # The 3rd-to-last line of the log holds the final progress snapshot.
    # Normalize tabs / currency signs / odd space characters to '|'.
    deal = lines[len(lines) - 3].replace("\t", "|")
    deal=deal.replace("¥"," ")
    deal = deal.replace("¥", " ")
    deal = deal.replace(" ", " ")
    deal = deal.replace(" ", " ")
    deal = deal.replace(" ", " ")
    deal = deal.replace(" ", " ")
    deal = deal.replace(" ", " ")
    deal = deal.replace(" ", "|")
    dealList = deal.split("|")
    print(deal)
    decend0=0;
    decend1=1220;
    count=0
    # Column 0: the project name (second '|'-separated field).
    sheet.write(i, 0, dealList[1])
    num=0
    for line in dealList:
        count=count+1;
        # Among fields 2..6, remember the last purely-numeric one
        # (presumably the raised amount — TODO confirm).
        if line.isdigit() and count>1 and count<=6:
            num = int(line)
            print(line)
        # The field containing '%' is the completion percentage.
        if line.find("%") >= 0:
            finish = line.split("%")
            # print(lines[len(lines) - 3])
            #print("finish" + finish[0])
            isfinished = int(finish[0])
            decend0 = isfinished
            # Column 4: amount / completion ratio (0 when either is 0).
            if num==0 or isfinished==0:
                sheet.write(i, 4, str(0))
            else:
                sheet.write(i, 4, str(num/isfinished))
            # Column 5: mark "xx" when the project is near its deadline
            # (hours or only a few days left) yet more than 90% complete and
            # the log is long enough (>120 lines) — flag for suspected
            # manipulation.
            if len(lines) > 10 and (lines[len(lines) - 3].find("小时") >= 0 or lines[len(lines) - 3].find("剩余1天") >= 0
                    or lines[len(lines) - 3].find("剩余2天") >= 0 or lines[len(lines) - 3].find("剩余3天") >= 0 or lines[
                    len(lines) - 3].find("剩余2天") >= 0 or lines[len(lines) - 3].find(" 0天") >= 0 or lines[
                    len(lines) - 3].find(
                    " 3天") >= 0 or lines[len(lines) - 3].find(
                    " 4天") >= 0 or lines[len(lines) - 3].find(
                    " 1天") >= 0 or lines[len(lines) - 3].find(
                    " 2天") >= 0) and (isfinished > 90)and len(lines) > 120:
                sheet.write(i, 5, "xx")  # suspected manipulation
            '''
            '''
            # Column 1: 0 when completion >= 120%, 1 otherwise.
            if isfinished>=120:
                sheet.write(i, 1, 0)# suspected manipulation
            if isfinished<120 :
                sheet.write(i, 1, 1)
            #print("yes!!----i:" + str(i))
            # NOTE(review): decend1 starts at 1220, so this branch looks like
            # leftover experiment code that rarely fires — confirm.
            if decend0 > decend1:
                print(str(decend0)+":::"+str(decend1))
                count=count+1
                sheet.write(i, 1+count, count);
            decend1 = isfinished
    ws.save('D:/database/dealEndOK20190409/123xx10.xls')
    '''
    for line in dealList:
    if line.find("%")>=0:
    finish = line.split("%")
    print(lines[len(lines) - 3])
    print("finish"+finish[0])
    isfinished=int(finish[0]);
    if (lines[len(lines) - 3].find("小时") >= 0 or lines[len(lines) - 3].find("剩余1天") >= 0
    or lines[len(lines) - 3].find("剩余2天") >= 0 or lines[len(lines) - 3].find("剩余3天") >= 0 or lines[
    len(lines) - 3].find("剩余2天") >= 0 or lines[len(lines) - 3].find(" 0天") >= 0 or lines[len(lines) - 3].find(
    " 3天") >= 0 or lines[len(lines) - 3].find(
    " 4天") >= 0 or lines[len(lines) - 3].find(
    " 1天") >= 0 or lines[len(lines) - 3].find(
    " 2天") >= 0) and (isfinished>90) and len(lines)>120:
    # if lines[len(lines)-3].find("小时")>=0 or lines[len(lines)-3].find("剩余1天")>=0 or lines[len(lines)-3].find("剩余2天")>=0 or lines[len(lines)-3].find(" 0天")>=0 or lines[len(lines)-3].find(" 1天")>=0 or lines[len(lines)-3].find(" 2天")>=0 :#or lines[len(lines)-3].find(" 3天")>=0 or lines[len(lines)-3].find(" 4天")>=0 or lines[len(lines)-3].find(" 5天")>=0:
    print(lines[len(lines) - 3]);
    #print(lines[len(lines)-3].find("小时") >= 0)
    #print(lines[len(lines)-3].find("0天") >= 0)
    for line in lines:
    deal = line.replace("\t", "|")
    deal = deal.replace(" ", "|")
    deal = deal.replace(" ", "|")
    deal = deal.replace(" ", "|")
    deal = deal.replace(" ", "|")
    dealList = deal.split("|")
    j = 0
    for item in dealList:
    sheet.write(i, j, item)
    j = j + 1;
    i = i + 1
    sheet.write(j, 1, isfinished)
    file.save('D:/database/dealEndOK100/123xx.xls')
    '''
    #################################
    '''
    #第二层执行代码,写入b.txt,
    j=1 #从20001行写入
    fopen2=open("D:\database\deal\故宫白玉小金猪,诸事顺猪年旺.txt",'r',encoding='utf-8')
    lines2=fopen2.readlines()
    for line in lines2:
    sheet.write(j,0,line)
    j=j+1
    '''
def printPath(level, path):
    # Walk one directory: collect its sub-folders and files, then feed every
    # file under `path` to txt2excel(), one workbook row per file.
    # Indentation is reconstructed from a whitespace-mangled paste — confirm
    # against the original script.
    global allFileNum
    i = 0
    '''''
    Print all folders and files under one directory.
    '''
    # Sub-folders; the first element records this directory's level.
    dirList = []
    # Plain files in this directory.
    fileList = []
    # Names of the entries in `path`.
    files = os.listdir(path)
    # Record the directory level first.
    dirList.append(str(level))
    for f in files:
        if (os.path.isdir(path + '/' + f)):
            # Skip hidden folders (there are too many of them).
            if (f[0] == '.'):
                pass
            else:
                # Keep non-hidden folders.
                dirList.append(f)
        if (os.path.isfile(path + '/' + f)):
            # Keep plain files.
            fileList.append(f)
    # Flag so the first entry (the level marker) is not printed.
    i_dl = 0
    for dl in dirList:
        if (i_dl == 0):
            i_dl = i_dl + 1
        # else:
        # Print the non-first directories to the console.
        # print('-' * (int(dirList[0])), dl)
        # Recursion into sub-directories is disabled in this variant:
        # printPath((int(dirList[0]) + 1), path + '/' + dl)
    for fl in fileList:
        # print('-' * (int(dirList[0])), fl)
        # allFileNum = allFileNum + 1
        f = open(path + '/' + fl, 'r', encoding='utf-8')
        # Convert this file into workbook row `i`.
        # NOTE(review): txt2excel reopens the file itself; `f` here is only
        # opened and closed — looks like leftover code.
        txt2excel(path, fl, i)
        i = i + 1
        '''
        fileName = re.sub('[\/:*?"<>|]', '-', title)  # 去掉非法字符
        w = open('D:/database/deal/'+fileName+'.txt', 'a+',encoding='utf-8')
        lines = f.readlines()
        for line in lines:
        if title in line:
        w.write(line);
        break;
        '''
        f.close();
# Script entry point: convert every crawl log under D:/database/deal.
if __name__ == '__main__':
    printPath(1, 'D:/database/deal')
转公司名称:
#!/usr/bin/python
# -*- coding:utf8 -*-
import os
import re
allFileNum = 0
def printPath(level, path, title):
    # Recursively walk `path`; for every file found, append the lines that
    # mention `title` to D:/database/company/<title>.txt.
    # Indentation is reconstructed from a whitespace-mangled paste — confirm
    # against the original script.
    global allFileNum
    '''''
    Print all folders and files under one directory.
    '''
    # Sub-folders; the first element records this directory's level.
    dirList = []
    # Plain files in this directory.
    fileList = []
    # Names of the entries in `path`.
    files = os.listdir(path)
    # Record the directory level first.
    dirList.append(str(level))
    for f in files:
        if (os.path.isdir(path + '/' + f)):
            # Skip hidden folders (there are too many of them).
            if (f[0] == '.'):
                pass
            else:
                # Keep non-hidden folders.
                dirList.append(f)
        if (os.path.isfile(path + '/' + f)):
            # Keep plain files.
            fileList.append(f)
    # Flag so the first entry (the level marker) is not recursed into.
    i_dl = 0
    for dl in dirList:
        if (i_dl == 0):
            i_dl = i_dl + 1
        else:
            # Recurse into each sub-directory, one level deeper.
            #print('-' * (int(dirList[0])), dl)
            printPath((int(dirList[0]) + 1), path + '/' + dl,title)
    for fl in fileList:
        #print('-' * (int(dirList[0])), fl)
        # Count every file visited.
        allFileNum = allFileNum + 1
        f = open(path+'/'+fl, 'r',encoding='utf-8')
        # Strip characters that are illegal in Windows filenames.
        fileName = re.sub('[\/:*?"<>|]', '-', title)
        w = open('D:/database/company/'+fileName+'.txt', 'a+',encoding='utf-8')
        lines = f.readlines()
        # Copy only the first line mentioning `title`, then stop.
        for line in lines:
            if title in line:
                w.write(line);
                break;
        w.close();
        f.close();
# Script entry point.
if __name__ == '__main__':
    # f = open("D:\database\send\公司信息", 'r', encoding='utf-8')
    # NOTE(review): these .xls files are opened as UTF-8 text, which only
    # works if they are actually text dumps with an .xls extension — confirm.
    titles = ['jd400.xls','sn400.xls'];
    for title in titles:
        f = open('D:/database/company' + '/' + title, 'r', encoding='utf-8')
        # Strip characters that are illegal in Windows filenames.
        fileName = re.sub('[\/:*?"<>|]', '-', title)
        # Copy the seed file's content into the output txt first.
        w = open('D:/database/company/' + fileName + '.txt', 'a+', encoding='utf-8')
        lines = f.readlines()
        for line in lines:
            w.write(line);
        w.close();
        f.close();
        print(title)
        # Then append matching lines found anywhere under the dump directory.
        printPath(1, 'D:/database/send/公司信息', title.strip())
    print('总文件数 =', allFileNum)
    # f.close()
转excel。
# coding=utf-8
'''
main function: write each record line of a crawled text dump into an
Excel (.xls) sheet — column 0 gets the first field, the following columns
get every purely-numeric field of the line.
'''
import xlwt  # writes .xls workbooks
import xlrd  # opens .xls workbooks (imported by the original; unused below)

# Source text file: one record per line, fields separated by tabs / wide
# spaces / runs of spaces.
fopen = open("D:/database/company/snSelect85.txt", 'r', encoding='utf-8')
lines = fopen.readlines()
# New workbook with a single sheet named 'data'.
file = xlwt.Workbook(encoding='utf-8', style_compression=0)
sheet = file.add_sheet('data')

i = 0
for line in lines:
    # Normalize every separator variant to '|', then split.
    deal = line.replace("\t", "|")
    deal = deal.replace(" ", "|")
    deal = deal.replace(" ", "|")
    deal = deal.replace(" ", "|")
    deal = deal.replace(" ", "|")
    dealList = deal.split("|")
    print(dealList)
    # Column 0: the first field (name/identifier).
    sheet.write(i, 0, dealList[0])
    # Numeric fields (after dropping newline and a trailing ".00") fill the
    # following columns in order.
    j = 0
    for item in dealList:
        item = item.replace("\n", "")
        item = item.replace(".00", "")
        if item.isdigit():
            j = j + 1
            sheet.write(i, j, item)
            print("yes")
    i = i + 1
# (Removed from the original: an unused `str = ""` that shadowed the builtin,
# a no-op `len(line)`, an unused `p = 0`, and a dead commented-out block.)
file.save('D:/database/company/snSelect85.xls')