暑假在家帮家里写了点小代码,分享一下,同时也备份一下。
拼多多好像是反爬比较聪明,老是爬不到,还是我太菜了。
淘宝
# -*- coding: utf-8 -*-
import requests
import re
import pandas as pd
import time
import xlwt
import os
# Paste the cookie string from your own logged-in Taobao session here.
# NOTE(fix): .strip() must be called on the *returned* value, not on the
# prompt literal, so whitespace around the pasted cookie is removed.
cookie = input('请输入想查询的商品的cookie:').strip()
# Fetch page content.
def getHTMLText(url):
    """Fetch *url* with the user's Taobao cookies and return the page HTML.

    The module-level ``cookie`` string ("name=value; name=value; ...") is
    split into a dict that requests can send.  Returns '' when the HTTP
    request fails.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'}
    user_cookies = cookie
    cookies = {}
    # The raw Cookie header is one big string; split each "name=value" pair
    # (only on the first '=', values may themselves contain '=').
    for pair in user_cookies.split(';'):
        name, value = pair.strip().split('=', 1)
        cookies[name] = value
    try:
        r = requests.get(url, cookies=cookies, headers=headers, timeout=60)
        print(r.status_code)
        print(r.cookies)
        return r.text
    except requests.RequestException:  # narrowed from bare except: only network/HTTP errors
        print('获取页面信息失败')
        return ''
# Parse the page and pull out the product data.
def parsePage(html):
    """Extract product rows from the raw HTML of a Taobao result page.

    The page embeds its data as JSON-ish text, so each field is pulled out
    with a regex.  Returns a list of
    [title, price, location, comment_count, sales, shop_name] rows.

    FIXES vs. original: no longer shadows the builtin ``list``; on a field
    mismatch it returns [] instead of falling through to an implicit None
    (which crashed save_to_file); the bare ``except`` is narrowed to the
    IndexError that a mismatch actually raises.
    """
    rows = []
    try:
        views_title = re.findall('"raw_title":"(.*?)","pic_url"', html)
        # The per-field counts must all agree; if one product is missing a
        # field, indexing below raises IndexError and the page is skipped.
        print(len(views_title))
        print(views_title)
        views_price = re.findall('"view_price":"(.*?)","view_fee"', html)
        print(len(views_price))
        print(views_price)
        item_loc = re.findall('"item_loc":"(.*?)","view_sales"', html)
        print(len(item_loc))
        print(item_loc)
        views_sales = re.findall('"view_sales":"(.*?)","comment_count"', html)
        print(len(views_sales))
        print(views_sales)
        comment_count = re.findall('"comment_count":"(.*?)","user_id"', html)
        print(len(comment_count))
        print(comment_count)
        shop_name = re.findall('"nick":"(.*?)","shopcard"', html)
        print(len(shop_name))
        for i in range(len(views_price)):
            rows.append([views_title[i], views_price[i], item_loc[i],
                         comment_count[i], views_sales[i], shop_name[i]])
        print('爬取数据成功')
        return rows
    except IndexError:
        print('有数据信息不全,如某一页面中某一商品缺少地区信息')
        return []
# Append the scraped rows to a CSV file for later analysis.
def save_to_file(rows):
    """Append *rows* (list of field lists from parsePage) to the desktop CSV.

    Opened in append mode so successive pages accumulate in one file.
    (Parameter renamed from ``list`` to stop shadowing the builtin; the
    in-file caller passes it positionally, so the rename is safe.)
    """
    data = pd.DataFrame(rows)
    data.to_csv('C:\\Users\\Administrator\\Desktop\\商品数据.csv', header=False, mode='a+')  # 用追加写入的方式
# Convert the CSV into an .xls workbook.
def txt_xls(filename, xlsname):
    """Copy the scraped CSV file *filename* into an .xls workbook *xlsname*.

    BUG FIX: pandas.to_csv writes comma-separated lines, but the original
    split each line on ';', which left the entire line in a single cell.
    Using csv.reader also parses quoted fields (titles containing commas)
    correctly, and ``with`` guarantees the file handle is closed even when
    xlwt raises.
    """
    import csv  # local import: keeps the module's top-level imports untouched
    xls = xlwt.Workbook()
    sheet = xls.add_sheet('sheet1', cell_overwrite_ok=True)
    with open(filename, 'r', encoding='utf-8', newline='') as f:
        for row_idx, row in enumerate(csv.reader(f)):
            for col_idx, item in enumerate(row):
                sheet.write(row_idx, col_idx, item)
    xls.save(xlsname)
def main():
    """Scrape several Taobao result pages for one product, appending to CSV."""
    # Write the column-name row once before any data rows.
    name = [['views_title', 'views_price', 'item_loc', 'comment_count', 'views_sales', 'shop_name']]
    data_name = pd.DataFrame(name)
    data_name.to_csv('C:\\Users\\Administrator\\Desktop\\商品数据.csv', header=False, mode='a+')  # 提前保存一行列名称
    goods = input('请输入想查询的商品名称:').strip()  # fix: strip the answer, not the prompt literal
    print('注意需要输入和上面cookie所对应的商品名称')
    depth = 5  # number of result pages to scrape
    start_url = 'http://s.taobao.com/search?q=' + goods  # base search URL
    for i in range(depth):
        time.sleep(3 + i)  # back off a little more each page to avoid the anti-bot
        try:
            page = i + 1
            print('桐:正在爬取第%s页数据' % page)
            # BUG FIX: the original concatenated the extra query parameters
            # without a separating '&', producing "q=<goods>imgfile=..." so
            # the sort/offset parameters were fused into the query term.
            url = (start_url
                   + '&imgfile=&js=1&stats_click=search_radio_all%3A1'
                     '&initiative_id=staobaoz_20200408&ie=utf8&sort=sale-desc'
                     '&bcoffset=0&p4ppushleft=%2C44&s=' + str(44 * i))
            html = getHTMLText(url)
            rows = parsePage(html)  # renamed from `list`: don't shadow the builtin
            save_to_file(rows)
        except Exception as e:  # keep going if one page fails, but say why
            print('数据没保存成功')
            print(e)
if __name__ == '__main__':
    main()
    filename = "C:\\Users\\Administrator\\Desktop\\商品数据.csv"
    xlsname = "C:\\Users\\Administrator\\Desktop\\商品数据.xls"
    # Robustness: main() may have failed before writing anything; only
    # convert and delete the CSV when it actually exists.
    if os.path.exists(filename):
        txt_xls(filename, xlsname)
        os.remove(filename)  # keep only the .xls copy
京东
import time

from lxml import etree
from openpyxl import Workbook
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Output workbook: one header row, then one row per scraped product.
wb = Workbook()
sheet = wb.active
sheet['A1'] = 'name'
sheet['B1'] = 'price'
sheet['C1'] = 'commit'
sheet['D1'] = 'shop'
sheet['E1'] = 'sku'
sheet['F1'] = 'icons'
sheet['G1'] = 'detail_url'

driver_path = r"C:\Users\Administrator\Desktop\chromedriver.exe"
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--ignore-ssl-errors')
options.add_experimental_option("excludeSwitches", ['enable-automation', 'enable-logging'])
# 不加载图片 — skip image downloads for speed.
options.add_experimental_option('prefs', {'profile.managed_default_content_settings.images': 2})
# BUG FIX: the original called webdriver.Chrome() twice, leaking a browser
# instance that was launched *before* the image preference above was set.
# Configure all options first, then start exactly one driver.
driver = webdriver.Chrome(executable_path=driver_path, options=options)
wait = WebDriverWait(driver, 60)  # explicit-wait timeout (seconds)
def search(keyword):
    """Open jd.com's search box, submit *keyword*, return the total page count.

    Retries once more on a wait timeout.  Returns an int parsed from the
    pager widget at the bottom of the result page.
    """
    try:
        # Renamed from `input`: the original shadowed the builtin input().
        search_box = wait.until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#key"))
        )  # wait for the search box to load
        submit = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "#search > div > div.form > button"))
        )  # wait for the search button to become clickable
        search_box[0].send_keys(keyword)
        submit.click()
        wait.until(
            EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > em:nth-child(1) > b')
            )
        )
        total_page = driver.find_element(By.XPATH, '//*[@id="J_bottomPage"]/span[2]/em[1]/b').text
        return int(total_page)
    except TimeoutException:
        # BUG FIX: selenium waits raise TimeoutException, not the builtin
        # TimeoutError the original caught; and the retry's result must be
        # *returned*, otherwise the caller silently receives None.
        return search(keyword)
def get_data(html):
    """Parse one JD search-result page and append each product row to *sheet*."""
    selec_data = etree.HTML(html)
    lis = selec_data.xpath('//ul[@class="gl-warp clearfix"]/li')
    for li in lis:
        try:
            title = li.xpath('.//div[@class="p-name p-name-type-2"]//em/text()')[0].strip()  # product name
            price = li.xpath('.//div[@class="p-price"]//i/text()')[0].strip()  # price
            comment = li.xpath('.//div[@class="p-commit"]//a/text()')  # comment count
            shop_name = li.xpath('.//div[@class="p-shop"]//a/text()')  # shop name
            sku_nodes = li.xpath('.//div[@class="p-focus "]/a/@data-sku')
            data_sku = sku_nodes[0] if sku_nodes else None  # unique product id
            icons = li.xpath('.//div[@class="p-icons"]/i/text()')  # promo badges
            comment = comment[0] if comment else ''
            shop_name = shop_name[0] if shop_name else ''
            icons_n = ','.join(icons)  # same value as the old manual ','-prefixed loop after [1:]
            detail_url = 'https:' + li.xpath('.//div[@class="p-name p-name-type-2"]/a/@href')[0]  # detail page
            item = [title, price, comment, shop_name, data_sku, icons_n, detail_url]
            print(item)
            sheet.append(item)
        except IndexError:
            # BUG FIX: a product missing a required field raises IndexError on
            # the [0] lookups above.  The original caught TimeoutError (which
            # lxml never raises) and re-parsed the whole page recursively —
            # a potential infinite loop.  Skip the bad item and keep going.
            continue
def main():
    """Drive the browser through JD search pages and save results to .xlsx."""
    url_main = 'https://www.jd.com/'
    keyword = input('请输入商品名称:')  # search keyword
    driver.get(url=url_main)
    page = search(keyword)
    # Robustness: search() may fail to produce a page count; without this
    # guard, range(3, page * 2, 2) raises TypeError on None.
    if not page:
        print('未能获取总页数')
        return
    j = 1
    # JD's URL page parameter counts in half-pages: on-screen page j uses
    # page=2*j+1 here, with an s= item offset per on-screen page.
    for i in range(3, page * 2, 2):
        if j == 1:
            url = 'https://search.jd.com/Search?keyword={}&page={}&s={}&click=0'.format(keyword, i, j)
        else:
            url = 'https://search.jd.com/Search?keyword={}&page={}&s={}&click=0'.format(keyword, i, (j - 1) * 50)
        driver.get(url)
        time.sleep(1)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")  # scroll to bottom to trigger lazy loading
        time.sleep(3)
        driver.implicitly_wait(20)
        wait.until(
            EC.presence_of_all_elements_located((By.XPATH, '//*[@id="J_goodsList"]/ul/li[last()]'))
        )
        html = driver.page_source
        get_data(html)
        time.sleep(1)
        print(f'正在爬取第{j}页')
        j += 1
        if j == 20:  # hard cap: stop after 19 pages
            break
    wb.save('京东{}信息.xlsx'.format(keyword))
    print('桐:>爬取成功啦<')


if __name__ == '__main__':
    main()