爬取描述
爬取某电商网站上关键字为 python 的所有图书信息,每本图书只需要抓取名称和所属店铺名称即可。
实现
这里直接上代码:
from selenium import webdriver
import pandas as pd
from urllib.parse import quote
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
import time
# Top-level crawl script: open the JD search result page for the keyword,
# then iterate every result page, scraping book/shop info from each listing.
# NOTE(review): get_page_urls()/get_product_info() are defined further down in
# this file; as a flat script the first call would raise NameError (caught by
# the broad except below) — the function definitions should be moved above
# this loop.
browser = webdriver.Chrome()
key = 'python'
# Build the search URL with the keyword URL-encoded.
url = 'https://search.jd.com/Search?keyword=' + quote(key) + '&enc=utf-8'
browser.get(url)
browser.implicitly_wait(3)  # implicit wait for element lookups
wait = WebDriverWait(browser, 10)
# Wait for the "next page" button so we know the result page has rendered.
wait.until(EC.presence_of_element_located((By.CLASS_NAME, "pn-next")))
i = 2  # index of the next page to visit (page 1 is already loaded)
# Total number of result pages, read from the top pager.
max_page = browser.find_element(By.XPATH, '//*[@id="J_topPage"]/span/i').text
while True:
    if int(max_page) < i:
        # BUGFIX: the original concatenated the int i to a str, raising
        # TypeError; wrap it in str().
        print("The current Page Index:" + str(i) + ", it has overflowed with " + max_page)
        break
    try:
        urls = get_page_urls()
        # Scrape product details in a second tab so the search page keeps
        # its state in the first tab.
        browser.execute_script('window.open()')
        browser.switch_to.window(browser.window_handles[1])
        print("open a book detail window......")
        for k in range(0, 1):
            get_product_info(urls[k])
        browser.execute_script('window.close()')
        time.sleep(1)
        browser.switch_to.window(browser.window_handles[0])
        print("close the window, and swithc the active window 0.....")
        print("ready to move the next page ==> " + str(i))
        browser.refresh()
        browser.implicitly_wait(3)
        wait = WebDriverWait(browser, 10)
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, "pn-next")))
        # Click the "next page" button in the bottom pager.
        browser.find_element(By.XPATH, '//*[@id="J_bottomPage"]/span[1]/a[9]').click()
        time.sleep(3)  # fixed delay: the pager needs time to re-render
        wait = WebDriverWait(browser, 10)
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, "pn-next")))
        # Current page number as highlighted in the bottom pager.
        pageNum = browser.find_element(By.XPATH, '//*[@id="J_bottomPage"]/span[1]/a[@class="curr"]').text
        print("current page num vs current index:" + str(pageNum) + "/" + str(i))
        # Keep clicking "next" until the pager actually shows page i —
        # a single click sometimes does not take effect.
        while str(pageNum) != str(i):
            print("ready to check the element in page...")
            browser.find_element(By.XPATH, '//*[@id="J_bottomPage"]/span[1]/a[9]').click()
            browser.implicitly_wait(3)
            time.sleep(3)
            wait = WebDriverWait(browser, 10)
            wait.until(EC.presence_of_element_located((By.CLASS_NAME, "pn-next")))
            print("click again, current page num vs current index:" + str(pageNum) + "/" + str(i))
            pageNum = browser.find_element(By.XPATH, '//*[@id="J_bottomPage"]/span[1]/a[@class="curr"]').text
        print("loading the current page==>" + str(i))
        i = i + 1
    except Exception as e:
        # Any failure (missing element, stale page, NameError) ends the crawl.
        print(e)
        print("end of the search comment process")
        break
browser.close()
def get_page_urls():
    """Return the detail-page URLs of all product listings on the current search page.

    Waits until every listing anchor is visible, then collects each anchor's
    href. Relies on the module-level ``browser`` WebDriver.
    """
    wait = WebDriverWait(browser, 10)
    # Wait for all listing anchors to be visible before reading hrefs.
    # (The original assigned the result to an unused variable named `input`,
    # shadowing the builtin — dropped.)
    wait.until(EC.visibility_of_all_elements_located(
        (By.XPATH, '//*[@id="J_goodsList"]/ul/li/div/div[1]/a')))
    # Selenium 4 removed find_elements_by_xpath; use find_elements(By.XPATH, ...).
    links = browser.find_elements(By.XPATH, '//*[@id="J_goodsList"]/ul/li/div/div[1]/a')
    print("ready for the current links:" + str(len(links)))
    urls = [l.get_attribute('href') for l in links]
    print("convert the links into urls:" + str(len(urls)))
    return urls
def get_product_info(url):
    """Open a product detail page and print the book name and its shop name.

    The shop name sits in different DOM locations depending on the seller,
    so two XPaths are tried in order; if neither matches, the product is
    treated as sold by JD itself. Relies on the module-level ``browser``.

    :param url: absolute URL of the product detail page
    """
    print("ready to get product info:" + url)
    browser.get(url)
    # Selenium 4 removed find_element_by_class_name / find_element_by_xpath;
    # use find_element(By..., ...) instead.
    product_name = browser.find_element(By.CLASS_NAME, 'sku-name').text
    print('name:' + product_name)
    # Case 1: third-party shop rendered as a link in the breadcrumb area.
    try:
        store_name = browser.find_element(By.XPATH, '//*[@id="crumb-wrap"]/div/div[2]/div[2]/div[1]/div/a')
        print('book name:' + product_name + ', store name:' + store_name.text + "," + store_name.get_attribute('href'))
        return  # replaces the original is_case1_matched flag — same behavior
    except NoSuchElementException:
        print("unable to identify the shop name in case1://*[@id='crumb-wrap']/div/div[2]/div[2]/div[1]/div/a")
    # Case 2: shop name rendered as plain text in an <em> element.
    try:
        store_name = browser.find_element(By.XPATH, '//*[@id="crumb-wrap"]/div/div[2]/div[1]/em')
        print('book name:' + product_name + ', store name:' + store_name.text)
    except NoSuchElementException as e:
        # Neither location matched: the product is sold directly by JD.
        print(e)
        print("the product is JD Owned....")
分析
在开发过程中,碰到了各类不同的问题,所以对于一个简单的爬取应用,才会变得如此复杂。
分析点1:
大量的wait的使用,在程序中发起一个操作很快,但是内容被完全加载出来,在时间上是不确定的,需要等待或者循环检测元素是否已经加载完成。
分析点2:
店铺的信息在页面上是不确定的,有几个位置,所以这里采用了试错模式,在NoSuchElementException的时候,使用下一个模式进行提取,保证信息可以提取到。
分析点3:
固定时间延迟, 在提取"下一页"的按钮信息之时,使用time.sleep(3),效果很好。某种意义上说,一般的点击和加载操作,都是需要进行wait操作的,否则有非常高的概率会出错,发生元素不存在、不可见或者已经过期之类的错误提示。
总结
Selenium的功能非常强大了,有了这个利器,没有什么是不能爬取的。天下再也没有不好爬取的数据了。