爬取描述
爬取某电商网站上关键字为 python 的所有图书信息,每本图书只需要抓取名称和所属店铺名称即可。
实现
这里直接上代码:
from selenium import webdriver
import pandas as pd
from urllib.parse import quote
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
import time
# Top-level crawl script: open the JD search result page for the keyword,
# then iterate every result page, scraping book/shop info from each listing.
# NOTE(review): get_page_urls()/get_product_info() are defined further down in
# this file; as a flat script the first call would raise NameError (caught by
# the broad except below) — the function definitions should be moved above
# this loop.
browser = webdriver.Chrome()
key = 'python'
# Build the search URL with the keyword URL-encoded.
url = 'https://search.jd.com/Search?keyword=' + quote(key) + '&enc=utf-8'
browser.get(url)
browser.implicitly_wait(3)  # implicit wait for element lookups
wait = WebDriverWait(browser, 10)
# Wait for the "next page" button so we know the result page has rendered.
wait.until(EC.presence_of_element_located((By.CLASS_NAME, "pn-next")))
i = 2  # index of the next page to visit (page 1 is already loaded)
# Total number of result pages, read from the top pager.
max_page = browser.find_element(By.XPATH, '//*[@id="J_topPage"]/span/i').text
while True:
    if int(max_page) < i:
        # BUGFIX: the original concatenated the int i to a str, raising
        # TypeError; wrap it in str().
        print("The current Page Index:" + str(i) + ", it has overflowed with " + max_page)
        break
    try:
        urls = get_page_urls()
        # Scrape product details in a second tab so the search page keeps
        # its state in the first tab.
        browser.execute_script('window.open()')
        browser.switch_to.window(browser.window_handles[1])
        print("open a book detail window......")
        for k in range(0, 1):
            get_product_info(urls[k])
        browser.execute_script('window.close()')
        time.sleep(1)
        browser.switch_to.window(browser.window_handles[0])
        print("close the window, and swithc the active window 0.....")
        print("ready to move the next page ==> " + str(i))
        browser.refresh()
        browser.implicitly_wait(3)
        wait = WebDriverWait(browser, 10)
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, "pn-next")))
        # Click the "next page" button in the bottom pager.
        browser.find_element(By.XPATH, '//*[@id="J_bottomPage"]/span[1]/a[9]').click()
        time.sleep(3)  # fixed delay: the pager needs time to re-render
        wait = WebDriverWait(browser, 10)
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, "pn-next")))
        # Current page number as highlighted in the bottom pager.
        pageNum = browser.find_element(By.XPATH, '//*[@id="J_bottomPage"]/span[1]/a[@class="curr"]').text
        print("current page num vs current index:" + str(pageNum) + "/" + str(i))
        # Keep clicking "next" until the pager actually shows page i —
        # a single click sometimes does not take effect.
        while str(pageNum) != str(i):
            print("ready to check the element in page...")
            browser.find_element(By.XPATH, '//*[@id="J_bottomPage"]/span[1]/a[9]').click()
            browser.implicitly_wait(3)
            time.sleep(3)
            wait = WebDriverWait(browser, 10)
            wait.until(EC.presence_of_element_located((By.CLASS_NAME, "pn-next")))
            print("click again, current page num vs current index:" + str(pageNum) + "/" + str(i))
            pageNum = browser.find_element(By.XPATH, '//*[@id="J_bottomPage"]/span[1]/a[@class="curr"]').text
        print("loading the current page==>" + str(i))
        i = i + 1
    except Exception as e:
        # Any failure (missing element, stale page, NameError) ends the crawl.
        print(e)
        print("end of the search comment process")
        break
browser.close()
def get_page_urls():
    """Return the detail-page URLs of all product listings on the current search page.

    Waits until every listing anchor is visible, then collects each anchor's
    href. Relies on the module-level ``browser`` WebDriver.
    """
    wait = WebDriverWait(browser, 10)
    # Wait for all listing anchors to be visible before reading hrefs.
    # (The original assigned the result to an unused variable named `input`,
    # shadowing the builtin — dropped.)
    wait.until(EC.visibility_of_all_elements_located(
        (By.XPATH, '//*[@id="J_goodsList"]/ul/li/div/div[1]/a')))
    # Selenium 4 removed find_elements_by_xpath; use find_elements(By.XPATH, ...).
    links = browser.find_elements(By.XPATH, '//*[@id="J_goodsList"]/ul/li/div/div[1]/a')
    print("ready for the current links:" + str(len(links)))
    urls = [l.get_attribute('href') for l in links]
    print("convert the links into urls:" + str(len(urls)))
    return urls
def get_product_info(url):
    """Open a product detail page and print the book name and its shop name.

    The shop name sits in different DOM locations depending on the seller,
    so two XPaths are tried in order; if neither matches, the product is
    treated as sold by JD itself. Relies on the module-level ``browser``.

    :param url: absolute URL of the product detail page
    """
    print("ready to get product info:" + url)
    browser.get(url)
    # Selenium 4 removed find_element_by_class_name / find_element_by_xpath;
    # use find_element(By..., ...) instead.
    product_name = browser.find_element(By.CLASS_NAME, 'sku-name').text
    print('name:' + product_name)
    # Case 1: third-party shop rendered as a link in the breadcrumb area.
    try:
        store_name = browser.find_element(By.XPATH, '//*[@id="crumb-wrap"]/div/div[2]/div[2]/div[1]/div/a')
        print('book name:' + product_name + ', store name:' + store_name.text + "," + store_name.get_attribute('href'))
        return  # replaces the original is_case1_matched flag — same behavior
    except NoSuchElementException:
        print("unable to identify the shop name in case1://*[@id='crumb-wrap']/div/div[2]/div[2]/div[1]/div/a")
    # Case 2: shop name rendered as plain text in an <em> element.
    try:
        store_name = browser.find_element(By.XPATH, '//*[@id="crumb-wrap"]/div/div[2]/div[1]/em')
        print('book name:' + product_name + ', store name:' + store_name.text)
    except NoSuchElementException as e:
        # Neither location matched: the product is sold directly by JD.
        print(e)
        print("the product is JD Owned....")
分析
在开发过程中,碰到了各类不同的问题,所以对于一个简单的爬取应用,才会变得如此复杂。
分析点1:
大量的wait的使用,在程序中发起一个操作很快,但是内容被完全加载出来,在时间上是不确定的,需要等待或者循环检测元素是否已经加载完成。
分析点2:
店铺的信息在页面上是不确定的,有几个位置,所以这里采用了试错模式,在NoSuchElementException的时候,使用下一个模式进行提取,保证信息可以提取到。
分析点3:
固定时间延迟, 在提取"下一页"的按钮信息之时,使用time.sleep(3),效果很好。某种意义上说,一般的点击和加载操作,都是需要进行wait操作的,否则有非常高的概率会出错,发生元素不存在、不可见或者已经过期之类的错误提示。
总结
Selenium的功能非常强大了,有了这个利器,没有什么是不能爬取的。天下再也没有不好爬取的数据了。