Python 爬虫案例:乌托家家具公司数据爬取

  这个案例主要是在乌托家网站上爬取家具公司的数据(只保存地址中包含“广东”的公司),用的方法是 requests 模块发送请求、lxml 的 XPath 语法解析页面。代码如下:

 1 # Author:K
 2 import requests
 3 from lxml import etree
 4 import os
 5 
 6 HEADERS = {
 7     'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
 8 }
 9 
def _save_company(file_path, brand_name, addr, phone):
    """Append one company record to *file_path* unless it is already stored.

    A record counts as stored when *brand_name* occurs anywhere in the
    current file content (plain substring check).

    Returns:
        True if a new record was written, False if it was a duplicate.

    Raises:
        OSError: if the file cannot be opened or written.
    """
    # 'a+' creates the file on first use (the previous 'r+' mode raised
    # FileNotFoundError until the file existed) and always writes at the
    # end; the context manager closes the handle on every path.
    with open(file_path, 'a+', encoding='utf-8') as fp:
        fp.seek(0)  # append mode starts at EOF; rewind to read what exists
        if brand_name in fp.read():
            return False
        fp.write(brand_name + '   ' + addr + '    ' + phone + '\n\n')
    return True


def parse_page(url, file_path='H:/乌托家/乌托家家具公司.txt', timeout=10):
    """Scrape one listing page and store the Guangdong companies it links to.

    Each ``<li>`` of the listing's commodity ``<ul>`` links to a merchant
    page. That page is fetched, and for every ``div.brand-r`` on it the brand
    name, address and phone number are extracted. Records whose address
    contains '广东' and whose brand name is not yet present in *file_path*
    are appended to that file.

    Args:
        url: URL of the listing page.
        file_path: Text file that receives the records.
        timeout: Seconds to wait for each HTTP request before giving up.

    Raises:
        requests.RequestException: if the listing page itself cannot be
            fetched. Failures on individual merchant pages are printed and
            skipped instead.
    """
    from urllib.parse import urljoin

    response = requests.get(url=url, headers=HEADERS, timeout=timeout)
    listing_tree = etree.HTML(response.text)
    li_list = listing_tree.xpath('//ul[@class="rec-commodity-ul targetElement"]/li')
    for li in li_list:
        hrefs = li.xpath('.//div[@class="impression"]/a/@href')
        if not hrefs:
            # Listing entry without a merchant link: nothing to follow.
            continue
        # No-op for absolute links; resolves relative ones against the page.
        merchant_href = urljoin(url, hrefs[0])

        try:
            detail_page_text = requests.get(
                url=merchant_href, headers=HEADERS, timeout=timeout
            ).text
            detail_tree = etree.HTML(detail_page_text)
        except (requests.RequestException, etree.LxmlError) as e:
            # One unreachable or unparsable merchant page must not stop
            # the rest of the listing.
            print(e)
            continue
        if detail_tree is None:
            continue

        for div in detail_tree.xpath('//div[@class="brand-r"]'):
            try:
                brand_name = div.xpath('./div[4]/dl/dd/text()')[0]
                addr = div.xpath('.//p/text()')[0]
                phone = div.xpath('.//dd[2]/text()')[0]
            except IndexError as e:
                # The block lacks one of the expected fields; skip it.
                print(e)
                continue

            if '广东' not in addr:
                continue

            # Persist the record.
            try:
                if _save_company(file_path, brand_name, addr, phone):
                    print(brand_name, '爬取成功!!!')
            except OSError as e:
                print(e)
40 
41 
def get_page(start_page=1, end_page=412):
    """Crawl the listing pages *start_page*..*end_page* (both inclusive).

    Builds the listing URL for each page number and passes it to
    ``parse_page``. The defaults reproduce the original fixed range of
    pages 1 through 412. An empty range (``end_page < start_page``) does
    nothing.

    Args:
        start_page: First page number to crawl.
        end_page: Last page number to crawl.
    """
    for page in range(start_page, end_page + 1):
        url = 'http://www.wutuojia.com/item/list.html?page=' + str(page)
        try:
            parse_page(url)
        except Exception as e:
            # Crawl boundary: report the failing page and carry on, so one
            # bad page does not discard the remaining ones.
            print(e)
46 
47 
48 
def main():
    """Script entry point: run the crawl over all listing pages."""
    get_page()
51 
52 
if __name__ == '__main__':
    # Make sure the output directory exists before crawling. exist_ok=True
    # avoids the check-then-create race of exists() followed by mkdir().
    os.makedirs('H:/乌托家', exist_ok=True)
    main()

猜你喜欢

转载自www.cnblogs.com/KisInfinite/p/10952938.html