When crawling with requests, watch out for the exception HTTPConnectionPool(host=xxx, port=xxx): Max retries exceeded with url... It tends to appear when keep-alive connections exhaust the connection pool or SSL verification fails, which is also why the headers in the code below send 'Connection': 'close'. Ways to deal with it:
① Call disable_warnings() before requests.get(), so the InsecureRequestWarning triggered by skipping verification is silenced → requests.packages.urllib3.disable_warnings()
② Set verify=False in the requests.get() call → response = requests.get(url=url, headers=self.__class__.headers, verify=False)
③ Raise the default retry count → requests.adapters.DEFAULT_RETRIES = 5. Note that this is a module-level setting, so it has to be in place before a request is made for that request to benefit; a more configurable alternative is sketched right after this list.
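If you want finer control than the module-level DEFAULT_RETRIES, requests also accepts a urllib3 Retry policy through an HTTPAdapter mounted on a Session. A minimal sketch; the backoff_factor and status_forcelist values here are illustrative assumptions, not from the original post:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Up to 5 retries with exponential backoff, retrying only transient server errors.
retry = Retry(total=5, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504])
adapter = HTTPAdapter(max_retries=retry)

session = requests.Session()
session.mount('http://', adapter)   # apply the retry policy to all http:// URLs
session.mount('https://', adapter)  # ...and to all https:// URLs

response = session.get('http://gd.taoci163.com/qiye/a440600p1/FoShanShi.html',
                       timeout=10, verify=False)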
See the code below for the details:
# Author:K
import requests
from lxml import etree
import os
import csv

class CompanySpider(object):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36',
        'Connection': 'close'  # don't keep connections alive; helps avoid pool exhaustion
    }

    def run(self):
        self.get_urls()

    def get_urls(self):
        for page in range(1, 46):
            url = 'http://gd.taoci163.com/qiye/a440600p%s/FoShanShi.html' % page
            requests.packages.urllib3.disable_warnings()
            requests.adapters.DEFAULT_RETRIES = 5  # must be set before the request, not after
            response = requests.get(url=url, headers=self.__class__.headers, verify=False)
            tree = etree.HTML(response.text)
            detail_urls = tree.xpath('//div[@class="mainLeft"]//li//span/a[1]/@href')
            self.parse_page(detail_urls)
            print('page %s over!!!' % page)

    def parse_page(self, urls):
        for url in urls:
            data = []
            requests.packages.urllib3.disable_warnings()
            requests.adapters.DEFAULT_RETRIES = 5
            response = requests.get(url=url, headers=self.__class__.headers, verify=False)
            tree = etree.HTML(response.text)
            try:
                # company name
                company_name = tree.xpath('//div[@class="conA contact"]/ul/li[1]/span/text()')[0]
                data.append(company_name)
                # contact person
                contacts_name = tree.xpath('//div[@class="conA contact"]/ul/li[2]/span/text()')[0]
                data.append(contacts_name)
                # company address
                company_addr = tree.xpath('//div[@class="conA contact"]/ul/li[3]/span/text()')[0]
                data.append(company_addr)
                # landline phone
                company_phone = tree.xpath('//div[@class="conA contact"]/ul/li[5]/span/text()')[0]
                data.append(company_phone)
                # mobile phone
                mobile_phone = tree.xpath('//div[@class="conA contact"]/ul/li[6]/span/text()')[0]
                data.append(mobile_phone)
                # fax
                company_fax = tree.xpath('//div[@class="conA contact"]/ul/li[7]/span/text()')[0]
                data.append(company_fax)
                # persist the row
                self.save_data(data)
            except Exception as e:
                print(e)

    def save_data(self, data):
        writer.writerow(data)  # writer is the module-level csv.writer created in __main__


if __name__ == '__main__':
    if not os.path.exists('H:/陶瓷公司数据'):
        os.mkdir('H:/陶瓷公司数据')
    fp = open('H:/陶瓷公司数据/佛山陶瓷公司_test.csv', 'wt', encoding='utf-8-sig')
    writer = csv.writer(fp)
    csv_header = ['公司名称', '联系人', '公司地址', '电话', '手机', '公司传真']
    writer.writerow(csv_header)
    spider = CompanySpider()
    try:
        spider.run()
    except Exception as e:
        print(e)
    fp.close()
With the Scrapy framework, things are much more convenient, and since requests are issued asynchronously the crawl is also fast. One thing must be right: allowed_domains has to match the target site's domain, or every request is silently filtered out. I fell into that trap myself, so it matters! The spider:
# -*- coding: utf-8 -*-
import scrapy
from ..items import ChinaItem


class ChinaCompanySpider(scrapy.Spider):
    name = 'china_company'
    allowed_domains = ['taoci163.com']  # !!! get the domain right, or requests are dropped !!!
    start_urls = ['http://gd.taoci163.com/qiye/a440600p1/FoShanShi.html']

    def parse(self, response):
        detail_urls = response.xpath('//div[@class="mainLeft"]//li//span/a[1]/@href').getall()
        for detail_url in detail_urls:
            # urljoin is a no-op for absolute URLs and fixes relative ones
            yield scrapy.Request(response.urljoin(detail_url), callback=self.parse_detail)
        next_url = response.xpath('//div[@class="page"]/a[last()]/@href').get()
        if next_url:
            yield scrapy.Request(response.urljoin(next_url), callback=self.parse)

    def parse_detail(self, response):
        # company name
        company_name = response.xpath('//div[@class="conA contact"]/ul/li[1]/span/text()').get()
        # contact person
        contacts_name = response.xpath('//div[@class="conA contact"]/ul/li[2]/span/text()').get()
        # company address
        company_addr = response.xpath('//div[@class="conA contact"]/ul/li[3]/span/text()').get()
        # landline phone
        company_phone = response.xpath('//div[@class="conA contact"]/ul/li[5]/span/text()').get()
        # mobile phone
        mobile_phone = response.xpath('//div[@class="conA contact"]/ul/li[6]/span/text()').get()
        # fax
        company_fax = response.xpath('//div[@class="conA contact"]/ul/li[7]/span/text()').get()
        # assemble the item
        item = ChinaItem(company_name=company_name, contacts_name=contacts_name,
                         company_addr=company_addr, company_phone=company_phone,
                         mobile_phone=mobile_phone, company_fax=company_fax)
        yield item
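The spider imports ChinaItem from the project's items.py, which the original post doesn't show. A minimal sketch of what it presumably looks like, with the field names taken directly from the spider above:

import scrapy


class ChinaItem(scrapy.Item):
    # one scrapy.Field per column the spider fills in
    company_name = scrapy.Field()
    contacts_name = scrapy.Field()
    company_addr = scrapy.Field()
    company_phone = scrapy.Field()
    mobile_phone = scrapy.Field()
    company_fax = scrapy.Field()

And the pipeline that writes each item out to CSV: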
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import csv


class ChinaPipeline(object):
    def __init__(self):
        # 'a+' appends, so re-running the spider writes the header row again;
        # switch to 'w' if you want a fresh file on every run
        self.fp = open('H:/陶瓷公司数据/佛山陶瓷公司(scrapy).csv', 'a+', encoding='utf-8-sig')
        self.writer = csv.writer(self.fp)
        headers = ['公司名称', '联系人', '公司地址', '电话', '手机', '公司传真']
        self.writer.writerow(headers)

    def process_item(self, item, spider):
        company_name = item['company_name']
        contacts_name = item['contacts_name']
        company_addr = item['company_addr']
        company_phone = item['company_phone']
        mobile_phone = item['mobile_phone']
        company_fax = item['company_fax']

        self.writer.writerow((company_name, contacts_name, company_addr,
                              company_phone, mobile_phone, company_fax))
        return item

    def close_spider(self, spider):
        self.fp.close()
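As the boilerplate comment in the pipeline says, it only runs once registered in settings.py. A minimal sketch, assuming the Scrapy project is named china (the project name isn't shown in the original post; adjust the dotted path to match yours):

ITEM_PIPELINES = {
    # dotted path to the pipeline class; 300 is an arbitrary priority (lower runs first)
    'china.pipelines.ChinaPipeline': 300,
}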