import json
import urllib.parse
from time import sleep

from bs4 import BeautifulSoup
from selenium import webdriver
class weiSpider(object):
    """Crawl paginated product search results with Chrome and save them as JSON.

    Each result page is rendered in a Selenium-driven Chrome instance, parsed
    with BeautifulSoup, and the collected product records are written to
    ``vip.json`` in the current working directory.
    """

    # (output key, CSS selector) pairs for the text fields of one product
    # card, in the order the keys appear in each emitted record.
    _TEXT_FIELDS = (
        ('title', '.goods-title-info'),
        ('Nowprice', '.goods-sells-price'),
        ('ZheKou', '.good-title-pms'),
        ('Oldprice', '.goods-market-price'),
        ('Discount', '.goods-price-info'),
    )

    def __init__(self, url, shang, start_page, end_page):
        """Store the crawl configuration.

        Args:
            url: Base search URL. The encoded query string is appended to it
                verbatim, so it should already end with ``?``.
            shang: Search keyword, sent as the ``keyword`` query parameter.
            start_page: First result page to crawl (inclusive).
            end_page: Last result page to crawl (inclusive).
        """
        self.url = url
        self.shang = shang
        self.start = start_page
        self.end = end_page
        # Not read by any method of this class; retained because it is a
        # public attribute of the instance.
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}

    def handle_url(self, page):
        """Return the search URL for result page ``page``.

        Args:
            page: Page number; it is converted to ``str`` before encoding.

        Returns:
            ``self.url`` followed by the URL-encoded ``keyword`` and ``page``
            parameters.
        """
        query = urllib.parse.urlencode({
            "keyword": self.shang,
            "page": str(page),
        })
        return self.url + query

    def handle_page(self, url):
        """Render ``url`` in Chrome, scroll down step by step, return the HTML.

        The page is scrolled ten times with a two-second pause after each
        step (presumably so lazily loaded products get rendered) before the
        page source is read.

        Args:
            url: Address of the page to load.

        Returns:
            The page source reported by the driver after scrolling.
        """
        driver = webdriver.Chrome()
        try:
            driver.get(url)
            for _ in range(10):
                # Scroll by a numeric offset. Adding the string '1000' to
                # scrollTop concatenates text instead of adding 1000 pixels.
                driver.execute_script("window.scrollBy(0, 1000)")
                sleep(2)
            return driver.page_source
        finally:
            # Always shut the browser down, even if loading or scrolling
            # fails; otherwise every crawled page leaves a Chrome process.
            driver.quit()

    @staticmethod
    def _first_text(node, selector):
        """Return the text of the first match of ``selector`` in ``node``, or ''."""
        matches = node.select(selector)
        return matches[0].get_text() if matches else ''

    def download(self, res):
        """Parse one result page into a list of product records.

        Args:
            res: HTML source of a search result page.

        Returns:
            A list with one dict per ``.goods-inner`` element, holding the
            keys ``title``, ``Nowprice``, ``ZheKou``, ``Oldprice``,
            ``Discount`` and ``img``. A field whose element (or ``src``
            attribute) is missing is stored as an empty string, so one
            incomplete product card does not abort the whole crawl.
        """
        soup = BeautifulSoup(res, 'lxml')
        items = []
        for goods in soup.select('.goods-inner'):
            item = {
                key: self._first_text(goods, selector)
                for key, selector in self._TEXT_FIELDS
            }
            images = goods.select('.goods-image-link img')
            item['img'] = images[0].get('src', '') if images else ''
            items.append(item)
        return items

    def weipinspider(self):
        """Crawl every configured page and write all records to ``vip.json``.

        Pages ``self.start`` through ``self.end`` (inclusive) are fetched in
        order; the combined records are serialized as one JSON array, with
        non-ASCII characters written as-is in UTF-8.
        """
        infos = []
        for page in range(self.start, self.end + 1):
            html = self.handle_page(self.handle_url(page))
            infos.extend(self.download(html))
        jsonfile = json.dumps(infos, ensure_ascii=False)
        with open('vip.json', 'w', encoding='utf-8') as fp:
            fp.write(jsonfile)
def main():
    """Prompt for the crawl parameters on stdin and run the spider.

    Asks for the product keyword, then the first and last page numbers
    (both parsed with ``int``), and starts a crawl of the vip.com search
    endpoint with those values.
    """
    base_url = "https://category.vip.com/suggest.php?"
    keyword = input("请输入你要爬取的商品名")
    first_page = int(input("爬取的起始页"))
    last_page = int(input("爬取的结束页"))
    weiSpider(
        url=base_url,
        shang=keyword,
        start_page=first_page,
        end_page=last_page,
    ).weipinspider()
# Run the crawler only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
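# Chrome here refers to the Google Chrome browser driver (a driver that opens a visible browser window); when choosing it, make sure the driver matches the Chrome browser installed on this machine.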
# PhantomJS is a headless browser (it runs faster), but the product is no longer being updated.