# _*_ coding: utf-8 _*_
"""Scraper: download listing pages and extract title/image/price records.

Pagination strategy when the last page number is unknown:
  1. find the total item count
  2. find the items-per-page count
  3. total pages = math.ceil(total / per_page), e.g. 120/10 -> 12, 125/10 -> 13
  4. derive the per-page URL pattern, e.g.
     https://bj.fang.anjuke.com/loupan/all/p{page}/

NOTE: the target site throttles scrapers -- a URL that first returns 200
can start returning 404 after repeated requests, hence the sleep below.
"""
import os            # filesystem paths / directory handling
import time          # polite delay between requests

import requests      # HTTP client
from lxml import etree  # DOM / XPath parsing


def save_html(html, name, path="data/html/hourse"):
    """Write the string *html* to <script dir>/<path>/<name> as UTF-8.

    Creates the target directory (including parents) on first use.
    """
    save_path = os.path.join(os.path.dirname(__file__), path)
    if not os.path.exists(save_path):
        # makedirs, not mkdir: the default path is nested ("data/html/hourse")
        # and mkdir fails when intermediate directories are missing.
        os.makedirs(save_path)
    with open(os.path.join(save_path, name), "w", encoding="utf-8") as f:
        f.write(html)


def catch_html(urls, name, path="data/html/hourse"):
    """Fetch every URL in *urls* and save each body as "<name>_<n>.html".

    Pages are numbered from 1 in iteration order. A short sleep between
    requests keeps the scraper from being blocked (see module docstring).
    """
    for n, url in enumerate(urls, start=1):
        resp = requests.request("GET", url)
        print(resp.status_code)
        save_html(resp.text, "{}_{}.html".format(name, n), path)
        time.sleep(0.5)  # throttle: aggressive scraping turns 200s into 404s


def html_to_data(path):
    """Parse every saved HTML file under *path*.

    Returns a list of {"title", "image", "price"} dicts extracted from the
    "listings-container" element of each page. Files that do not parse, and
    listing nodes missing any of the three fields, are skipped.
    """
    data = []  # fix: the original referenced `data` without defining it (NameError)
    for file_name in os.listdir(path):
        with open(os.path.join(path, file_name), "r", encoding="utf-8") as f:
            page_source = f.read()  # renamed: original local shadowed lxml's `html`
        selector = etree.HTML(page_source)
        if selector is None:
            continue
        listing_nodes = selector.xpath('//div[@id="listings-container"]/div')
        # Skip the first child -- presumably a header/ad row; TODO confirm
        # against a real page.
        for node in listing_nodes[1:]:
            title = node.xpath('a/@title')
            image = node.xpath('a/div[2]/img/@data-original')
            price = node.xpath('div[2]/a/span[2]/text()')
            if not (title and image and price):
                continue
            data.append(dict(
                title="".join(map(str, title)),  # fix: was misspelled "tilte"
                image="".join(map(str, image)),
                price="".join(map(str, price)),  # fix: was validated but never stored
            ))
    return data


if __name__ == "__main__":
    # NOTE(review): the original URLs had no scheme (requests raises
    # MissingSchema) and contain literal spaces -- verify the real URL
    # pattern against the site before enabling the fetch below.
    urls = (
        "https://www.juwai.com/usproperty/r-154 c-3824 i-{}/".format(v)
        for v in range(1, 501)
    )
    # catch_html(urls, "juwai")  # fix: the original call omitted the required *name* arg
    path = os.path.join(os.path.dirname(__file__), "data/html/hourse")
    print(html_to_data(path))
# Python scraping notes
# Source: reposted from blog.csdn.net/qq_43011640/article/details/89195178