Python note: web scraper

# _*_ coding: utf-8 _*_
import requests  # HTTP request library
import time
import json  # JSON library
import os  # file/directory operations
from lxml import etree  # DOM parsing module


"""
反爬虫的限制
第一次200
经过多次requests库访问
变为404


抓取分页数据思路:
在不知道最后一页的页码是多少的前提下:
1.确定总数
2.确定每页多少页
3.确定总页码=math.ceil(总数/每页显示条数)
120
10
?12
125
10
?13
4.总结出每页链接规律
https://bj.fang.anjuke.com/loupan/all/p{页码}/
https://bj.fang.anjuke.com/loupan/all/p1/
https://bj.fang.anjuke.com/loupan/all/p2/
...
https://bj.fang.anjuke.com/loupan/all/pn/
"""
# Save one HTML page to disk
def save_html(html, name, path="data/html/hourse"):
    save_path = os.path.join(
        os.path.dirname(__file__),
        path
    )
    # makedirs creates intermediate directories; os.mkdir fails on a nested path
    os.makedirs(save_path, exist_ok=True)
    with open(os.path.join(save_path, name), "w", encoding="utf-8") as f:
        f.write(html)


# Fetch each URL and save the response body
def catch_html(urls, name, path="data/html/hourse"):
    for n, url in enumerate(urls, start=1):
        resp = requests.request("GET", url)
        print(resp.status_code)
        save_html(resp.text, "{}_{}.html".format(name, n), path)
        time.sleep(0.5)  # throttle requests to reduce the chance of a ban
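# The note above observes that repeated requests eventually start returning 404.
# A common mitigation (a sketch, not part of the original script) is to reuse one
# requests.Session and send a browser-like User-Agent; the header value below is
# only an example, and real sites may require more (cookies, referer, proxies).
def catch_html_with_headers(urls, name, path="data/html/hourse"):
    # Hypothetical variant of catch_html using a shared session with headers.
    session = requests.Session()
    session.headers.update({
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/120.0 Safari/537.36"
    })
    for n, url in enumerate(urls, start=1):
        resp = session.get(url, timeout=10)
        print(resp.status_code)
        if resp.status_code == 200:
            save_html(resp.text, "{}_{}.html".format(name, n), path)
        time.sleep(0.5)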
# Extract listing data from the saved HTML files
def html_to_data(path):
    data = []
    all_files = os.listdir(path)
    all_files_path = map(
        lambda v: os.path.join(path, v),
        all_files
    )
    for file in all_files_path:
        with open(file, "r", encoding="utf-8") as f:
            html = f.read()
            selector = etree.HTML(html)
            if selector is not None:
                child_dom = selector.xpath('//div[@id="listings-container"]/div')
                for dom in child_dom[1:]:
                    #title=dom.xpath('div[1]/h4/a/text()')
                    title = dom.xpath('a/@title')
                    image = dom.xpath('a/div[2]/img/@data-original')
                    price = dom.xpath('div[2]/a/span[2]/text()')
                    if not (title and image and price):
                        continue
                    data.append(
                        dict(
                            title="".join(title),
                            image="".join(image),
                            price="".join(price)
                        )
                    )
    return data
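# The json module is imported at the top but never used; presumably the extracted
# records were meant to be persisted. A minimal sketch (the filename and output
# directory are assumptions, not from the original post):
def save_data(data, name="hourse.json", path="data/json"):
    # Hypothetical helper: writes the extracted records as a JSON array.
    save_path = os.path.join(os.path.dirname(__file__), path)
    os.makedirs(save_path, exist_ok=True)
    with open(os.path.join(save_path, name), "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)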


if __name__ == "__main__":
    urls = (
        "https://www.juwai.com/usproperty/r-154 c-3824 i-{}/".format(v)
        for v in range(1, 501)
    )
    # catch_html(urls, "hourse")  # fetch phase: run once to populate data/html/hourse
    path = os.path.join(
        os.path.dirname(__file__),
        "data/html/hourse"
    )
    data = html_to_data(path)
    print(len(data))


Reposted from blog.csdn.net/qq_43011640/article/details/89195178