使用 Python 和 Selenium 爬取房天下网站的新房房源详情信息

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from time import sleep
import json
from datetime import datetime
import re


class Dpspider(object):
    """Selenium crawler for the new-house listings on nanjing.newhouse.fang.com.

    ``run`` walks the paginated listing (pages 1 through ``LAST_PAGE``); for
    every house linked from a listing page it opens the house page, its map
    iframe and its detail page, builds one flat ``dict`` per house and appends
    the dicts of each listing page to a JSON-lines file.

    Attributes:
        driver: the Chrome WebDriver used for all navigation.
        num: 1-based number of the listing page currently being processed.
        base_urls: URL of the listing page currently being processed.
    """

    # Listing-page URL; ``{}`` receives the 1-based page number.
    URL_TEMPLATE = "https://nanjing.newhouse.fang.com/house/s/b9{}/"
    # ``run`` stops once this page has been processed.
    LAST_PAGE = 38
    # Locator strategy passed to find_element/find_elements. The generic
    # two-argument form is used instead of the find_element(s)_by_xpath
    # shortcuts, which newer Selenium releases no longer provide.
    _XPATH = "xpath"
    # Year, month and an optional day. Requiring at least one non-digit
    # between the numbers keeps a two-digit month such as "12" from being
    # split into month "1" and day "2".
    _SALE_DATE = re.compile(r"(\d{4})\D+(\d{1,2})(?:\D+(\d{1,2}))?")

    def __init__(self):
        """Start Chrome and point the crawler at listing page 1."""
        # Opens a visible browser window; a configured Options object can be
        # passed as webdriver.Chrome(options=...) instead.
        self.driver = webdriver.Chrome()
        self.num = 1
        self.base_urls = self.URL_TEMPLATE.format(self.num)

    def xinfang_list(self):
        """Scrape every house linked from the listing page currently loaded.

        Returns:
            list[dict]: one record per house link found on the page, in page
            order. Empty when the page contains no house links.
        """
        links = self.driver.find_elements(
            self._XPATH, '//*[@class="nl_con clearfix"]/ul/li/div/div[1]/a')
        # Read all hrefs up front: the driver navigates away from this page
        # while scraping, so the link elements cannot be used afterwards.
        house_urls = [link.get_attribute('href') for link in links]
        return [self._scrape_house(url) for url in house_urls]

    def _scrape_house(self, url):
        """Visit one house (main page, map iframe, detail page) and return its record."""
        driver = self.driver
        driver.get(url)
        data = {}
        district = driver.find_element(
            self._XPATH,
            '//div[@class="br_left"]//ul[@class="tf f12"]//li[3]/a').text  # first-level district
        data['subarea'] = district[:-2]  # drop the last two characters of the breadcrumb text
        data['area'] = driver.find_element(
            self._XPATH, '//div[@class="s2"]/div/a').text  # current city
        data['housecoord'] = self._fetch_coordinates()
        # _fetch_coordinates leaves the browser on the map page; go back.
        driver.get(url)
        self._open_detail_page()

        data['housename'] = driver.find_element(
            self._XPATH, '//*[@id="daohang"]//h1/a').text  # house name
        alias = driver.find_elements(self._XPATH, '//*[@id="daohang"]//div/span')
        # The alias text carries a three-character prefix that is dropped;
        # houses without an alias element get None.
        data['housename2'] = alias[0].text[3:] if alias else None
        tags = driver.find_element(
            self._XPATH, '//div[@class="lpicon tf"]').text  # house tags
        data['houseproperty'] = tags.replace(" ", ",")

        for panel in driver.find_elements(self._XPATH, '//div[@class="main-left"]'):
            data['_id'] = url  # the house URL doubles as the record id
            data['source'] = "房天下"
            data['allstatus'] = "1"  # collection status
            price_text = panel.find_element(self._XPATH, './div[1]//em').text
            # Keep the first digit and everything after it; a price without
            # any digit is recorded as "待定".
            price_match = re.search(r"\d+.*", price_text, re.S)
            data['houseprice'] = price_match.group() if price_match else "待定"
            data.update(self._map_fields(self._read_info_table(panel)))
            data['fetch_time'] = str(datetime.now())
            self.re_sub_time(data)
            self._clean_values(data)
        return data

    def _fetch_coordinates(self):
        """Open the map iframe of the current page and return "mapy,mapx", or None.

        Leaves the browser on the map page; the caller must navigate back.
        """
        map_url = self.driver.find_element(
            self._XPATH, '//div[@class="mapbox_dt"]/iframe').get_attribute("src")
        self.driver.get(map_url)
        found = re.search(r'"mapx":"(.*?)","mapy":"(.*?)"',
                          self.driver.page_source, re.DOTALL)
        if found is None:
            # The map page did not embed coordinates; record that explicitly
            # instead of aborting the whole listing page.
            return None
        return found.group(2) + "," + found.group(1)

    def _open_detail_page(self):
        """Follow the "楼盘详情"/"详细信息" navigation link when the page has one."""
        links = self.driver.find_elements(
            self._XPATH,
            "//*[@class='navleft tf']//a[contains(text(),'楼盘详情')]| //*[@class='navleft tf']//a[contains(text(),'详细信息')]")
        if not links:
            # Best effort: without the link, scraping continues on the current page.
            return
        href = links[0].get_attribute('href')
        if href:
            self.driver.get(href)

    def _read_info_table(self, panel):
        """Return the {label: value} rows found in the info sections of ``panel``."""
        rows = panel.find_elements(
            self._XPATH, './/div//h3[contains(text(),"基本信息")]/..//ul/li')
        rows += panel.find_elements(
            self._XPATH, './/div//h3[contains(text(),"销售信息")]/..//ul/li')
        # NOTE(review): the last two alternatives start with "//" and so are
        # evaluated against the whole document rather than ``panel``; kept
        # unchanged because the page layout cannot be verified from here.
        rows += panel.find_elements(
            self._XPATH,
            './/div/h3[contains(text(),"楼盘情况")]/../ul/li|//div/h3[contains(text(),"小区规划")]/../ul/li|'
            '//div/h3[contains(text(),"配套信息")]/..//ul/li')
        info = {}
        for row in rows:
            label = row.find_element(self._XPATH, "./div[1]").text
            value = row.find_element(self._XPATH, "./div[2]").text
            # Labels lose their spaces; values use commas as the only separator.
            info[label.replace(" ", "")] = value.replace(" ", ",").replace("\n", ",")
        return info

    @staticmethod
    def _map_fields(info):
        """Translate the page's Chinese row labels into the record's field names."""
        fields = {}

        def copy_first(target, *labels):
            # Store the value of the first label that is present.
            for label in labels:
                if label in info:
                    fields[target] = info[label]
                    return True
            return False

        def copy_number(target, label, pattern):
            # Store only the characters of the value matched by ``pattern``.
            if label not in info:
                return False
            fields[target] = "".join(re.findall(pattern, info[label]))
            return True

        # Basic information
        copy_first('houseatr', "物业类别：")
        copy_first('housetype', "建筑类别：", "写字楼级别：")
        copy_first('years', "产权年限：")
        copy_first('decoration', "装修状况：")
        copy_first('developer', "开发商：")
        copy_first('houseaddress', "楼盘地址：")
        # Sales information
        copy_first('salestatus', "销售状态：")
        copy_first('startSaleString', "开盘时间：")
        copy_first('endSaleString', "交房时间：")
        copy_first('saleaddress', "售楼地址：")
        # Community planning. Both areas keep the decimal point so that
        # "45000.5" is not collapsed into "450005".
        copy_number('landarea', "占地面积：", r"[\d\.]+")
        copy_number('housearea', "建筑面积：", r"[\d\.]+")
        copy_first('plotratio', "容积率：")
        if "绿化率：" in info:
            green = re.sub(r'\%', '', info["绿化率："])  # drop the percent sign
            fields['greenrate'] = None if green == "暂无资料" else green
        copy_first('carsite', "停车位：", "停车位配置：")
        if not copy_number('housecount', "楼栋总数：", r"\d+"):
            copy_first('housecount', "楼栋情况：")
        copy_number('allcount', "总户数：", r"\d+")
        copy_first('managecompany', "物业公司：")
        copy_first('managefee', "物业费：")
        copy_first('floorCondition', "楼层状况：")
        return fields

    @staticmethod
    def _clean_values(data):
        """Strip "[...]" notes and one trailing comma from every string value, in place."""
        for key, value in list(data.items()):
            if not isinstance(value, str):
                continue
            cleaned = value
            if '[' in cleaned:
                # Remove the bracketed note plus one preceding non-word character.
                cleaned = re.sub(r'[^\w]?\[.*?\]', '', cleaned)
            # Strip the comma after removing brackets so both clean-ups apply
            # to the same value.
            if cleaned.endswith(","):
                cleaned = cleaned[:-1]
            data[key] = cleaned

    @classmethod
    def _format_sale_time(cls, text):
        """Return "YYYY-MM-DD 00:00:00" for the first date in ``text``, or None.

        A date without a day is placed on the first day of its month.
        """
        if not text:
            return None
        found = cls._SALE_DATE.search(text)
        if found is None:
            return None
        year, month, day = found.groups()
        return "{}-{}-{} 00:00:00".format(year, month.zfill(2), (day or "1").zfill(2))

    def re_sub_time(self, data):
        """Derive normalised sale timestamps from the raw date strings in ``data``.

        Reads ``data["startSaleString"]`` and ``data["endSaleString"]`` and,
        when a date can be parsed, stores it as ``data["startsaletime"]`` /
        ``data["endsaletime"]``. Missing, empty or unparseable source values
        leave ``data`` untouched for that field.

        Args:
            data: the house record; modified in place.
        """
        for source_key, target_key in (("startSaleString", "startsaletime"),
                                       ("endSaleString", "endsaletime")):
            formatted = self._format_sale_time(data.get(source_key))
            if formatted is not None:
                data[target_key] = formatted

    def save_data(self, data_list):
        """Append every record of ``data_list`` to the local JSON-lines file.

        Args:
            data_list: iterable of JSON-serialisable dicts, written one per line.
        """
        with open('详情(南京).jsonlines', 'a', encoding='utf8') as f:
            for data in data_list:
                json.dump(data, f, ensure_ascii=False)
                f.write('\n')

    def sound_data(self):
        """Placeholder; currently does nothing."""
        pass

    def __del__(self):
        """Quit the browser when the crawler is garbage-collected."""
        # ``driver`` is missing when __init__ failed before assigning it.
        driver = getattr(self, "driver", None)
        if driver is not None:
            driver.quit()

    def run(self):
        """Crawl listing pages from ``self.num`` onward, saving each page's records.

        At least one page is always processed; the loop ends once the page
        counter has moved past ``LAST_PAGE``.
        """
        while True:
            # Load the listing page.
            self.driver.get(self.base_urls)
            # Scrape every house on it and persist the records.
            self.save_data(self.xinfang_list())
            self.num += 1
            self.base_urls = self.URL_TEMPLATE.format(self.num)
            if self.num > self.LAST_PAGE:
                break
if __name__ == "__main__":
    # Script entry point: build the crawler (which starts the browser) and
    # run the crawl.
    crawler = Dpspider()
    crawler.run()
使用 Python 和 Selenium 爬取房天下网站的新房房源详情信息

猜你喜欢