import json
import re
from datetime import datetime
from time import sleep

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
class Dpspider(object):
    """Selenium scraper for new-home listings on nanjing.newhouse.fang.com.

    ``run`` walks the paginated listing pages, ``xinfang_list`` scrapes every
    listing linked from the page currently loaded in the driver, and
    ``save_data`` appends the resulting records to a JSON-lines file.

    Attributes:
        driver: the Chrome WebDriver used for all navigation.
        num: number of the listing page that ``base_urls`` points at.
        base_urls: URL of the listing page that ``run`` loads next.
    """

    # Listing-page URL; ``{}`` receives the page number.
    LIST_URL_TEMPLATE = "https://nanjing.newhouse.fang.com/house/s/b9{}/"

    # Coordinates embedded in the source of the map iframe page.
    _COORD_RE = re.compile(r'"mapx":"(.*?)","mapy":"(.*?)"', re.DOTALL)
    # Year / month / day separated by non-digits.  ``\D+`` (instead of a lazy
    # ``.*?``) stops the engine from splitting a two-digit month such as "12"
    # into month "1" and day "2".
    _DATE_RE = re.compile(r'(\d{4})\D+(\d{1,2})\D+(\d{1,2})')
    _YEAR_MONTH_RE = re.compile(r'(\d{4})\D+(\d{1,2})')

    def __init__(self):
        """Start a Chrome session and point at the first listing page."""
        # To run headless, build a selenium ``Options`` object and pass it as
        # ``webdriver.Chrome(options=...)``.
        self.driver = webdriver.Chrome()
        self.num = 1
        self.base_urls = self.LIST_URL_TEMPLATE.format(self.num)

    def xinfang_list(self):
        """Scrape every listing linked from the currently loaded listing page.

        Returns:
            list[dict]: one record per listing, in page order.
        """
        anchors = self.driver.find_elements(
            By.XPATH, '//*[@class="nl_con clearfix"]/ul/li/div/div[1]/a')
        # Collect the hrefs before navigating, because scraping each listing
        # loads other pages in this same driver.
        house_urls = [anchor.get_attribute('href') for anchor in anchors]
        return [self._scrape_house(url) for url in house_urls]

    def _scrape_house(self, url):
        """Load one listing and return its record as a dict."""
        driver = self.driver
        driver.get(url)
        data = {}

        district = driver.find_element(
            By.XPATH,
            '//div[@class="br_left"]//ul[@class="tf f12"]//li[3]/a').text
        data['subarea'] = district[:-2]  # drop the trailing two characters
        data['area'] = driver.find_element(
            By.XPATH, '//div[@class="s2"]/div/a').text  # current city
        data['housecoord'] = self._fetch_coordinates(url)
        self._open_detail_page()

        data['housename'] = driver.find_element(
            By.XPATH, '//*[@id="daohang"]//h1/a').text
        try:
            alias = driver.find_element(
                By.XPATH, '//*[@id="daohang"]//div/span').text
            data['housename2'] = alias[3:]  # drop the three-character prefix
        except Exception:
            # The alias is optional; record its absence explicitly.
            data['housename2'] = None
        tags = driver.find_element(
            By.XPATH, '//div[@class="lpicon tf"]').text
        data['houseproperty'] = tags.replace(" ", ",")

        for container in driver.find_elements(
                By.XPATH, '//div[@class="main-left"]'):
            self._scrape_container(container, url, data)

        data['fetch_time'] = str(datetime.now())
        self.re_sub_time(data)
        # Only existing keys are reassigned, so iterating a snapshot is safe.
        for key, value in list(data.items()):
            data[key] = self._clean_value(value)
        return data

    def _fetch_coordinates(self, url):
        """Return "mapy,mapx" read from the map iframe page, or None.

        Navigates to the iframe's ``src`` and then back to ``url``.
        """
        driver = self.driver
        map_src = driver.find_element(
            By.XPATH, '//div[@class="mapbox_dt"]/iframe').get_attribute("src")
        driver.get(map_src)
        match = self._COORD_RE.search(driver.page_source)
        driver.get(url)
        if match is None:
            # No coordinates in the page source: keep scraping the listing
            # instead of failing on ``None.group``.
            return None
        return match.group(2) + "," + match.group(1)

    def _open_detail_page(self):
        """Follow the listing's detail link when one can be found."""
        try:
            link = self.driver.find_element(
                By.XPATH,
                "//*[@class='navleft tf']//a[contains(text(),'楼盘详情')]| "
                "//*[@class='navleft tf']//a[contains(text(),'详细信息')]")
            self.driver.get(link.get_attribute('href'))
        except Exception:
            # Best effort: if the link cannot be found or opened, scraping
            # continues on whatever page is currently loaded.
            pass

    def _scrape_container(self, container, url, data):
        """Fill ``data`` from one ``main-left`` container element."""
        data['_id'] = url  # listing URL doubles as the record id
        data['source'] = "房天下"
        data['allstatus'] = "1"  # collection status

        price_text = container.find_element(By.XPATH, './div[1]//em').text
        price_match = re.search(r"\d+.*", price_text, re.S)
        # Keep the first digit and everything after it; "待定" when the price
        # text contains no digit.
        data['houseprice'] = price_match.group() if price_match else "待定"

        rows = container.find_elements(
            By.XPATH, './/div//h3[contains(text(),"基本信息")]/..//ul/li')
        rows += container.find_elements(
            By.XPATH, './/div//h3[contains(text(),"销售信息")]/..//ul/li')
        # NOTE(review): the 2nd and 3rd alternatives start with "//", so they
        # search the whole document rather than ``container``; kept unchanged.
        rows += container.find_elements(
            By.XPATH,
            './/div/h3[contains(text(),"楼盘情况")]/../ul/li|//div/h3[contains(text(),"小区规划")]/../ul/li|'
            '//div/h3[contains(text(),"配套信息")]/..//ul/li')

        data_dict = {}
        for row in rows:
            label = row.find_element(By.XPATH, "./div[1]").text
            value = row.find_element(By.XPATH, "./div[2]").text
            label = label.replace(" ", "")
            value = value.replace(" ", ",").replace("\n", ",")
            data_dict[label] = value
        self._map_fields(data, data_dict)

    @staticmethod
    def _map_fields(data, data_dict):
        """Copy the labelled page values in ``data_dict`` onto record keys.

        A record key is only written when one of its labels is present.
        """
        def assign(key, *labels, transform=None):
            # The first label present wins, like the if/elif chains it replaces.
            for label in labels:
                if label in data_dict:
                    value = data_dict[label]
                    data[key] = transform(value) if transform else value
                    return

        def digits(text):
            return "".join(re.findall(r"\d+", text, re.S))

        def decimal_digits(text):
            return "".join(re.findall(r"[\d\.]+", text, re.S))

        def green_rate(text):
            text = re.sub(r'\%', '', text)  # drop the percent sign
            return None if text == "暂无资料" else text

        # Basic information
        assign('houseatr', "物业类别:")
        assign('housetype', "建筑类别:", "写字楼级别:")
        assign('years', "产权年限:")
        assign('decoration', "装修状况:")
        assign('developer', "开发商:")
        assign('houseaddress', "楼盘地址:")
        # Sales information
        assign('salestatus', "销售状态:")
        assign('startSaleString', "开盘时间:")
        assign('endSaleString', "交房时间:")
        assign('saleaddress', "售楼地址:")
        # Community planning
        assign('landarea', "占地面积:", transform=digits)
        assign('housearea', "建筑面积:", transform=decimal_digits)
        assign('plotratio', "容积率:")
        assign('greenrate', "绿化率:", transform=green_rate)
        assign('carsite', "停车位:", "停车位配置:")
        # The two building-count labels are parsed differently, so they
        # cannot share a single ``assign`` call.
        if "楼栋总数:" in data_dict:
            data['housecount'] = digits(data_dict["楼栋总数:"])
        elif "楼栋情况:" in data_dict:
            data['housecount'] = data_dict["楼栋情况:"]
        assign('allcount', "总户数:", transform=digits)
        assign('managecompany', "物业公司:")
        assign('managefee', "物业费:")
        assign('floorCondition', "楼层状况:")

    @staticmethod
    def _clean_value(value):
        """Strip bracketed notes and one trailing comma from a string value.

        Non-string and empty values are returned unchanged.
        """
        if not isinstance(value, str) or not value:
            return value
        if '[' in value:
            # Remove "[...]" together with one preceding non-word character.
            value = re.sub(r'[^\w]?\[.*?\]', '', value)
        if value.endswith(","):
            value = value[:-1]
        return value

    def re_sub_time(self, data):
        """Add normalized ``startsaletime`` / ``endsaletime`` to ``data``.

        Reads ``data["startSaleString"]`` and ``data["endSaleString"]`` when
        present and writes "YYYY-MM-DD 00:00:00" under the matching output
        key.  A date without a day is normalized to the first of the month.
        Missing, empty, or unparseable inputs leave ``data`` untouched.

        Args:
            data: the listing record; modified in place.
        """
        for source_key, target_key in (("startSaleString", "startsaletime"),
                                       ("endSaleString", "endsaletime")):
            normalized = self._normalize_date(data.get(source_key))
            if normalized is not None:
                data[target_key] = normalized

    @classmethod
    def _normalize_date(cls, text):
        """Return "YYYY-MM-DD 00:00:00" parsed from ``text``, or None."""
        if not text:
            return None
        match = cls._DATE_RE.search(text)
        if match:
            year, month, day = match.groups()
        else:
            match = cls._YEAR_MONTH_RE.search(text)
            if match is None:
                return None
            year, month = match.groups()
            day = "1"
        return "{}-{}-{} 00:00:00".format(
            year, month.rjust(2, '0'), day.rjust(2, '0'))

    def save_data(self, data_list):
        """Append each record in ``data_list`` as one JSON line to the local file."""
        with open('详情(南京).jsonlines', 'a', encoding='utf8') as f:
            for data in data_list:
                json.dump(data, f, ensure_ascii=False)
                f.write('\n')

    def sound_data(self):
        """Placeholder; currently does nothing."""
        pass

    def __del__(self):
        """Quit the browser when the spider is garbage-collected."""
        # ``driver`` is missing if ``__init__`` failed before assigning it.
        driver = getattr(self, "driver", None)
        if driver is not None:
            driver.quit()

    def run(self, last_page=38):
        """Scrape and save listing pages from ``self.num`` through ``last_page``.

        The current page is always processed once before the bound is checked.

        Args:
            last_page: number of the last listing page to scrape.
        """
        while True:
            self.driver.get(self.base_urls)
            data_list = self.xinfang_list()
            self.save_data(data_list)
            self.num += 1
            self.base_urls = self.LIST_URL_TEMPLATE.format(self.num)
            if self.num > last_page:
                break
# Script entry point: build the spider and crawl all listing pages.
if __name__ == '__main__':
    spider = Dpspider()
    spider.run()
# Python:使用 Selenium 爬取房天下网站的新房房源详情信息
# 猜你喜欢
# 转载自 blog.csdn.net/weixin_43407092/article/details/88197734
# 今日推荐
# 周排行