scrapy 实现代码,代码有点多,没有优化,,下面有链接,不懂得留言
- Github全部代码,https://github.com/Agile929/scrapy_fang
# -*- coding: utf-8 -*-
import scrapy, json
import math
from lxml import etree
from ..items import DataFangItem, DataDynamicJson, DataCommentJson, DataPicJson, \
DataHouseapartment, ImageItem, DataPresale
import re, io
from bs4 import BeautifulSoup
from datetime import datetime
from time import sleep
class TianxiaSpider(scrapy.Spider):
name = "tianxia1"
allowed_domains = ["fang.com"]
city_link_list = io.open(r"/home/kevin/work/data_fang/data_fang/spiders/城市url", "r", encoding="gbk")
# city_link_list = city_link_list.readline()
for link in city_link_list:
start_urls = link.replace("\n", "")
start_urls = [start_urls]
def __init__(self):
super(TianxiaSpider, self).__init__()
self.dynamic_urls = []
self.dynamicJson = []
self.house_list = []
def parse(self, response):
# 解析每个城市
new_house = response.xpath(
"//div[@class='newnav20141104nr']//div/a[contains(text(),'新房')]/@href").extract()[0] # 获取新房url
new_house = re.sub(r"\?\w+\=\w+", "", new_house) # 去掉?后面的字符
yield scrapy.Request(new_house, callback=self.parse_all_house)
def parse_all_house(self, response):
# 解析所有房源
url = response.url
all_house = response.xpath("//*[@class='clearfix']/div/a/@href ").extract() # 获取当前页面所有的房源url
for one_house in all_house:
house = u"https:" + one_house
house = re.sub(r"\?\w+=\w+_\w+", "", house) # 去掉?后面的字符
self.house_list.append(house)
# The_next_page = response.xpath(
# '//li[@class="floatr rankWrap"]/div/a[contains(text(),">")]/@href').extract_first() # 获取下一页
# if The_next_page != None:
# The_next_page = url + The_next_page
# yield scrapy.Request(The_next_page, callback=self.parse_all_house)
the_next_page = response.xpath(
'//li[@class="floatr rankWrap"]/div/a[contains(text(),">")]/@href').extract_first() # 获取下一页
if the_next_page is None:
url = self.house_list.pop()
yield scrapy.Request(url, callback=self.home_page)
else:
the_next_page_url = url + the_next_page
yield scrapy.Request(the_next_page_url, callback=self.parse_all_house)
def home_page(self, response):
# 解析首页获取详情页
item = DataFangItem()
item['_id'] = response.url
item['subarea'] = response.xpath('//div[@class="br_left"]//ul[@class="tf f12"]//li[3]/a/text()').extract()
item['subarea'] = "".join(item['subarea']) # 字符串切片,去掉后面2个字
item['subarea'] = item['subarea'][:-2]
item['area'] = response.xpath('//div[@class="s2"]/div/a/text()').extract() # 当前城市
item['area'] = "".join(item['area']).replace(",", "")
positioning = response.xpath("//div[@class='mapbox_dt']/iframe/@src").extract_first() # 获取楼盘定位地址
positioning = u"https:" + positioning
particulars = response.xpath("//*[@class='navleft tf']//a[contains(text(),'详情')]/@href|"
"//*[@class='navleft tf']//a[contains(text(),'详细')]/@href").extract() # 楼盘详情
particulars = "".join(particulars)
particulars = u"https:" + particulars
# yield scrapy.Request(positioning, meta={"item": item, "xiangqing": particulars}, callback=self.positioning) # 爬取详情打开注释
try:
dynamic = response.xpath("//*[@class='navleft tf']//a[contains(text(),'动态')]/@href").extract() # 楼盘动态
dynamic = "".join(dynamic)
dynamic = u"https:" + dynamic
yield scrapy.Request(dynamic, callback=self.parse_dynamic)
except Exception as e:
pass
url = self.house_list.pop()
yield scrapy.Request(url, callback=self.home_page)
"""爬取点评打开注释"""
# try:
# comments = response.xpath(
# "//*[@class='navleft tf']//a[contains(text(),'点评')]/@href").extract_first() # 楼盘点评
# comments = "".join(comments)
# comments = u"https:" + comments
# yield scrapy.Request(comments, callback=self.parse_comments)
# except Exception as e:
# pass
"""爬取户型打开注释"""
# try:
# houseapartment = response.xpath(
# "//*[@class='navleft tf']//a[contains(text(),'户型')]/@href").extract_first() # 楼盘户型
# # houseapartment = "".join(houseapartment)
# houseapartment = u"https:" + houseapartment
# yield scrapy.Request(houseapartment, callback=self.parse_houseapartment)
# except Exception as e:
# pass
"""爬取相册打开注释"""
# try:
# houseImage = response.xpath(
# "//*[@class='navleft tf']//a[contains(text(),'相册')]/@href").extract_first() # 楼盘相册
# if not houseImage:
# yield {"_id": item['_id'], "houseImage": json.dumps([])}
# houseImage = u"https:" + houseImage
# yield scrapy.Request(houseImage, meta={"_id": response.url}, callback=self.parse_image_base)
# except Exception as e:
# pass
def parse_image_base(self, response):
"""
使用正则的方法匹配url,并且直接使用ajax请求
:return:
"""
html = response.text
_id = response.meta["_id"]
building_name = re.search(r"//(\w+)\.", response.url).group(1)
building_number = re.search(r"(\d+)\.htm", response.url).group(1)
module_dict = {}
image_list = []
re_effect_image = re.compile(r"\<a\W.*?\<span\>效果图\<\/span\>.*?\<\/a\>")
re_realsight_image = re.compile(r"\<a\W.*?\<span\>实景图\<\/span\>.*?\<\/a\>")
re_traffic_image = re.compile(r"\<a\W.*?\<span\>交通图\<\/span\>.*?\<\/a\>")
re_prototype_room = re.compile(r"\<a\W.*?\<span\>样板间\<\/span\>.*?\<\/a\>")
effect_image = re_effect_image.findall(html)
realsight_image = re_realsight_image.findall(html)
traffic_image = re_traffic_image.findall(html)
prototype_room = re_prototype_room.findall(html)
pattern_url = re.compile(r"//.*?htm")
pattern_num = re.compile(r"<em>(\d+)<\/em>")
# use interface to get data directly
effect_number, realsight_number, traffic_number, prototype_number = (0, 0, 0, 0)
if effect_image:
try:
effect_image_url = "http:" + pattern_url.findall(effect_image[0])[0]
effect_number = int(pattern_num.search(effect_image[0]).group(1))
module_dict.update({"xiaoguotu": [effect_number, effect_image_url, 904]})
except Exception as e:
print(str(e))
effect_number = 0
if realsight_image:
try:
realsight_image_url = "http:" + pattern_url.findall(realsight_image[0])[0]
realsight_number = int(pattern_num.search(realsight_image[0]).group(1))
module_dict.update({"shijingtu": [realsight_number, realsight_image_url, 903]})
except Exception as e:
print(str(e))
realsight_number = 0
if traffic_image:
try:
traffic_image_url = "http:" + pattern_url.findall(traffic_image[0])[0]
traffic_number = int(pattern_num.search(traffic_image[0]).group(1))
module_dict.update({"jiaotongtu": [traffic_number, traffic_image_url, 901]})
except Exception as e:
print(str(e))
traffic_number = 0
if prototype_room:
try:
prototype_room_url = "http:" + pattern_url.findall(prototype_room[0])[0]
prototype_number = int(pattern_num.search(prototype_room[0]).group(1))
module_dict.update({"yangbanjian": [prototype_number, prototype_room_url, 905]})
except Exception as e:
print(str(e))
prototype_number = 0
# 判断如果各种图全部没有,则直接返回
if effect_number + realsight_number + traffic_number + prototype_number == 0:
print("一张图片都没有")
yield {"_id": _id, "picJson": json.dumps("", ensure_ascii=False)}
else:
full_image_interface_list = []
for key, value in module_dict.items():
base_url = "http://" + building_name + ".fang.com/house/ajaxrequest/photolist_get.php?newcode=" + building_number + "&type=" + str(
value[2]) + "&room=&nextpage="
page_number = value[0] // 6 + 2 if value[0] % 6 else value[0] // 6 + 1
[full_image_interface_list.append([base_url + str(i), key]) for i in range(1, page_number)]
first_url = full_image_interface_list.pop()
meta = {"_id": _id, "request_list": full_image_interface_list, "type": first_url[1], "json_data": []},
yield scrapy.Request(first_url[0], meta={"item": meta}, callback=self.parse_images)
def parse_images(self, response):
"""
:param total_number: total number of images of effect
:param style_number: total number of images of effect
:return:
"""
item = dict(response.meta["item"][0])
TuUrl = ImageItem()
data_list = item["json_data"]
image_type = item["type"]
_id = item["_id"]
full_image_interface_list = item["request_list"]
if full_image_interface_list:
data = json.loads(response.body)
# print("相册借口", data)
[data_list.append({"picUrl": "http:" + re.sub(r"\d+x\d+\.", "880x600.", i["url_s"]), "type": image_type})
for i in data]
first_url = full_image_interface_list.pop()
meta = {"_id": _id, "request_list": full_image_interface_list, "type": first_url[1],
"json_data": data_list},
yield scrapy.Request(first_url[0], meta={"item": meta}, callback=self.parse_images)
else:
yield {"_id": _id, "picJson": json.dumps(data_list)}
data = json.loads(response.text)
for images in data:
TuUrl["image_urls"] = ["http:" + re.sub(r"\d+x\d+\.", "880x600.", images["url_s"])]
yield TuUrl
def parse_houseapartment(self, response):
# 解析户型页面 通过拼接 接口获取数据
data_url = response.url
building_name = re.sub(r"\w+\/\w+_\d+_\d+.htm", "", data_url)
building_number = re.search(r"(\d+)\.htm", data_url).group(1)
jiekou_url = building_name + "house/ajaxrequest/householdlist_get.php?newcode=" + building_number + "&room=all"
yield scrapy.Request(jiekou_url, meta={"house_url": building_name}, callback=self.house_interface)
def house_interface(self, response):
# 户型接口
item = DataHouseapartment()
house_url = response.meta["house_url"]
all_comment_dict = {"_id": house_url}
houseapartment = []
datas = json.loads(response.text)
for data in datas:
# item["houseUrl"]
images = []
imag = "http:" + re.sub("220x150", "748x600", data["houseimageurl"])
images.append({"picUrl": imag})
item["imgs"] = images # 户型名称
# item["_id"] = house_url # 户型url
item["name"] = data["housetitle"] # 户型名称
item["houseUrl"] = house_url + "photo/d_house_" + data["picID"] + ".htm"
item["salesStatus"] = data["status"] # 在售状态
item["roomNum"] = data["room"] # 户型(房)
item["hallNum"] = data["hall"] # 户型(厅)
item["toiletNum"] = data["toilet"] # 户型(卫)
item["constructSpace"] = data["buildingarea"]
# item["price"] = data["toilet"]
# item["propertyType"] = data["toilet"]
# item["remark"] = data["toilet"]
try:
if "-" in data["reference_price"]:
lower_price, high_price = data["reference_price"].split("-")
data["reference_price"] = str((float(lower_price) + float(high_price)) / 2)
except Exception as e:
print(str(e))
try:
item["price"] = int(float(data["reference_price"]) / float(data["buildingarea"]) * 10000) if \
data["reference_price"] != "待定" and data["buildingarea"] != "待定" \
and data["reference_price"] and data["buildingarea"] \
and float(data["reference_price"]) and \
float(data["buildingarea"]) else None # 参考均价
except Exception as e:
print(str(e))
if not data["reference_price"]:
item["totalPrices"] = ""
elif data["reference_price"] == "待定":
item["totalPrices"] = data["reference_price"]
else:
item["totalPrices"] = data["reference_price"] + "万元/套"
houseapartment.append(dict(item))
houseapartment = json.dumps(houseapartment, ensure_ascii=False)
all_comment_dict.update({"houseapartment": houseapartment})
yield all_comment_dict
def parse_comments(self, response):
# 解析评论
url = response.url
house = re.sub(r"dianping/", "", url)
particulars = response.xpath("//*[@class='navleft tf']//a[contains(text(),'详情')]/@href|"
"//*[@class='navleft tf']//a[contains(text(),'详细')]/@href").extract_first()
particulars = u"https:" + particulars
parameter = re.search(r"/(\d+)/", particulars).group(1)
comments_data = response.xpath("//*[@id='dpCount']/text()").extract_first()
comments_data = re.search(r"(\d+)", comments_data).group(1)
comments_data = int(comments_data)
port_url = house + "house/ajaxrequest/dianpingList_201501.php" # pc端接口
port = {
"dianpingNewcode": str(parameter),
"ifjiajing": "0",
# "page": "1",
"tid": "null",
"pagesize": str(comments_data),
"starnum": "6",
"shtag": "-1",
}
yield scrapy.FormRequest(url=port_url, method="POST", formdata=port, callback=self.comment_port) # 发送post请求
def comment_port(self, response):
item = DataCommentJson()
url = response.url
url = re.sub(r"house\/\w+\/\w+_\d+.php", "", url)
all_comment_dict = {"_id": url}
commentJson = []
# datas = json.loads(response.body)["list"]
datas = json.loads(response.text)["list"]
# datas = json.loads(response.body.decode("gb18030"))
for data in datas:
item["source"] = "房天下"
item["userNick"] = data["nickname"]
if item["userNick"] == "":
item["userNick"] = data["username"]
item["content"] = data["content"]
item["sourceUrl"] = url + "dianping/"
item["createDate"] = data["create_time"]
commentJson.append(dict(item))
# commentJson = json.dumps(commentJson, ensure_ascii=False)
all_comment_dict.update({"commentJson": commentJson})
yield all_comment_dict
def parse_dynamic(self, response):
# 解析动态
# dynamic_urls = []
dynamic = response.xpath("//*[@class='navleft tf']//a[contains(text(),'首页')]/@href").extract() # 楼盘首页
dynamic = "".join(dynamic)
_id = u"https:" + dynamic
try:
dynamic_url = response.xpath(
'//div[@id="gushi_all"]//a[contains(text(),"详情")]/@href').extract() # 获取动态里详情链接
# dynamic_url = "".join(dynamic_url)
if dynamic_url != None:
for one_dynameic_url in dynamic_url:
one_dynameic_url = u"https:" + one_dynameic_url
# yield scrapy.Request(one_dynameic_url, callback=self.dynamic_particulars)
self.dynamic_urls.append(one_dynameic_url)
the_next_page = response.xpath(
'//div[@id="gushi_all"]//li[@class="clearfix dbib"]//a[contains(text(),"下一页")]/@href').extract_first() # 下一页
if the_next_page is None:
url = self.dynamic_urls.pop()
yield scrapy.Request(url, callback=self.dynamic_particulars)
else:
the_next_page = _id + the_next_page
yield scrapy.Request(the_next_page, callback=self.parse_dynamic)
except Exception as e:
pass
# the_next_page = response.xpath('//div[@id="gushi_all"]//li[@class="clearfix dbib"]//a[contains(text(),"下一页")]/@href').extract_first() # 下一页
# print("0033",the_next_page)
# if the_next_page != None:
# the_next_page = _id + the_next_page
# yield scrapy.Request(the_next_page,callback=self.parse_dynamic)
def dynamic_particulars(self, response):
item = DataDynamicJson()
dynamic = response.xpath("//*[@class='navleft tf']//a[contains(text(),'首页')]/@href").extract() # 楼盘首页
dynamic = "".join(dynamic)
_id = u"https:" + dynamic
all_comment_dict = {"_id": _id}
# dynamicJson = []
url = response.url
url = re.sub(r"\d+_\d+\.htm", "", url)
dynamic_content = response.xpath("//div[@class='atc-wrapper']")
for i in dynamic_content:
item["soutse"] = "房天下"
item["title"] = i.xpath("./h1/text()").extract_first()
item["publishDate"] = i.xpath("./h2/text()[3]").extract_first()
item['publishDate'] = re.search(r"\d+.*", item["publishDate"], re.S).group() # 时间
# item["publishDate"] = item["publishDate"].replace(" ", "")
item["publishDate"] = item["publishDate"].replace("\n", "")
item["publishDate"] = item["publishDate"].replace("\t", "")
item["publishDate"] = item["publishDate"].replace("\r", "")
# time = "".join(time)
# data["publishDate"] =re.search(r"/d+.*",time,re.S).group()
item["content"] = i.xpath(
".//p[@style='text-indent:2em;']//text()|//div[@class='leftboxcom']//text()").extract()
item["content"] = "".join(item["content"])
item["content"] = item["content"].replace(" ", "")
item["content"] = item["content"].replace("\n", "")
item["content"] = item["content"].replace("\t", "")
item["content"] = item["content"].replace("\r", "")
self.dynamicJson.append(dict(item))
the_next_page1 = response.xpath('//div[@class="fy-wrapper"]/a[@class="xyp"]/@href').extract_first()
the_next_page = url + the_next_page1
if self.dynamic_urls == []:
# if the_next_page1 == "javascript:void(0);":
all_comment_dict.update({"dynamicJson": self.dynamicJson})
yield all_comment_dict
self.dynamicJson.clear()
else:
url = self.dynamic_urls.pop()
yield scrapy.Request(url, callback=self.dynamic_particulars)
def positioning(self, response):
item = response.meta["item"]
particulars = response.meta["xiangqing"]
ditu = response.body.decode("utf8")
# re_search = re.search(r'"mapx":"(.*?)","mapy":"(.*?)"', ditu, re.DOTALL)
re_search = re.search(r'"mapx":"(\d+\.\d+)","mapy":"(\d+\.\d+)"', ditu, re.DOTALL)
housecoord = re_search.group(2) + "," + re_search.group(1)
item["housecoord"] = housecoord
yield scrapy.Request(particulars, meta={"item": item}, callback=self.parse_particulars)
def parse_particulars(self, response):
# 解析详情页
url = re.sub(r"house/\d+/\w+.htm", "", response.url)
pattern = re.compile(r'\W+', re.S)
html = response.body.decode("gb18030")
soup = BeautifulSoup(html, "html.parser")
html = etree.HTML(html)
item = response.meta['item']
item['housename'] = response.xpath('//*[@id="daohang"]//h1/a/text()').extract() # 楼盘名称
item['housename'] = "".join(item['housename'])
try:
housename2 = response.xpath('//*[@id="daohang"]//div/span/text()').extract() # 楼盘别名
housename2 = "".join(housename2)
item['housename2'] = housename2[3:] # 字符串切片去掉前面三个字符
if not item['housename2']:
item['housename2'] = ""
except Exception as e:
item['housename2'] = None
houseproperty = response.xpath('//div[@class="lpicon tf"]//text()').extract() # 楼盘标签
houseproperty = [pattern.sub('', i) for i in houseproperty]
re_houseproperty = []
[re_houseproperty.append(i)
for i in houseproperty if i]
houseproperty = ",".join(re_houseproperty) # 空格替换逗号
houseproperty = "".join(houseproperty)
item["houseproperty"] = houseproperty
# ---------------------------预售证------------------------
try:
basic_information = response.xpath(
"//div//h3[contains(text(),'销售信息')]/..//div[@class='table-all']//tr[position()>1]")
if basic_information == []:
basic_information = response.xpath(
"//div//h3[contains(text(),'销售信息')]/..//div[@class='table-part']//tr[position()>1]")
except Exception as e:
basic_information = None
pass
all_comment_dict = {"_id": url}
presale = []
for i in basic_information:
# 基本信息
# data_lists = []
data = {}
# data = DataPresale()
budgetLicence = i.xpath(".//td[1]/text()").extract()
data['budgetLicence'] = "".join(budgetLicence)
licenceDate = i.xpath(".//td[2]/text()").extract()
data["licenceDate"] = "".join(licenceDate) # 获取时间
pattern = re.compile(r'(\d{4}).*?(\d{1,2}).*?(\d{1,2})')
pattern_without_day = re.compile(r'(\d{4}).*?(\d{1,2})')
if data["licenceDate"]:
re_serch = pattern.search(data["licenceDate"])
if re_serch:
start_year, start_month, start_day = re_serch.group(1), re_serch.group(2), re_serch.group(3)
start_month, start_day = start_month.rjust(2, '0'), start_day.rjust(2, '0')
data["licenceDate"] = start_year + "-" + start_month + "-" + start_day + " 00:00:00"
else:
try:
re_serch = pattern_without_day.search(data["licenceDate"])
start_year, start_month = re_serch.group(1), re_serch.group(2)
start_month = start_month.rjust(2, '0')
data["licenceDate"] = start_year + "-" + start_month + "-01 00:00:00"
except:
pass
# data['bindingHouse'] = i.find_element_by_xpath(".//td[3]").text
bindingHouse = i.xpath(".//td[3]/text()").extract()
data['bindingHouse'] = "".join(bindingHouse)
# if not data['bindingHouse'] and not data["licenceDate"] and not data['budgetLicence']:
# continue
# data_lists.append(data)
presale.append(data)
# presale = json.dumps(presale, ensure_ascii=False)
all_comment_dict.update({"presale": presale})
yield all_comment_dict
# ---------------预售证----------------------
basic_information = response.xpath("//div[@class='main-left']")
for i in basic_information:
# 基本信息
# item['_id'] = url # 楼盘url
item['source'] = "房天下" # 来源
item['allstatus'] = "1" # 采集状态
price = i.xpath('./div[1]//em/text()').extract() # 均价
price = ''.join(price)
try:
price = price.replace("\n", "")
price = price.replace("\t", "")
price = price.replace(" ", "")
except Exception as e:
pass
try:
item['houseprice'] = re.search(r"\d+.*", price, re.S).group() # 取出数字及后面的字
except Exception as e:
item['houseprice'] = "待定"
book_list = soup.find(attrs={"class": "main-left"})
book_list_name = book_list.find_all("li")
data_dict = {}
for i in book_list_name:
key = i.find(attrs={"class": "list-left"})
try:
key = key.text
except Exception as e:
pass
value = i.find(attrs={"class": ["list-right", "list-right-text", "list-right-floor"]}) # 获取两个class名
try:
value = value.text
except Exception as e:
pass
try:
key = key.replace(" ", "")
key = key.replace("\n", "")
key = key.replace("\t", "")
except Exception as e:
pass
try:
value = value.replace("\n", "")
# value = value.replace(" ", ",")
value = value.replace("\t", "")
except Exception as e:
pass
data_dict.update({key: value})
# 基本信息
if "物业类别:" in data_dict.keys():
item['houseatr'] = data_dict["物业类别:"]
item['houseatr'] = item['houseatr'].replace(",", "")
item['houseatr'] = item['houseatr'].replace(" ", "")
if "建筑类别:" in data_dict.keys():
item['housetype'] = data_dict["建筑类别:"]
item['housetype'] = item['housetype'].replace(" ", ",")
elif "写字楼级别:" in data_dict.keys():
item['housetype'] = data_dict["写字楼级别:"]
# item['housetype'] = item['housetype'].replace(" ", ",")
if "产权年限:" in data_dict.keys():
item['years'] = data_dict["产权年限:"]
item['years'] = item['years'].replace(",", "")
if "装修状况:" in data_dict.keys():
item['decoration'] = data_dict["装修状况:"]
if "开发商:" in data_dict.keys():
item['developer'] = data_dict["开发商:"]
if "楼盘地址:" in data_dict.keys():
item['houseaddress'] = data_dict["楼盘地址:"]
# 销售信息
if "销售状态:" in data_dict.keys():
item['salestatus'] = data_dict["销售状态:"]
item['salestatus'] = item['salestatus'].replace(" ", "")
if "开盘时间:" in data_dict.keys():
item['startSaleString'] = data_dict["开盘时间:"]
if "交房时间:" in data_dict.keys():
item['endSaleString'] = data_dict["交房时间:"]
if "售楼地址:" in data_dict.keys():
item['saleaddress'] = data_dict["售楼地址:"]
# 小区规划
if "占地面积:" in data_dict.keys():
landarea = data_dict["占地面积:"]
data_re = re.findall(r"\d+", landarea, re.S) # 取出数字
item['landarea'] = ("".join(data_re)) # 列表转字符串
if "建筑面积:" in data_dict.keys():
housearea = data_dict["建筑面积:"]
data_re = re.findall(r"[\d\.]+", housearea, re.S) # 取出数字
item['housearea'] = ("".join(data_re)) # 列表转字符串
if "容积率:" in data_dict.keys():
item['plotratio'] = data_dict["容积率:"]
item['plotratio'] = ''.join(item['plotratio'].split())
if "绿化率:" in data_dict.keys():
item['greenrate'] = re.sub(r'\%', '', data_dict["绿化率:"]) # 去掉%
if item['greenrate'] == "暂无资料":
item['greenrate'] = None
if "停车位:" in data_dict.keys():
item['carsite'] = data_dict["停车位:"]
try:
item['carsite'] = item['carsite'].replace("\r", "")
item['carsite'] = item['carsite'].replace("\n", "")
item['carsite'] = item['carsite'].replace("\t", "")
item['carsite'] = item['carsite'].replace(" ", "")
except Exception as e:
pass
elif "停车位配置:" in data_dict.keys():
item['carsite'] = data_dict["停车位配置:"]
try:
item['carsite'] = item['carsite'].replace("\r", "")
item['carsite'] = item['carsite'].replace("\n", "")
item['carsite'] = item['carsite'].replace("\t", "")
item['carsite'] = item['carsite'].replace(" ", "")
except Exception as e:
pass
if "楼栋总数:" in data_dict.keys():
housecount = data_dict["楼栋总数:"]
data_re = re.findall(r"\d+", housecount, re.S) # 取出数字
item['housecount'] = ("".join(data_re)) # 列表转字符串
item['housecount'] = item['housecount'].replace(" ", "")
elif "楼栋情况:" in data_dict.keys():
item['housecount'] = data_dict["楼栋情况:"]
item['housecount'] = item['housecount'].replace(" ", "")
if "总户数:" in data_dict.keys():
allcount = data_dict["总户数:"]
data_re = re.findall(r"\d+", allcount, re.S) # 取出数字
item['allcount'] = ("".join(data_re)) # 列表转字符串
if "物业公司:" in data_dict.keys():
item['managecompany'] = data_dict["物业公司:"]
if "物业费:" in data_dict.keys():
item['managefee'] = data_dict["物业费:"]
item['managefee'] = "".join(item['managefee'].split()) # 去掉\xa0字符
if "楼层状况:" in data_dict.keys():
item['floorCondition'] = data_dict["楼层状况:"]
item['fetch_time'] = str(datetime.now()) # 获取当前时间
pattern = re.compile(r'(\d{4}).*?(\d{1,2}).*?(\d{1,2})')
pattern_without_day = re.compile(r'(\d{4}).*?(\d{1,2})')
if item['startSaleString']:
re_serch = pattern.search(item["startSaleString"])
if re_serch:
start_year, start_month, start_day = re_serch.group(1), re_serch.group(2), re_serch.group(3)
start_month, start_day = start_month.rjust(2, '0'), start_day.rjust(2, '0')
item["startsaletime"] = start_year + "-" + start_month + "-" + start_day + " 00:00:00"
else:
try:
re_serch = pattern_without_day.search(item["startSaleString"])
start_year, start_month = re_serch.group(1), re_serch.group(2)
start_month = start_month.rjust(2, '0')
item["startsaletime"] = start_year + "-" + start_month + "-01 00:00:00"
except:
pass
if item["endSaleString"]:
re_serch = pattern.search(item["endSaleString"])
if re_serch:
start_year, start_month, start_day = re_serch.group(1), re_serch.group(2), re_serch.group(3)
start_month, start_day = start_month.rjust(2, '0'), start_day.rjust(2, '0')
item["endsaletime"] = start_year + "-" + start_month + "-" + start_day + " 00:00:00"
else:
try:
re_serch = pattern_without_day.search(item["endSaleString"])
start_year, start_month = re_serch.group(1), re_serch.group(2)
start_month = start_month.rjust(2, '0')
item["endsaletime"] = start_year + "-" + start_month + "-" + "-01 00:00:00"
except:
pass
for key, value in item.items():
if value and value.endswith(","):
item[key] = value[:-1]
if value and type(value) == str and '[' in value: # 去掉[]内的内容
item[key] = re.sub(r'[^\w]?\[.*?\]', '', value)
yield item