什么是Selenium
Selenium 是一套完整的 Web 应用程序测试系统,包含测试的录制(Selenium IDE)、编写及运行(Selenium Remote Control)和测试的并行处理(Selenium Grid)。Selenium 的核心 Selenium Core 基于 JsUnit,完全由 JavaScript 编写,因此可以用于任何支持 JavaScript 的浏览器。
如果对 Selenium 还不了解,可以先自行搜索(例如百度)相关资料。
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from time import sleep
import json
import re
class FTspider(object):
    """Selenium spider that stores the "动态" (news) articles of fang.com houses.

    ``run`` walks the paginated new-house listing under
    ``nc.newhouse.fang.com``. For every house on a page it follows the house's
    "动态" navigation link, opens each linked article and appends one JSON
    line per house to ``OUTPUT_PATH``.
    """

    # Locator strategy string; same value as selenium's ``By.XPATH``.
    _XPATH = "xpath"
    # Listing URL template; ``{}`` receives the page counter ``self.num``.
    LIST_URL = "http://nc.newhouse.fang.com/house/s/b9{}"
    # ``run`` stops once the page counter exceeds this value.
    LAST_PAGE = 16
    # Default JSON-lines file that ``save_data`` appends to.
    OUTPUT_PATH = '动态3100000号终极(南昌).jsonlines'

    def __init__(self):
        """Start a Chrome session and point the spider at listing page 1."""
        self.driver = webdriver.Chrome()
        self.num = 1
        self.base_urls = self.LIST_URL.format(self.num)

    def _find_all(self, parent, xpath):
        """Return every element under *parent* (driver or element) matching *xpath*."""
        return parent.find_elements(self._XPATH, xpath)

    def _first_text(self, parent, xpath):
        """Return the text of the first match of *xpath*, or ``""`` if none."""
        matches = self._find_all(parent, xpath)
        return matches[0].text if matches else ""

    def _hrefs(self, anchors):
        """Return the non-empty ``href`` attribute values of *anchors*."""
        return [
            href
            for href in (anchor.get_attribute('href') for anchor in anchors)
            if href
        ]

    def _parse_article(self, article_url):
        """Open *article_url* and return its fields, or ``None`` to skip it.

        An article is skipped when the page has no ``atc-wrapper`` container
        or when its title is empty.
        """
        self.driver.get(article_url)
        wrappers = self._find_all(self.driver, "//div[@class='atc-wrapper']")
        if not wrappers:
            return None
        wrapper = wrappers[0]

        title = self._first_text(wrapper, "./h1")
        if not title:
            return None

        date_text = self._first_text(wrapper, "./h2")
        # Keep everything from the first digit onwards; if the text holds no
        # digit at all, fall back to the raw text instead of crashing.
        date_match = re.search(r"\d+.*", date_text, re.S)
        publish_date = date_match.group() if date_match else date_text

        paragraphs = self._find_all(
            wrapper,
            ".//div[@class='leftboxcom']//p[@style='text-indent:2em;']")
        if paragraphs:
            content = "".join(paragraph.text + "\n" for paragraph in paragraphs)
        else:
            # No indented paragraphs: take the text of the first matching
            # content node instead.
            content = self._first_text(
                wrapper,
                ".//div[@class='leftboxcom']|//div[@class='leftboxcom']//a")

        return {
            "source": "房天下",
            "title": title,
            "publishDate": publish_date,
            "content": content,
        }

    def xinfang_list(self):
        """Scrape every house linked from the currently loaded listing page.

        For each house one record ``{"_id": <house url>, "dynamicJson":
        <JSON-encoded list of articles>}`` is passed to ``save_data``. Houses
        without a "动态" link are skipped and produce no record.

        Returns:
            list: All article dicts collected from this listing page.
        """
        # Read the hrefs before navigating away: the anchor elements belong
        # to the listing page and cannot be queried after ``driver.get``.
        house_urls = self._hrefs(
            self._find_all(self.driver, '//*[@class="clearfix"]/div/a '))

        data_list = []
        for url in house_urls:
            self.driver.get(url)

            news_links = self._hrefs(self._find_all(
                self.driver,
                "//*[@class='navleft tf']//a[contains(text(),'动态')]"))
            if not news_links:
                # This house page has no "动态" entry in its navigation bar.
                continue
            self.driver.get(news_links[0])

            article_urls = self._hrefs(self._find_all(
                self.driver,
                '//div[@id="gushi_all"]/ul/li[@id="xflpdt_A02_01"]//p//a'))

            articles = []
            for article_url in article_urls:
                article = self._parse_article(article_url)
                if article is not None:
                    articles.append(article)
            data_list.extend(articles)

            # The article list is stored as a JSON string inside the record,
            # which keeps the on-disk format of earlier output files.
            self.save_data({
                "_id": url,
                "dynamicJson": json.dumps(articles, ensure_ascii=False),
            })
        return data_list

    def save_data(self, data_list, path=None):
        """Append *data_list* as one JSON line to the output file.

        Args:
            data_list: JSON-serialisable record (``xinfang_list`` passes a dict).
            path: File to append to; defaults to ``OUTPUT_PATH``.
        """
        target = self.OUTPUT_PATH if path is None else path
        with open(target, 'a', encoding='utf8') as f:
            f.write(json.dumps(data_list, ensure_ascii=False))
            f.write('\n')

    def __del__(self):
        """Quit the browser when the spider is garbage-collected."""
        # ``driver`` is missing when ``__init__`` failed before creating it.
        driver = getattr(self, "driver", None)
        if driver is not None:
            driver.quit()

    def run(self):
        """Crawl listing pages from ``self.num`` until it exceeds ``LAST_PAGE``."""
        while True:
            # Load the listing page, then scrape the houses it links to.
            self.driver.get(self.base_urls)
            self.xinfang_list()
            self.num += 1
            self.base_urls = self.LIST_URL.format(self.num)
            if self.num > self.LAST_PAGE:
                break
if __name__ == '__main__':
    # Script entry point: create the spider (which opens the browser) and
    # crawl the listing pages.
    spider = FTspider()
    spider.run()