import requests
import bs4
import time
import random
import pandas as pd
import os
# Accumulates one single-row DataFrame per listing; concatenated at the end.
house_info = []
headers = {
    # Pretend to be a desktop Chrome browser so the site serves the normal page.
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36"
}
# for i in range(1,50):
for i in range(1, 5):
    url = "https://cs.fang.anjuke.com/loupan/all/p" + str(i) + "/#filtersort"
    print("开始爬取安居客平台长沙新房第%s页信息....." % (str(i)))
    # timeout so a stalled connection cannot hang the whole scrape
    response = requests.get(url=url, headers=headers, timeout=30)
    # Save the raw page for debugging / offline re-parsing. Mode 'w' (not 'a+')
    # so a re-run overwrites instead of appending a duplicate copy of the HTML.
    os.makedirs('anjukecs/', exist_ok=True)
    with open('anjukecs/page{}.html'.format(i), 'w', encoding='utf-8') as f:
        f.write(response.text)
    # Build the bs4 parse tree; each listing card lives in a div.infos element.
    bsoup = bs4.BeautifulSoup(response.text, 'lxml')
    house_list = bsoup.find_all('div', class_="infos")
    for house in house_list:
        # Any of these sub-elements may be missing on a card, in which case
        # house.find(...) returns None and the .text access raises
        # AttributeError — catch only that, not every exception.
        title = house.find('a').text.strip()
        try:
            house_type = house.find('a', class_='huxing').text.replace('\t', '').replace('\n', '').strip()
        except AttributeError:
            house_type = ''
        try:
            area = house.find('span', class_='building-area').text
        except AttributeError:
            area = ''
        try:
            address = house.find('a', class_='address').span.text.replace(" ", "").strip()
        except AttributeError:
            address = ''
        pd1 = pd.DataFrame({'title': title, 'house_type': house_type,
                            'area': area, 'address': address}, index=[0])
        house_info.append(pd1)
    # Random polite delay between page requests to avoid being rate-limited.
    second = random.randrange(3, 5)
    time.sleep(second)
# pd.concat raises ValueError on an empty list, so only write if we got data.
if house_info:
    house_info2 = pd.concat(house_info)
    house_info2.to_excel('cs_house_info.xlsx', index=False)
# 爬取安居客长沙新房的位置、户型、面积等信息。
# (Scrapes location, floor plan, area, etc. of Changsha new homes from Anjuke.)