1. 运行环境

Python版本：3.7
运行代码需要安装 BeautifulSoup（bs4）和 requests 两个第三方库，这里就不再说明如何安装。

2. 代码说明

主要使用 BeautifulSoup 解析网页标签。其中58同城租房页面上的价格不知何故总是显示为无法识别的生僻汉字，所以自己写了一个函数把它们转换成数字。其它部分都比较简单。

最后，不保证是最优代码，但是可运行。

#!/usr/bin/python
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
import requests
import csv
import codecs
import sys
# importlib.reload is the supported replacement for imp.reload; the imp
# module is deprecated and no longer exists in Python 3.12+.
from importlib import reload
reload(sys)


# Listing URL template. "b5" selects a price band (b4, b3, ... are other
# bands; choose whichever you need) and "lz" stands for Lanzhou.
url = "https://lz.58.com/chuzu/b5/pn{page}"

# Number of result pages already processed; starts at 0.
page = 0

# Results are saved to the local file rent.csv. newline='' is required by the
# csv module so that the writer's own "\r\n" row terminator is not translated
# a second time (which produces blank rows between records on Windows).
csv_file = open("rent.csv", "w", encoding='utf-8-sig', newline='')
csv_writer = csv.writer(csv_file, delimiter=',')

# Convert a price string as served by 58.com into ordinary digits.
def convertNum(moneychar):
    """Translate the placeholder characters used for prices into digits.

    Each of the ten known placeholder characters stands for one decimal
    digit (its position in ``arr`` is the digit value). Plain ASCII digits
    are passed through unchanged, so a price that is already readable does
    not cause a failure.

    Args:
        moneychar: Iterable of single characters (normally a ``str``).

    Returns:
        The decoded price as a string of ASCII digits; ``""`` for empty input.

    Raises:
        ValueError: If a character is neither a known placeholder nor an
            ASCII digit.
    """
    arr = ['龤','龒','閏','麣','餼','驋','龥','鑶','鸺','齤'] #0-9
    # Dictionary lookup instead of a linear list.index() per character.
    lookup = {glyph: str(value) for value, glyph in enumerate(arr)}
    lookup.update({digit: digit for digit in "0123456789"})

    decoded = []
    for char in moneychar:
        if char not in lookup:
            raise ValueError("unknown price character: %r" % (char,))
        decoded.append(lookup[char])
    return "".join(decoded)

# Crawl every result page in turn until a page comes back with no listings.
# The try/finally guarantees rent.csv is closed (and flushed) even when a
# request or a parse step raises part-way through the crawl.
try:
    while True:
        page += 1
        page_url = url.format(page=page)
        print("fetch: ", page_url)
        # A timeout keeps one stalled connection from hanging the crawl forever.
        response = requests.get(page_url, timeout=30)
        html = BeautifulSoup(response.text, "html.parser")
        house_list = html.select(".house-list > li")

        # The crawl ends as soon as a page yields no listings.
        if not house_list:
            print("over...")
            break

        for house in house_list:
            # Locate the relevant tags according to the page's markup.
            title_links = house.select("h2>a")
            all_links = house.select("a")
            info_links = house.select(".des>.infor>a")
            money_tags = house.select(".money")
            price_tags = money_tags[0].select("b") if money_tags else []

            # Discard listings with incomplete information. Only this one
            # listing is skipped (continue, not break), so the rest of the
            # page is still processed; fewer than two info links means the
            # residential-area link is missing.
            if (not title_links or not all_links
                    or len(info_links) < 2 or not price_tags):
                continue

            # Title
            house_title = title_links[0].string
            # Link
            house_url = all_links[0]["href"]
            # Residential area
            house_location = info_links[1].string
            # Price
            house_money = price_tags[0].string
            if house_money is None:
                # .string is None when the tag does not hold exactly one
                # text node; there is nothing to decode in that case.
                continue

            # The price is served as placeholder characters and must be
            # converted to digits.
            house_money = convertNum(house_money)

            # Append the record to rent.csv.
            csv_writer.writerow([house_title, house_location, house_money, house_url])
finally:
    csv_file.close()

huanqing2010

发布了58 篇原创文章 · 获赞 44 · 访问量 18万+

私信关注

【Python】Python扒取58同城租房信息到本地文件

1. 运行环境

2. 代码说明

猜你喜欢