# NOTE: This code is old and may need to be adjusted.
# coding: utf-8
"""Scrape rental listings from ganji.com and write them to ``ganji.csv``.

For each result page the script selects every listing node, extracts the
title, the last area link, the price and the absolute listing URL, and
appends them as one CSV row. Paging stops at ``end_page`` or at the first
page that yields no listing nodes.
"""
import csv
import sys

import html5lib  # imported by the original script; the parser used below is 'html.parser'
import requests
from bs4 import BeautifulSoup  # CSS selectors locate the data, no regex needed

try:
    from urlparse import urljoin  # Python 2
except ImportError:
    from urllib.parse import urljoin  # Python 3

# Listing-page URL template. The original string had no replacement fields,
# so URL.format(page=..., price=...) returned the same address every time.
# The field names come from that format() call; the "o<page>p<price>" path
# layout is an assumption -- TODO confirm against the live site.
URL = 'http://hn.ganji.com/fang1/o{page}p{price}/'
ADDR = 'http://hn.ganji.com/'  # base used to resolve relative listing links

PY2 = sys.version_info[0] == 2


def _open_csv(path):
    """Open *path* for ``csv.writer`` in the mode this interpreter needs."""
    if PY2:
        # Python 2's csv module writes byte strings.
        return open(path, 'wb')
    # Python 3's csv module writes text and handles line endings itself.
    return open(path, 'w', newline='', encoding='utf-8')


def _cell(text):
    """Return *text* in the form ``csv.writer`` accepts on this interpreter."""
    if PY2 and not isinstance(text, str):
        # On Python 2 a ``unicode`` value must be encoded before writing.
        return text.encode('utf-8')
    return text


def _parse_listing(house):
    """Return ``[title, area, price, url]`` for one listing tag.

    Returns ``None`` when any of the expected nodes (or the title link's
    ``href``) is missing, so one malformed listing does not abort the run.
    """
    title_link = house.select_one('.title > a')
    area_links = house.select('.address > .area > a')
    price_tag = house.select_one('.info > .price > .num')
    if title_link is None or not area_links or price_tag is None:
        return None
    href = title_link.get('href')
    if href is None:
        return None
    # get_text() is used instead of .string because .string is None whenever
    # the tag contains more than one child node.
    return [
        _cell(title_link.get_text(strip=True)),
        _cell(area_links[-1].get_text(strip=True)),  # [-1]: the LAST area link
        _cell(price_tag.get_text(strip=True)),
        _cell(urljoin(ADDR, href)),
    ]


def main(start_page=1, end_page=10, price=7):
    """Scrape pages ``start_page``..``end_page`` (inclusive) into ganji.csv.

    ``price`` is substituted into the ``{price}`` field of ``URL``; what the
    number means is defined by the site -- TODO confirm.
    """
    # The ``with`` block closes the file automatically when scraping ends.
    with _open_csv('ganji.csv') as f:
        # Comma-delimited rows, e.g. title,area,price,url
        csv_writer = csv.writer(f, delimiter=',')
        print('start..........')
        # range() visits start_page first; the original incremented the
        # counter before fetching, which skipped the first page and
        # requested one page past end_page.
        for page in range(start_page, end_page + 1):
            page_url = URL.format(page=page, price=price)
            print('get:{0}'.format(page_url))
            # A timeout keeps a stalled connection from hanging the script.
            response = requests.get(page_url, timeout=10)
            # Second argument selects the parser (Python's built-in one).
            html = BeautifulSoup(response.text, 'html.parser')
            # The selector mirrors the nesting of the listing markup.
            house_list = html.select(
                '.f-list > .f-list-item > .f-list-item-wrap')
            if not house_list:
                # No listing nodes on this page: stop paging.
                break
            for house in house_list:
                row = _parse_listing(house)
                if row is not None:
                    csv_writer.writerow(row)
    print('end.........')


if __name__ == '__main__':
    main()