这个 bug 折腾了我一个下午加一个晚上,终于把它 KO 掉了。
先附上一段爬取安居客二手房信息的代码
import re
import time
import pymongo
import requests
from bson import ObjectId
from lxml import etree
from pprint import pprint
# HTTP request headers sent with every page request: a desktop Chrome
# User-Agent, a cookie string captured from a browser session, and a referer
# pointing at the Shanghai landing page.
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36",
    "cookie": "aQQ_ajkguid=243E5D58-8B13-D7BD-4922-3DE583E03855; ctid=11; _ga=GA1.2.1030980732.1530799904; _gid=GA1.2.506397644.1530799904; 58tj_uuid=c606f59a-2fb9-4c91-9815-741fdf9cfe5d; als=0; lps=http%3A%2F%2Fwww.anjuke.com%2F%3Fpi%3DPZ-baidu-pc-all-biaoti%7Chttps%3A%2F%2Fwww.baidu.com%2Fs%3Fie%3Dutf-8%26f%3D8%26rsv_bp%3D0%26rsv_idx%3D1%26tn%3Dbaidu%26wd%3D%25E5%25AE%2589%25E5%25B1%2585%25E5%25AE%25A2%26rsv_pq%3Dd71198bd000395ca%26rsv_t%3D6172VDlcx2zzRQ%252FLyCdcEidtafr%252BSvVyVXrlZ0lsK3U1MEz8066IF4byz4c%26rqlang%3Dcn%26rsv_enter%3D1%26rsv_sug3%3D5%26rsv_sug1%3D5%26rsv_sug7%3D101; twe=2; sessid=3497C1D2-43A8-6143-B2D7-CFDA33FF0C0E; new_uv=2; __xsptplus8=8.2.1530840314.1530840335.2%232%7Cwww.baidu.com%7C%7C%7C%25E5%25AE%2589%25E5%25B1%2585%25E5%25AE%25A2%7C%23%23Z7v3XnqLDcxTHeMLiqLXQSLHvXrh8k_R%23",
    "referer": "https://shanghai.anjuke.com/?pi=PZ-baidu-pc-all-biaoti",
}
# Connect to the MongoDB server at 127.0.0.1:27017.
client = pymongo.MongoClient(host='127.0.0.1', port=27017)
# Select the database by name.
db = client['anjuke']
# Select the collection that save() writes the scraped listings into.
coll = db['ershoufang']
def _first_text(node, path):
    """Return the stripped first result of the XPath query ``path`` on ``node``.

    Raises:
        IndexError: if the query matches nothing.
    """
    return node.xpath(path)[0].strip()


def _optional_text(node, path):
    """Same as ``_first_text`` but return ``None`` when nothing matches."""
    try:
        return _first_text(node, path)
    except IndexError:
        return None


def get_info():
    """Scrape the Anjuke Shanghai sale listing pages and save every listing.

    Requests pages ``p0`` through ``p22``, reads each ``<li>`` under the
    element with id ``houselist-mod-new``, builds one dict per listing and
    passes it to ``save``. Prints the running number of saved listings.

    The three ``h_youshi*`` fields are optional and stored as ``None`` when
    absent; every other field is mandatory.

    Raises:
        IndexError: if a listing lacks one of the mandatory fields
            (``h_addr``, ``h_type``, ``h_area``, ``h_hight``, ``h_name``,
            ``h_price``).
    """
    count = 0
    # NOTE(review): range(23) starts at page 0 - confirm the site serves "p0".
    for i in range(23):
        response = requests.get(
            'https://shanghai.anjuke.com/sale/p{}/#filtersort'.format(i),
            headers=headers,
        )
        # etree.HTML parses the response text into an HTML document tree.
        html = etree.HTML(response.text)
        for h in html.xpath('//*[@id="houselist-mod-new"]/li'):
            # Build a NEW dict for every listing. save() hands the dict to
            # PyMongo, which writes the generated `_id` back into it; reusing
            # a single dict across listings would resend that same `_id` and
            # the second insert would fail with a duplicate-key error.
            house = {
                'h_addr': _first_text(h, './div[2]/div[1]/a/text()'),
                'h_type': _first_text(h, './div[2]/div[2]/span[1]/text()'),
                'h_area': _first_text(h, './div[2]/div[2]/span[2]/text()'),
                'h_hight': _first_text(h, './div[2]/div[2]/span[3]/text()'),
                'h_name': _first_text(h, './div[2]/div[2]/span[4]/text()'),
                'h_youshi1': _optional_text(h, './div[2]/div[4]/span[1]/text()'),
                'h_youshi2': _optional_text(h, './div[2]/div[4]/span[2]/text()'),
                'h_youshi3': _optional_text(h, './div[2]/div[4]/span[3]/text()'),
                'h_price': _first_text(h, './div[3]/span[1]/strong/text()'),
            }
            time.sleep(0.01)
            save(house)
            count += 1
            print(count)
def save(house):
    """Insert one listing document into the ``coll`` collection.

    Args:
        house: dict describing a single listing. PyMongo adds an ``_id`` key
            to this dict in place when it has none, so pass a fresh dict for
            every document.
    """
    # Collection.insert() was deprecated in PyMongo 3.0 and removed in 4.0;
    # insert_one() is its single-document replacement.
    coll.insert_one(house)
def main():
    """Script entry point: run the scraper by calling ``get_info``."""
    get_info()
# Start the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()
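当 house = {} 定义在内层 for 循环之外时,这段代码只能处理两条数据:第一条插入成功,第二条插入时就会报主键重复的错误(DuplicateKeyError)。
原因是 pymongo 在插入时会把生成的 '_id' 写回传入的字典:两条数据共用同一个 house 字典,第一条插入前没有 '_id',插入后就带上了 '_id';第二条带着同一个 '_id' 再次插入,于是失败。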
目前有两种解决方案:
方法一:在程序中自己为每条数据设置 '_id' 字段,代替系统自动分配:
这样程序可以正常运行:
方法二:把 house = {} 这个字典的创建放到内层 for 循环里面,让每条数据都使用一个新的字典:
这两种方法都可以解决问题,个人更推荐方法二:代码更规范,'_id' 也交给系统自动分配。