Python learning: parsing web pages

# -*- coding: utf-8 -*-
"""
Created on Thu Oct 17 14:04:21 2019

@author: DELL
"""
"""
BeautifulSoup解析数据
lxml是个容错率高效的解析器

"""
import requests
from bs4 import BeautifulSoup
import pandas as pd
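
A quick standalone sketch of the fault tolerance claimed above: the 'lxml' backend repairs sloppy markup such as unclosed tags (the sample HTML here is made up for illustration).

from bs4 import BeautifulSoup

broken = '<ul><li>one<li>two'                   # unclosed <li> tags
soup = BeautifulSoup(broken, 'lxml')            # lxml closes them while building the tree
print([li.text for li in soup.find_all('li')])  # -> ['one', 'two']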


def get_urls(n):  # build the list-page URL for each of the first n pages
    urls = []
    for i in range(1, n + 1):  # one URL per page
        urls.append('https://travel.qunar.com/p-cs299878-shanghai-jingdian-1-%s' % i)
    return urls
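
For example, the first two URLs the function produces:

# get_urls(2) ->
# ['https://travel.qunar.com/p-cs299878-shanghai-jingdian-1-1',
#  'https://travel.qunar.com/p-cs299878-shanghai-jingdian-1-2']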


"""
获取每页数据

"""
def get_data(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'lxml')               # parse the page
    ul = soup.find('ul', class_='list_item clrfix')    # locate the attraction list
    lis = ul.find_all('li')                            # one <li> tag per attraction

    datalst = []  # collects one dict per attraction
    for li in lis:
        dic = {}  # holds the fields scraped on this pass
        dic['lng'] = li['data-lng']  # longitude, read from the data-lng attribute
        dic['lat'] = li['data-lat']  # latitude
        dic['景点名称'] = li.find('span', class_='cn_tit').text  # attraction name; .text returns the tag's text
        dic['星级'] = li.find('span', class_='cur_star')['style'].split(':')[1].replace('%', '')  # star rating; attributes are read with []
        datalst.append(dic)
    return datalst  # needed so get_alldata can extend with the result
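
The star-rating line works by string surgery on the inline style; assuming a hypothetical attribute value like style="width:90%", the chain reduces it step by step:

style = 'width:90%'             # hypothetical value of the cur_star span's style attribute
part = style.split(':')[1]      # '90%'
rating = part.replace('%', '')  # '90'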


"""
构建函数获取所有的页数的数据
遍历url网址获取每一页数据,将数据存放在alldata中
"""

def get_alldata(n):
    alldata = []
    for url in get_urls(n):
        alldata.extend(get_data(url))
    return alldata
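
Not in the original post: if the site rejects bare requests, a common tweak is to send a browser-like User-Agent and pause between fetches. A minimal sketch, with the header value as an assumption:

import time
import requests

headers = {'User-Agent': 'Mozilla/5.0'}  # assumed minimal browser-like header

def fetch(url):
    r = requests.get(url, headers=headers)  # identify as a browser
    time.sleep(1)                            # stay gentle between pages
    return r.text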


alldata = get_alldata(7)  # scrape the first 7 pages

df = pd.DataFrame(alldata)  # build the table from the list of dicts, not the function object
df.to_csv('C:/Users/DELL/Desktop/资料/data.csv')
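
Because the column names (景点名称, 星级) are Chinese, a further tweak worth trying is to write the CSV with the BOM-carrying utf-8-sig encoding so Excel decodes it correctly, and index=False to drop pandas' row index:

df.to_csv('C:/Users/DELL/Desktop/资料/data.csv', encoding='utf-8-sig', index=False)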


Reposted from www.cnblogs.com/xixirang/p/11840649.html