#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
项目名称:全球食品数据分析
项目参考:https://www.kaggle.com/bhouwens/d/openfoodfacts/world-food-facts/how-much-sugar-do-we-eat/discussion
"""
import zipfile
import os
import pandas as pd
import matplotlib.pyplot as plt
def get_dataset_filename(zip_filepath):
    """Return the name of the first member listed in a zip archive.

    Args:
        zip_filepath: Path of the zip archive to inspect.

    Returns:
        The first entry of the archive's name list.

    Raises:
        IndexError: If the archive contains no members.
    """
    with zipfile.ZipFile(zip_filepath) as archive:
        member_names = archive.namelist()
    # Indexing happens after the archive is closed; the name list is a plain
    # Python list and no longer needs the open file.
    return member_names[0]
def unzip(zip_filepath, dest_path):
    """Extract every member of a zip archive into a directory.

    Args:
        zip_filepath: Path of the zip archive to extract.
        dest_path: Directory that receives the extracted members.
    """
    archive = zipfile.ZipFile(zip_filepath)
    try:
        archive.extractall(dest_path)
    finally:
        # Always release the file handle, even if extraction fails.
        archive.close()
def _sum_additives_by_country(data):
    """Return the total additive count per individual country.

    Args:
        data: DataFrame with a ``countries_en`` column (one country name or a
            comma-separated list of names per row) and an ``additives_n``
            column.

    Returns:
        A DataFrame indexed by lower-cased country name with a single
        ``additives_n`` column holding the summed values, sorted in
        descending order.
    """
    # 1. Cleaning: rows with a missing value in either column are dropped.
    cleaned = data.dropna()

    # Lower-case the names so differently-cased spellings share one group,
    # then turn "a,b,c" into the list ["a", "b", "c"].
    country_lists = (
        cleaned['countries_en'].astype(str).str.lower().str.split(',')
    )

    # 2. A row tagged with several countries counts once for each of them, so
    # give every country in the list its own row before grouping.
    per_country = (
        cleaned.assign(countries_en=country_lists)
        .explode('countries_en')
        .reset_index(drop=True)
    )

    # Remove padding around names and discard empty names left behind by
    # stray commas, so they do not form groups of their own.
    per_country['countries_en'] = per_country['countries_en'].str.strip()
    per_country = per_country[per_country['countries_en'] != '']

    # 3. Group, sum and sort.
    totals = per_country.groupby('countries_en')[['additives_n']].sum()
    return totals.sort_values(by='additives_n', ascending=False)


def run_main():
    """Compute per-country additive totals and save and plot the result.

    Locates the CSV inside ``../data/open-food-facts.zip`` (extracting the
    archive when the CSV is not on disk yet), reads the ``countries_en`` and
    ``additives_n`` columns, sums ``additives_n`` per country, writes the
    sorted totals to ``./country_additives.csv`` and shows a bar chart of the
    ten largest totals.
    """
    # Directory that holds the data set.
    dataset_path = "../data"
    # Name of the zip archive.
    zip_filename = "open-food-facts.zip"
    # Full path of the zip archive.
    zip_filepath = os.path.join(dataset_path, zip_filename)

    dataset_filename = get_dataset_filename(zip_filepath)
    dataset_filepath = os.path.join(dataset_path, dataset_filename)

    # Extract only when the CSV is missing: repeated runs skip the extraction,
    # while a first run still gets the file that read_csv needs.
    if not os.path.exists(dataset_filepath):
        print("解压zip....", end="")
        unzip(zip_filepath, dataset_path)
        print("解压完成!")

    # Only these two columns are needed for the analysis.
    data = pd.read_csv(
        dataset_filepath, usecols=['countries_en', 'additives_n']
    )

    result = _sum_additives_by_country(data)

    # Save before plotting so the CSV is written even if the chart window
    # blocks or is never closed.
    result.to_csv("./country_additives.csv")

    # Bar chart of the ten countries with the largest totals.
    result.iloc[:10].plot.bar()
    plt.show()
# Run the analysis only when this file is executed as a script.
if __name__ == "__main__":
    run_main()