本文对 USDA 食品数据库进行分析,数据文件请见 GitHub。
全文主要分为四个部分:1. 导入数据并检查数据结构;2. 整理 info 表格;3. 整理 nutrients 表格;4. 合并两表并进行分析。
#导入数据
# 导入数据
import json

import pandas as pd
from pandas import DataFrame
# Load the whole USDA food database into memory; the rest of the script treats
# `db` as a list of per-food dict records.
# The context manager closes the file handle as soon as parsing finishes, and
# the explicit encoding keeps the read independent of the platform default.
# NOTE(review): the backslash-separated relative path only resolves on Windows.
with open(r'data\usda_food\database.json', encoding='utf-8') as f:
    db = json.load(f)
# Inspect the data: list the top-level keys of the first food record.
db[0].keys()
# dict_keys(['id', 'description', 'tags', 'manufacturer', 'group', 'portions', 'nutrients'])
# Inspect the data: display the first food record in full.
db[0]
{
'id': 1008,
'description': 'Cheese, caraway',
'tags': [],
'manufacturer': '',
'group': 'Dairy and Egg Products',
'portions': [{
'amount': 1, 'unit': 'oz', 'grams': 28.35}],
'nutrients': [{
'value': 25.18,
'units': 'g',
'description': 'Protein',
'group': 'Composition'}, ...]}
#整合info表
# Build the per-food `info` table from a subset of each record's fields;
# the 'tags', 'portions' and 'nutrients' entries are not carried over.
sub = ['id', 'description', 'manufacturer', 'group']
info = DataFrame(data=db, columns=sub)
# Print the column dtypes and non-null counts of the new table.
info.info()
# Column Non-Null Count Dtype
#--- ------ -------------- -----
# 0 id 6636 non-null int64
# 1 description 6636 non-null object
# 2 manufacturer 5195 non-null object
# 3 group 6636 non-null object
# Tally how many foods fall into each food group and show the five most
# frequent groups (value_counts sorts by count, largest first).
info['group'].value_counts().head(5)
#Vegetables and Vegetable Products 812
#Beef Products 618
#Baked Products 496
#Breakfast Cereals 403
#Fast Foods 365
#整合nutrients表
# Build one long `nutrients` table holding the nutrient entries of every food.
# Each record's 'nutrients' list becomes its own frame, tagged with the food's
# 'id' so the rows can later be joined back to the per-food table.
frames = []
for rec in db:
    frame = DataFrame(rec['nutrients'])
    frame['id'] = rec['id']
    frames.append(frame)
# ignore_index=True gives the combined table a fresh 0..n-1 index; without it
# every per-food frame contributes its own 0..k labels and the index repeats.
nutrients = pd.concat(frames, ignore_index=True)
nutrients
# Count the rows that are exact duplicates of an earlier row, then drop them.
nutrients.duplicated().sum()
nutrients = nutrients.drop_duplicates()
#对全部营养数据做分析
# Both tables have 'description' and 'group' columns, so give each table its
# own names for them before the two are merged.
# `copy=True` is dropped from both calls: it is already the default of
# DataFrame.rename, and the keyword is deprecated in recent pandas releases.

# Rename 1: per-food table.
col_mapping = {
    'description': 'food',
    'group': 'fgroup',
}
info = info.rename(columns=col_mapping)

# Rename 2: per-nutrient table.
col_mapping = {
    'description': 'nutrient',
    'group': 'nutgroup',
}
nutrients = nutrients.rename(columns=col_mapping)
# Join the per-food table with the per-nutrient table on the shared 'id'
# column; the outer join keeps ids that appear in only one of the two tables.
ndata = info.merge(nutrients, on='id', how='outer')
# Preview the first five rows of the merged table.
ndata.head()
# Median (0.5 quantile) of 'value' for every (nutrient, food group) pair;
# this is the series the median chart further down is drawn from.
by_nutrient_and_group = ndata.groupby(['nutrient', 'fgroup'])
result = by_nutrient_and_group['value'].quantile(0.5)
result
#nutrient fgroup
#Adjusted Protein Sweets 12.900
# Vegetables and Vegetable Products 2.180
#Alanine Baby Foods 0.085
# Baked Products 0.248
# Beef Products 1.550
# #...
#Zinc, Zn Snacks 1.470
# Soups, Sauces, and Gravies 0.200
# Spices and Herbs 2.750
# Sweets 0.360
# Vegetables and Vegetable Products 0.330
#Name: value, Length: 2246, dtype: float64
# Plot the median zinc value of each food group as a horizontal bar chart,
# sorted in ascending order of the median.
ax = result['Zinc, Zn'].sort_values().plot(kind='barh')
# Save through the figure that owns the returned Axes instead of through
# `plt`: this file never imports pyplot, and going through the Axes also
# avoids depending on pyplot's implicit "current figure".
ax.get_figure().savefig('Zinc, Zn.jpg', bbox_inches='tight')
# Find, for every nutrient, the food with the highest value.
def get_max(x):
    """Return the row of *x* whose 'value' entry is the largest."""
    return x.loc[x['value'].idxmax()]


# Apply the row picker to each (nutrient group, nutrient) group, then keep
# only the food name and the value.
per_nutrient_max = ndata.groupby(['nutgroup', 'nutrient']).apply(get_max)
max_food = per_nutrient_max[['food', 'value']]
max_food