分享一下这本书的原始数据
# -*- coding: utf-8 -*-
import pandas as pd
from pandas import Series,DataFrame
import numpy as np
'''
分析美国农业部食品数据库
'''
import json
# Load the USDA food database (a JSON array with one record per food).
# The context manager guarantees the file handle is closed once parsing is
# done, and the explicit encoding keeps the read independent of the
# machine's locale default.
with open('C:\\tools\\pydata-book-master\\ch07\\foods-2011-10-03.json',
          encoding='utf-8') as db_file:
    db = json.load(db_file)
# Number of food records that were loaded.
print(len(db))
# Every entry in db is a dict holding all the data for one food; its
# 'nutrients' field is a list of dicts, one per nutrient.
# print(db[1])
# Keys available on a record:
# print(db[0].keys())
# dict_keys(['id', 'description', 'tags', 'manufacturer', 'group', 'portions', 'nutrients'])
# First nutrient entry of the first food:
# print(db[0]['nutrients'][0])
# {'value': 25.18, 'units': 'g', 'description': 'Protein', 'group': 'Composition'}
nutrients = pd.DataFrame(db[0]['nutrients'])
# print(nutrients)
# Build a per-food info table that keeps only the name, group, id and
# manufacturer of each food; these keys become the column names.
info_keys = ['description', 'group', 'id', 'manufacturer']
info = pd.DataFrame(data=db, columns=info_keys)
# print(info[:5])
# value_counts shows how the foods are distributed across groups:
# print(pd.value_counts(info.group))
# Vegetables and Vegetable Products 812
# Beef Products 618
# Baked Products 496
# Breakfast Cereals 403
# Fast Foods 365
# Legumes and Legume Products 365
# Lamb, Veal, and Game Products 345
# Sweets 341
# Pork Products 328
# Fruits and Fruit Juices 328
# Beverages 278
# Soups, Sauces, and Gravies 275
# Finfish and Shellfish Products 255
# Baby Foods 209
# Cereal Grains and Pasta 183
# Ethnic Foods 165
# Snacks 162
# Nut and Seed Products 128
# Poultry Products 116
# Sausages and Luncheon Meats 111
# Dairy and Egg Products 107
# Fats and Oils 97
# Meals, Entrees, and Sidedishes 57
# Restaurant Foods 51
# Spices and Herbs 41
# The simplest way to analyse the nutrient data is to gather every food's
# nutrients into one big table with pd.concat.
nutrients = []
for rec in db:
    # One frame per food, built from that food's list of nutrient dicts.
    fnuts = DataFrame(rec['nutrients'])
    # Tag every nutrient row with the id of the food it belongs to.
    fnuts['id'] = rec['id']
    nutrients.append(fnuts)
# Stack the per-food frames into a single table; ignore_index=True discards
# the per-frame indexes and numbers the rows afresh.
nutrients = pd.concat(nutrients, ignore_index=True)
# print(nutrients)
# To count the duplicated rows first:
# print(nutrients.duplicated().sum())
# Drop the duplicated rows.
nutrients = nutrients.drop_duplicates()
# print(nutrients)
# Both the food info table and the nutrient table carry 'description' and
# 'group' columns, so each side is renamed to something unambiguous.
info = info.rename(
    columns={'description': 'food', 'group': 'fgroup'},
    copy=False,
)
# print(info.describe())
col_mapping = dict(description='nutrient', group='nutgroup')
nutrients = nutrients.rename(columns=col_mapping, copy=False)
# With the columns renamed, join the nutrient rows to the per-food info on
# the shared 'id' key. how='outer' keeps ids that occur in only one of the
# two tables; their missing fields are filled with NaN.
ndata = pd.merge(nutrients, info, on='id', how='outer')
# Show a single row. DataFrame.ix was removed in pandas 1.0; .loc performs
# the same label-based lookup that .ix did on this integer index.
print(ndata.loc[30000])
# Output recorded by the original author (may differ across pandas versions):
# nutrient Glycine
# nutgroup Amino Acids
# units g
# value 0.04
# id 6158
# food Soup, tomato bisque, canned, condensed
# fgroup Soups, Sauces, and Gravies
# manufacturer
# Name: 30000, dtype: object
每天进步一点点