# Print the whole table.
# NOTE(review): food_info is not assigned anywhere in this listing — presumably
# a DataFrame loaded earlier with pandas.read_csv; confirm.
print(food_info)
结果:无。pandas 的 read_csv 用于从文件中把内容读取进来。
# Keep the leading rows of the table (head() with its default row count).
first_rows = food_info.head()
# Optional inspection, left disabled:
#   print(first_rows)
#   print(food_info.head(3))
# Show the column labels of the DataFrame.
print(food_info.columns)
# Optional: print(food_info.shape)
结果:
# Select the row whose index label is 1; .loc returns it as a Series.
# (The earlier comment said "index 0", but the lookup below uses label 1.)
print(food_info.loc[1])
# Other label lookups, left disabled:
#   food_info.loc[6]     # row labelled 6 (the seventh row under a 0-based index)
#   food_info.loc[8620]  # per the original note: raises
#                        # KeyError: 'the label [8620] is not in the [index]'
# Original note: the object dtype is equivalent to a string in Python.
# Result:
NDB_No 1002
Shrt_Desc BUTTER WHIPPED WITH SALT
Water_(g) 15.87
Energ_Kcal 717
Protein_(g) 0.85
Lipid_Tot_(g) 81.11
Ash_(g) 2.11
Carbohydrt_(g) 0.06
Fiber_TD_(g) 0 列头与所取的行
# Label-based slice: a DataFrame holding the rows at indexes 3, 4, 5 and 6.
food_info.loc[3:6]
# Picking the rows at indexes 2, 5 and 10 — two equivalent spellings, left disabled:
#   Method 1: two_five_ten = [2, 5, 10]
#             food_info.loc[two_five_ten]
#   Method 2: food_info.loc[[2, 5, 10]]
与上面一样的道理
# Collect every column whose name ends with "(g)".
col_names = food_info.columns.tolist()
gram_columns = [c for c in col_names if c.endswith("(g)")]
# Indexing with a list of names keeps only those columns.
gram_df = food_info[gram_columns]
print(gram_df.head(3))
# Result:
Water_(g) Protein_(g) Lipid_Tot_(g) Ash_(g) Carbohydrt_(g) \(表示分行显示) 0 15.87 0.85 81.11 2.11 0.06 1 15.87 0.85 81.11 2.11 0.06 2 0.24 0.28 99.48 0.00 0.00 Fiber_TD_(g) Sugar_Tot_(g) FA_Sat_(g) FA_Mono_(g) FA_Poly_(g) 0 0 0.06 51.368 21.021 3.043 1 0 0.06 50.489 23.426 3.012 2 0 0.00 61.924 28.732 3.694
# Optional: print(food_info["Iron_(mg)"])
# Divide every value of the iron column by 1000.
div_1000 = food_info["Iron_(mg)"] / 1000
print(div_1000)
# Add 100 to each value in the column; the result is a Series object.
add_100 = food_info["Iron_(mg)"] + 100
# Further element-wise variants, left disabled:
#   sub_100 = food_info["Iron_(mg)"] - 100  # subtract 100 from each value
#   mult_2 = food_info["Iron_(mg)"] * 2     # multiply each value by 2
# Result:
0 0.00002 1 0.00016 2 0.00000 3 0.00031 4 0.00043 5 0.00050 6 0.00033 取出文件中的规定的部分,然后对每一项进行操作+-*/#It applies the arithmetic operator to the first value in both columns, the second value in both columns, and so on
# Multiplying two columns combines them value by value (first with first,
# second with second, and so on).
water_energy = food_info["Water_(g)"] * food_info["Energ_Kcal"]
# Scale the "Iron_(mg)" column by 1/1000 and store it as a new column.
iron_grams = food_info["Iron_(mg)"] / 1000
food_info["Iron_(g)"] = iron_grams
print(water_energy)
# Result:
0 11378.79 1 11378.79 2 210.24 3 14970.73 4 15251.81 5 16172.28 6 15540.00 7 14769.28 8 15062.60 9 14570.55 同上
# Original notes: by default pandas sorts by the given column in ascending
# order and returns a new DataFrame; inplace=True sorts the DataFrame itself.
# Optional: print(food_info["Sodium_(mg)"])
food_info.sort_values(by="Sodium_(mg)", inplace=True)
print(food_info["Sodium_(mg)"])
# ascending=False sorts in descending order instead.
food_info.sort_values(by="Sodium_(mg)", ascending=False, inplace=True)
# Optional: print(food_info["Sodium_(mg)"])
# Result:
760 0 8607 0 629 0 630 0 758 0 6470 0 654 0 8599 0 6463 0 633 0 635 0 一个是按照默认升序,一个是将 ascending 属性设置为 False,则按照降序。inplace 是指是否在原对象上直接修改。
结合:泰坦尼克号案例强化pandas
import pandas as pd
import numpy as np
# Load the Titanic training data from a CSV file.
# NOTE(review): absolute, machine-specific Windows path — adjust before running
# elsewhere. The name is spelled "survial" consistently throughout this listing.
titanic_survial = pd.read_csv("C:/Users/LENOVO/Desktop/titanic_train.csv")
# Bare expression: its value is only shown where the environment displays
# expression results (e.g. a notebook cell).
titanic_survial.head()
读取部分文件内容展示
# pandas uses NaN ("not a number") to indicate a missing value.
# pd.isnull() takes a Series and returns a Series of True/False values.
age = titanic_survial["Age"]
# Optional: print(age.loc[0:10])  # leading rows of the Age column (labels 0 through 10)
age_is_null = pd.isnull(age)
# Optional: print(age_is_null)  # True where the age is missing, False where present
# Boolean-mask selection keeps only the entries whose age is missing.
age_null_true = age[age_is_null]
print(age_null_true)
# Number of missing ages.
age_null_count = len(age_null_true)
print(age_null_count)
# Naive mean over the whole Age column. Per the original note the result is
# nan, because any calculation involving a null value also yields a null value.
mean_age = sum(titanic_survial["Age"]) / len(titanic_survial["Age"])
print(mean_age)
结果:
# NOTE(review): the listing prints good_ages without ever assigning it. It is
# reconstructed here as the Age values that are not missing — confirm against
# the original notebook.
good_ages = titanic_survial["Age"].dropna()
print(good_ages)
# Mean over the valid (non-missing) ages only.
correct_mean_age = sum(good_ages) / len(good_ages)
print(correct_mean_age)
# Result:
29.6991176471
{1: 84.15468749999992, 2: 20.66218315217391, 3: 13.675550101832997}
# index   – the column to group by
# values  – the column the calculation is applied to
# aggfunc – the calculation to perform; the string "mean" is used instead of
#           np.mean because recent pandas versions warn when a NumPy
#           reduction callable is passed here.
passenger_survival = titanic_survial.pivot_table(
    index="Pclass", values="Survived", aggfunc="mean"
)
print(passenger_survival)
# Result:
Pclass 1 0.629630 2 0.472826 3 0.242363 pandas自己将上面的方法封装了。index和values是相互对应的,和K-V一样。aggfunc是对应之间呈现什么样的关系,这里是求均值
# Group by port of embarkation and total the Fare and Survived columns.
# The string "sum" replaces np.sum, which recent pandas versions warn about
# when passed as aggfunc.
port_stats = titanic_survial.pivot_table(
    index="Embarked", values=["Fare", "Survived"], aggfunc="sum"
)
print(port_stats)
# Result:
# Read single cells with .loc[row_label, column_name].
# NOTE(review): the listing printed row_index_83_age and row_index_1000_pclass
# without assigning either. The Age lookup is reconstructed from its name, and
# the second print now uses the Pclass value that is actually assigned —
# confirm which row the second value was meant to come from.
row_index_83_age = titanic_survial.loc[83, "Age"]
row_index_83_pclass = titanic_survial.loc[83, "Pclass"]
print(row_index_83_age)
print(row_index_83_pclass)
# Result:
28.0 1 精确地定位到确定的某一行的某个属性
new_titanic_survival = titanic_survial.sort_values("Age",ascending=False)
# Optional: print(new_titanic_survival[0:10])
# Reset the index of the sorted table; drop=True discards the old labels
# rather than keeping them as a column.
titanic_reindexed = new_titanic_survival.reset_index(drop=True)
# iloc selects rows by integer position, loc by index label.
print(titanic_reindexed.iloc[0:10])
# Result:
PassengerId Survived Pclass Name Sex \ 0 631 1 1 Barkworth, Mr. Algernon Henry Wilson male 1 852 0 3 Svensson, Mr. Johan male 2 494 0 1 Artagaveytia, Mr. Ramon male 3 97 0 1 Goldschmidt, Mr. George B male 4 117 0 3 Connors, Mr. Patrick male 5 673 0 2 Mitchell, Mr. Henry Michael male 6 746 0 1 Crosby, Capt. Edward Gifford male 7 34 0 2 Wheadon, Mr. Edward H male 8 55 0 1 Ostby, Mr. Engelhart Cornelius male 9 281 0 3 Duane, Mr. Frank male Age SibSp Parch Ticket Fare Cabin Embarked 0 80.0 0 0 27042 30.0000 A23 S 1 74.0 0 0 347060 7.7750 NaN S 2 71.0 0 0 PC 17609 49.5042 NaN C 3 71.0 0 0 PC 17754 34.6542 A5 C 4 70.5 0 0 370369 7.7500 NaN Q 5 70.0 0 0 C.A. 24580 10.5000 NaN S 6 70.0 1 1 WE/P 5735 71.0000 B22 S 7 66.0 0 0 C.A. 24579 10.5000 NaN S 8 65.0 0 1 113509 61.9792 B30 C 9 65.0 0 0 336439 7.7500 NaN Qdef hundredth_row(column):
# Extract the hundredth item
hundredth_item = column.iloc[99]
return hundredth_item
# Return the hundredth item from each column
hundredth_row = titanic_survial. apply(hundredth_row)
print (hundredth_row)结果:
PassengerId 100
Survived 0
Pclass 2
Name Kantor, Mr. Sinai
Sex male
Age 34
SibSp 1
Parch 0
Ticket 244367
Fare 26
Cabin NaN
Embarked S 自定义第100行:但是需要用apply