关注微xin号:小程在线
关注CSDN博客:程志伟的博客
import numpy as np
import pandas as pd
#### 1.1简单数据操作 ####
#生成数据
df1=pd.DataFrame(np.array([[10,20],[30,40]]))
df1
Out[2]:
0 1
0 10 20
1 30 40
df2 = pd.DataFrame([pd.Series(np.arange(1, 8)),pd.Series(np.arange(11, 18))])
df2
Out[3]:
0 1 2 3 4 5 6
0 1 2 3 4 5 6 7
1 11 12 13 14 15 16 17
#查看数据的维度
df1.shape
Out[4]: (2, 2)
df2.shape
Out[5]: (2, 7)
#增加索引名称
df3=pd.DataFrame(np.array([[10,20],[30,40]]),index=['a','b'],columns=['c1','c2'])
df3
Out[6]:
c1 c2
a 10 20
b 30 40
#查看列的名字
df3.columns
Out[7]: Index(['c1', 'c2'], dtype='object')
df3.columns[1]
Out[8]: 'c2'
#对列名重新定义
df3.columns=['column1','column2']
df3
Out[9]:
column1 column2
a 10 20
b 30 40
#查看索引的名字
df3.index
Out[10]: Index(['a', 'b'], dtype='object')
#重新创建数据
s1 = pd.Series(np.arange(1, 9, 2))
s2 = pd.Series(np.arange(2, 10, 2))
df4=pd.DataFrame({'c1': s1, 'c2': s2})
df4
Out[11]:
c1 c2
0 1 2
1 3 4
2 5 6
3 7 8
#按照相同索引进行数据合并
s3 = pd.Series(np.arange(5, 7), index=[1, 2])
df5 = pd.DataFrame({'c1': s1, 'c2': s2, 'c3': s3})
df5
Out[12]:
c1 c2 c3
0 1 2 NaN
1 3 4 5.0
2 5 6 6.0
3 7 8 NaN
#查看df5的信息
df5.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4 entries, 0 to 3
Data columns (total 3 columns):
c1 4 non-null int32
c2 4 non-null int32
c3 2 non-null float64
dtypes: float64(1), int32(2)
memory usage: 96.0 bytes
##### 1.2 read_csv读取数据 ####
eu12=pd.read_csv('H:/0date/Eueo2012.csv',index_col='Team')
eu12
Out[14]:
Goals Shots on target ... Subs off Players Used
Team ...
Croatia 4 13 ... 9 16
Czech Republic 4 13 ... 11 19
Denmark 4 10 ... 7 15
England 5 11 ... 11 16
France 3 22 ... 11 19
Germany 10 32 ... 15 17
Greece 5 8 ... 12 20
Italy 6 34 ... 18 19
Netherlands 2 12 ... 7 15
Poland 2 15 ... 7 17
Portugal 6 22 ... 14 16
Republic of Ireland 1 7 ... 10 17
Russia 5 9 ... 7 16
Spain 12 42 ... 17 18
Sweden 5 17 ... 9 18
Ukraine 2 7 ... 9 18
[16 rows x 34 columns]
#查看数据的前五行
eu12.head(5)
Out[15]:
Goals Shots on target ... Subs off Players Used
Team ...
Croatia 4 13 ... 9 16
Czech Republic 4 13 ... 11 19
Denmark 4 10 ... 7 15
England 5 11 ... 11 16
France 3 22 ... 11 19
[5 rows x 34 columns]
#查看数据的后五行
eu12.tail(5)
Out[16]:
Goals Shots on target ... Subs off Players Used
Team ...
Republic of Ireland 1 7 ... 10 17
Russia 5 9 ... 7 16
Spain 12 42 ... 17 18
Sweden 5 17 ... 9 18
Ukraine 2 7 ... 9 18
[5 rows x 34 columns]
#查看数据的行数
len(eu12)
Out[17]: 16
#查看数据的维度
eu12.shape
Out[18]: (16, 34)
#查看数据的行数
eu12.shape[0]
Out[19]: 16
#查看数据的索引
eu12.index
Out[20]:
Index(['Croatia', 'Czech Republic', 'Denmark', 'England', 'France', 'Germany',
'Greece', 'Italy', 'Netherlands', 'Poland', 'Portugal',
'Republic of Ireland', 'Russia', 'Spain', 'Sweden', 'Ukraine'],
dtype='object', name='Team')
#查看数据的列名
eu12.columns
Out[21]:
Index(['Goals', 'Shots on target', 'Shots off target', 'Shooting Accuracy',
'% Goals-to-shots', 'Total shots (inc. Blocked)', 'Hit Woodwork',
'Penalty goals', 'Penalties not scored', 'Headed goals', 'Passes',
'Passes completed', 'Passing Accuracy', 'Touches', 'Crosses',
'Dribbles', 'Corners Taken', 'Tackles', 'Clearances', 'Interceptions',
'Clearances off line', 'Clean Sheets', 'Blocks', 'Goals conceded',
'Saves made', 'Saves-to-shots ratio', 'Fouls Won', 'Fouls Conceded',
'Offsides', 'Yellow Cards', 'Red Cards', 'Subs on', 'Subs off',
'Players Used'],
dtype='object')
#查看指定列的前五行
eu12[['Shots on target','Shots off target']].head()
Out[22]:
Shots on target Shots off target
Team
Croatia 13 12
Czech Republic 13 18
Denmark 10 10
England 11 18
France 22 24
#查看列的位置
eu12.columns.get_loc('Red Cards')
Out[23]: 30
#数据的复制
eu12_goals=eu12.copy()
eu12_goals
Out[24]:
Goals Shots on target ... Subs off Players Used
Team ...
Croatia 4 13 ... 9 16
Czech Republic 4 13 ... 11 19
Denmark 4 10 ... 7 15
England 5 11 ... 11 16
France 3 22 ... 11 19
Germany 10 32 ... 15 17
Greece 5 8 ... 12 20
Italy 6 34 ... 18 19
Netherlands 2 12 ... 7 15
Poland 2 15 ... 7 17
Portugal 6 22 ... 14 16
Republic of Ireland 1 7 ... 10 17
Russia 5 9 ... 7 16
Spain 12 42 ... 17 18
Sweden 5 17 ... 9 18
Ukraine 2 7 ... 9 18
[16 rows x 34 columns]
#数据的前五行
eu12[:5]
Out[25]:
Goals Shots on target ... Subs off Players Used
Team ...
Croatia 4 13 ... 9 16
Czech Republic 4 13 ... 11 19
Denmark 4 10 ... 7 15
England 5 11 ... 11 16
France 3 22 ... 11 19
[5 rows x 34 columns]
#数据的特定行
eu12['Croatia':'Denmark']
Out[26]:
Goals Shots on target ... Subs off Players Used
Team ...
Croatia 4 13 ... 9 16
Czech Republic 4 13 ... 11 19
Denmark 4 10 ... 7 15
[3 rows x 34 columns]
#使用loc查看数据的特定行
eu12.loc['Denmark']
Out[27]:
Goals 4
Shots on target 10
Shots off target 10
Shooting Accuracy 50.0%
% Goals-to-shots 20.0%
Total shots (inc. Blocked) 27
Hit Woodwork 1
Penalty goals 0
Penalties not scored 0
Headed goals 3
Passes 1298
Passes completed 1082
Passing Accuracy 83.3%
Touches 1873
Crosses 43
Dribbles 32
Corners Taken 16
Tackles 40
Clearances 61
Interceptions 59
Clearances off line 0
Clean Sheets 1
Blocks 10
Goals conceded 5
Saves made 10
Saves-to-shots ratio 66.7%
Fouls Won 25
Fouls Conceded 38
Offsides 8
Yellow Cards 4
Red Cards 0
Subs on 7
Subs off 7
Players Used 15
Name: Denmark, dtype: object
eu12.loc[['Denmark','France']]
Out[28]:
Goals Shots on target ... Subs off Players Used
Team ...
Denmark 4 10 ... 7 15
France 3 22 ... 11 19
[2 rows x 34 columns]
#使用iloc查看数据的特定行
eu12.iloc[[2,4]]
Out[29]:
Goals Shots on target ... Subs off Players Used
Team ...
Denmark 4 10 ... 7 15
France 3 22 ... 11 19
[2 rows x 34 columns]
#查看特定索引标签所在的位置
eu12.index.get_loc('France')
Out[30]: 4
# Select rows by integer position. The original used .ix, which accepted both
# positions and labels but was deprecated in pandas 0.20 and removed in 1.0;
# .iloc is the positional replacement.
eu12.iloc[[2,4]]
Out[31]:
Goals Shots on target ... Subs off Players Used
Team ...
Denmark 4 10 ... 7 15
France 3 22 ... 11 19
[2 rows x 34 columns]
# Select rows by index label; .loc replaces the removed .ix for label lookups.
eu12.loc[['Denmark','France']]
Out[32]:
Goals Shots on target ... Subs off Players Used
Team ...
Denmark 4 10 ... 7 15
France 3 22 ... 11 19
[2 rows x 34 columns]
#使用at查看数据数值
eu12.at['France','Goals']
Out[33]: 3
eu12[2:4][['Goals','Shots on target']]
Out[34]:
Goals Shots on target
Team
Denmark 4 10
England 5 11
#布尔值判断
eu12.Goals>4
Out[35]:
Team
Croatia False
Czech Republic False
Denmark False
England True
France False
Germany True
Greece True
Italy True
Netherlands False
Poland False
Portugal True
Republic of Ireland False
Russia True
Spain True
Sweden True
Ukraine False
Name: Goals, dtype: bool
#布尔值选择
eu12[eu12.Goals>4]
Out[36]:
Goals Shots on target ... Subs off Players Used
Team ...
England 5 11 ... 11 16
Germany 10 32 ... 15 17
Greece 5 8 ... 12 20
Italy 6 34 ... 18 19
Portugal 6 22 ... 14 16
Russia 5 9 ... 7 16
Spain 12 42 ... 17 18
Sweden 5 17 ... 9 18
[8 rows x 34 columns]
#布尔值选择(与)
eu12_goals[(eu12_goals.Goals>4)&(eu12_goals.Touches>2000 )]
Out[37]:
Goals Shots on target ... Subs off Players Used
Team ...
England 5 11 ... 11 16
Germany 10 32 ... 15 17
Greece 5 8 ... 12 20
Italy 6 34 ... 18 19
Portugal 6 22 ... 14 16
Russia 5 9 ... 7 16
Spain 12 42 ... 17 18
[7 rows x 34 columns]
#### 1.3 修改dataframe ###
#使用rename对列名重命名,返回新的数据
eu12_goals.rename(columns={'Goals':'goals'})
Out[38]:
goals Shots on target ... Subs off Players Used
Team ...
Croatia 4 13 ... 9 16
Czech Republic 4 13 ... 11 19
Denmark 4 10 ... 7 15
England 5 11 ... 11 16
France 3 22 ... 11 19
Germany 10 32 ... 15 17
Greece 5 8 ... 12 20
Italy 6 34 ... 18 19
Netherlands 2 12 ... 7 15
Poland 2 15 ... 7 17
Portugal 6 22 ... 14 16
Republic of Ireland 1 7 ... 10 17
Russia 5 9 ... 7 16
Spain 12 42 ... 17 18
Sweden 5 17 ... 9 18
Ukraine 2 7 ... 9 18
[16 rows x 34 columns]
#使用inplace改变原来的数据集
eu12_goals.rename(columns={'Goals':'goals'},inplace=True)
eu12_goals
Out[39]:
goals Shots on target ... Subs off Players Used
Team ...
Croatia 4 13 ... 9 16
Czech Republic 4 13 ... 11 19
Denmark 4 10 ... 7 15
England 5 11 ... 11 16
France 3 22 ... 11 19
Germany 10 32 ... 15 17
Greece 5 8 ... 12 20
Italy 6 34 ... 18 19
Netherlands 2 12 ... 7 15
Poland 2 15 ... 7 17
Portugal 6 22 ... 14 16
Republic of Ireland 1 7 ... 10 17
Russia 5 9 ... 7 16
Spain 12 42 ... 17 18
Sweden 5 17 ... 9 18
Ukraine 2 7 ... 9 18
[16 rows x 34 columns]
#使用insert插入新的列
eu12_goals.insert(1,'on_target_percent',eu12_goals.Touches/100)
eu12_goals
Out[40]:
goals on_target_percent ... Subs off Players Used
Team ...
Croatia 4 17.06 ... 9 16
Czech Republic 4 23.58 ... 11 19
Denmark 4 18.73 ... 7 15
England 5 24.40 ... 11 16
France 3 29.09 ... 11 19
Germany 10 37.61 ... 15 17
Greece 5 20.16 ... 12 20
Italy 6 43.63 ... 18 19
Netherlands 2 21.63 ... 7 15
Poland 2 17.24 ... 7 17
Portugal 6 29.58 ... 14 16
Republic of Ireland 1 14.33 ... 10 17
Russia 5 22.78 ... 7 16
Spain 12 55.85 ... 17 18
Sweden 5 18.06 ... 9 18
Ukraine 2 18.94 ... 9 18
[16 rows x 35 columns]
#在最后一列生成新的数据(注意:此处 Touches/(Touches+Touches) 恒等于0.5,仅用于演示如何新增列)
eu12_goals['on_target_ratio']=eu12_goals.Touches/(eu12_goals.Touches+eu12_goals.Touches)
eu12_goals
Out[41]:
goals on_target_percent ... Players Used on_target_ratio
Team ...
Croatia 4 17.06 ... 16 0.5
Czech Republic 4 23.58 ... 19 0.5
Denmark 4 18.73 ... 15 0.5
England 5 24.40 ... 16 0.5
France 3 29.09 ... 19 0.5
Germany 10 37.61 ... 17 0.5
Greece 5 20.16 ... 20 0.5
Italy 6 43.63 ... 19 0.5
Netherlands 2 21.63 ... 15 0.5
Poland 2 17.24 ... 17 0.5
Portugal 6 29.58 ... 16 0.5
Republic of Ireland 1 14.33 ... 17 0.5
Russia 5 22.78 ... 16 0.5
Spain 12 55.85 ... 18 0.5
Sweden 5 18.06 ... 18 0.5
Ukraine 2 18.94 ... 18 0.5
[16 rows x 36 columns]
#使用del删除数据
del eu12_goals['on_target_ratio']
eu12_goals
Out[42]:
goals on_target_percent ... Subs off Players Used
Team ...
Croatia 4 17.06 ... 9 16
Czech Republic 4 23.58 ... 11 19
Denmark 4 18.73 ... 7 15
England 5 24.40 ... 11 16
France 3 29.09 ... 11 19
Germany 10 37.61 ... 15 17
Greece 5 20.16 ... 12 20
Italy 6 43.63 ... 18 19
Netherlands 2 21.63 ... 7 15
Poland 2 17.24 ... 7 17
Portugal 6 29.58 ... 14 16
Republic of Ireland 1 14.33 ... 10 17
Russia 5 22.78 ... 7 16
Spain 12 55.85 ... 17 18
Sweden 5 18.06 ... 9 18
Ukraine 2 18.94 ... 9 18
[16 rows x 35 columns]
#pop函数,显示删掉的数据
eu12_goals.pop('Players Used')
Out[43]:
Team
Croatia 16
Czech Republic 19
Denmark 15
England 16
France 19
Germany 17
Greece 20
Italy 19
Netherlands 15
Poland 17
Portugal 16
Republic of Ireland 17
Russia 16
Spain 18
Sweden 18
Ukraine 18
Name: Players Used, dtype: int64
#drop删除数据,不改变原来的数据
eu12_goals.drop(['on_target_percent'],axis=1)
Out[44]:
goals Shots on target ... Subs on Subs off
Team ...
Croatia 4 13 ... 9 9
Czech Republic 4 13 ... 11 11
Denmark 4 10 ... 7 7
England 5 11 ... 11 11
France 3 22 ... 11 11
Germany 10 32 ... 15 15
Greece 5 8 ... 12 12
Italy 6 34 ... 18 18
Netherlands 2 12 ... 7 7
Poland 2 15 ... 7 7
Portugal 6 22 ... 14 14
Republic of Ireland 1 7 ... 10 10
Russia 5 9 ... 7 7
Spain 12 42 ... 17 17
Sweden 5 17 ... 9 9
Ukraine 2 7 ... 9 9
[16 rows x 33 columns]
#### 1.4 修改dataframe的行 ####
df1=eu12_goals.iloc[:5]
print(df1)
goals on_target_percent ... Subs on Subs off
Team ...
Croatia 4 17.06 ... 9 9
Czech Republic 4 23.58 ... 11 11
Denmark 4 18.73 ... 7 7
England 5 24.40 ... 11 11
France 3 29.09 ... 11 11
[5 rows x 34 columns]
df2=eu12_goals.iloc[[10,11,12]]
print(df2)
goals on_target_percent ... Subs on Subs off
Team ...
Portugal 6 29.58 ... 14 14
Republic of Ireland 1 14.33 ... 10 10
Russia 5 22.78 ... 7 7
[3 rows x 34 columns]
# Stack the rows of df2 below df1. DataFrame.append was removed in pandas 2.0;
# pd.concat gives the same result when both frames share the same columns.
df3=pd.concat([df1,df2])
df3
Out[47]:
goals on_target_percent ... Subs on Subs off
Team ...
Croatia 4 17.06 ... 9 9
Czech Republic 4 23.58 ... 11 11
Denmark 4 18.73 ... 7 7
England 5 24.40 ... 11 11
France 3 29.09 ... 11 11
Portugal 6 29.58 ... 14 14
Republic of Ireland 1 14.33 ... 10 10
Russia 5 22.78 ... 7 7
[8 rows x 34 columns]
# A frame with a single constant column 'changes' that shares df1's row index.
df4=pd.DataFrame(2,index=df1.index,columns=['changes'])
# Stack df4 below df1 while keeping the original index labels; columns missing
# from one frame are filled with NaN. sort=True orders the combined columns
# alphabetically, as in the recorded output. (The original displayed bare
# `df4` here, which does not match the 10-row output that follows.)
pd.concat([df1,df4],sort=True)
Out[48]:
% Goals-to-shots Blocks ... goals on_target_percent
Team ...
Croatia 16.0% 10.0 ... 4.0 17.06
Czech Republic 12.9% 10.0 ... 4.0 23.58
Denmark 20.0% 10.0 ... 4.0 18.73
England 17.2% 29.0 ... 5.0 24.40
France 6.5% 7.0 ... 3.0 29.09
Croatia NaN NaN ... NaN NaN
Czech Republic NaN NaN ... NaN NaN
Denmark NaN NaN ... NaN NaN
England NaN NaN ... NaN NaN
France NaN NaN ... NaN NaN
[10 rows x 35 columns]
# Stack df4 below df1 and discard the original index labels in favour of a
# fresh 0..n-1 index. pd.concat replaces DataFrame.append (removed in pandas
# 2.0); sort=True keeps the alphabetically sorted column order.
pd.concat([df1,df4],ignore_index=True,sort=True)
Out[49]:
% Goals-to-shots Blocks Clean Sheets ... changes goals on_target_percent
0 16.0% 10.0 0.0 ... NaN 4.0 17.06
1 12.9% 10.0 1.0 ... NaN 4.0 23.58
2 20.0% 10.0 1.0 ... NaN 4.0 18.73
3 17.2% 29.0 2.0 ... NaN 5.0 24.40
4 6.5% 7.0 1.0 ... NaN 3.0 29.09
5 NaN NaN NaN ... 2.0 NaN NaN
6 NaN NaN NaN ... 2.0 NaN NaN
7 NaN NaN NaN ... 2.0 NaN NaN
8 NaN NaN NaN ... 2.0 NaN NaN
9 NaN NaN NaN ... 2.0 NaN NaN
[10 rows x 35 columns]
#concat连接
pd.concat([df1,df2])
Out[50]:
goals on_target_percent ... Subs on Subs off
Team ...
Croatia 4 17.06 ... 9 9
Czech Republic 4 23.58 ... 11 11
Denmark 4 18.73 ... 7 7
England 5 24.40 ... 11 11
France 3 29.09 ... 11 11
Portugal 6 29.58 ... 14 14
Republic of Ireland 1 14.33 ... 10 10
Russia 5 22.78 ... 7 7
[8 rows x 34 columns]
#删除行
df1.drop(['France'])
Out[51]:
goals on_target_percent ... Subs on Subs off
Team ...
Croatia 4 17.06 ... 9 9
Czech Republic 4 23.58 ... 11 11
Denmark 4 18.73 ... 7 7
England 5 24.40 ... 11 11
[4 rows x 34 columns]
#### 1.5 修改dataframe的index ####
sp500=pd.read_csv('H:/0date/sp500.csv')
#修改索引
sp500=sp500.set_index('Symbol')
sp500.head()
Out[53]:
Name ... SEC Filings
Symbol ...
MMM 3M Co. ... http://www.sec.gov/cgi-bin/browse-edgar?action...
ABT Abbott Laboratories ... http://www.sec.gov/cgi-bin/browse-edgar?action...
ABBV AbbVie Inc. ... http://www.sec.gov/cgi-bin/browse-edgar?action...
ACN Accenture ... http://www.sec.gov/cgi-bin/browse-edgar?action...
ACE ACE Limited ... http://www.sec.gov/cgi-bin/browse-edgar?action...
[5 rows x 14 columns]
#查看ACE行数据
sp500.loc['ACE']
Out[54]:
Name ACE Limited
Sector Financials
Price 102.91
Dividend Yield 2.21
Price/Earnings 10
Earnings/Share 10.293
Book Value 86.897
52 week low 84.73
52 week high 104.07
Market Cap 34.753
EBITDA 4.275
Price/Sales 1.79
Price/Book 1.18
SEC Filings http://www.sec.gov/cgi-bin/browse-edgar?action...
Name: ACE, dtype: object
#还原index
sp500=sp500.reset_index()
sp500.head()
Out[55]:
Symbol ... SEC Filings
0 MMM ... http://www.sec.gov/cgi-bin/browse-edgar?action...
1 ABT ... http://www.sec.gov/cgi-bin/browse-edgar?action...
2 ABBV ... http://www.sec.gov/cgi-bin/browse-edgar?action...
3 ACN ... http://www.sec.gov/cgi-bin/browse-edgar?action...
4 ACE ... http://www.sec.gov/cgi-bin/browse-edgar?action...
[5 rows x 15 columns]
#查看ACE的行数据
sp500.iloc[4]
Out[56]:
Symbol ACE
Name ACE Limited
Sector Financials
Price 102.91
Dividend Yield 2.21
Price/Earnings 10
Earnings/Share 10.293
Book Value 86.897
52 week low 84.73
52 week high 104.07
Market Cap 34.753
EBITDA 4.275
Price/Sales 1.79
Price/Book 1.18
SEC Filings http://www.sec.gov/cgi-bin/browse-edgar?action...
Name: 4, dtype: object
#### 1.6 修改dataframe的多维index ####
# Build a two-level row index from two parallel label lists. The labels are kept
# in a local variable: the original assigned them to `np.arrays`, which sets an
# attribute on the numpy module itself.
index_arrays = [['one','one','one','two','two','two'],[1,2,3,1,2,3]]
# from_arrays pairs the lists element-wise, giving the same index as
# from_tuples(list(zip(*index_arrays))).
df = pd.DataFrame(np.random.randn(6,2),index=pd.MultiIndex.from_arrays(index_arrays),columns=['A','B'])
df
Out[57]:
A B
one 1 0.500047 -2.090758
2 0.087545 -0.609452
3 0.213795 -0.680843
two 1 -1.436421 -0.957349
2 -0.819979 1.782266
3 -0.526769 -1.879333
# Select every row under the outer index label "one". .ix was removed in
# pandas 1.0; .loc is the label-based replacement.
df.loc["one"]
Out[58]:
A B
1 0.500047 -2.090758
2 0.087545 -0.609452
3 0.213795 -0.680843
# Chain .loc to pick inner label 1 within outer label "one" (replaces the
# removed .ix and keeps the resulting Series named 1).
df.loc["one"].loc[1]
Out[59]:
A 0.500047
B -2.090758
Name: 1, dtype: float64
#使用xs查看数据的特定行
df.xs("one")
Out[60]:
A B
1 0.500047 -2.090758
2 0.087545 -0.609452
3 0.213795 -0.680843
#取单列
df.xs("B",axis=1)
Out[61]:
one 1 -2.090758
2 -0.609452
3 -0.680843
two 1 -0.957349
2 1.782266
3 -1.879333
Name: B, dtype: float64
#### 1.7 多维数据多维索引 ####
raw_data = {'school': ['Nighthawks', 'Nighthawks', 'Nighthawks', 'Nighthawks', 'Dragoons', 'Dragoons', 'Dragoons', 'Dragoons', 'Scouts', 'Scouts', 'Scouts', 'Scouts'],
'class': ['1st', '1st', '2nd', '2nd', '1st', '1st', '2nd', '2nd','1st', '1st', '2nd', '2nd'],
'name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze', 'Jacon', 'Ryaner', 'Sone', 'Sloan', 'Piger', 'Riani', 'Ali'],
'preTestScore': [4, 24, 31, 2, 3, 4, 24, 31, 2, 3, 2, 3],
'postTestScore': [25, 94, 57, 62, 70, 25, 94, 57, 62, 70, 62, 70]}
df = pd.DataFrame(raw_data, columns = ['school', 'class', 'name', 'preTestScore', 'postTestScore'])
df
Out[62]:
school class name preTestScore postTestScore
0 Nighthawks 1st Miller 4 25
1 Nighthawks 1st Jacobson 24 94
2 Nighthawks 2nd Ali 31 57
3 Nighthawks 2nd Milner 2 62
4 Dragoons 1st Cooze 3 70
5 Dragoons 1st Jacon 4 25
6 Dragoons 2nd Ryaner 24 94
7 Dragoons 2nd Sone 31 57
8 Scouts 1st Sloan 2 62
9 Scouts 1st Piger 3 70
10 Scouts 2nd Riani 2 62
11 Scouts 2nd Ali 3 70
#设置多索引
df=df.set_index(['school','class'])
df.index
Out[63]:
MultiIndex(levels=[['Dragoons', 'Nighthawks', 'Scouts'], ['1st', '2nd']],
codes=[[1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 2, 2], [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]],
names=['school', 'class'])
# Sum the two score columns per school. DataFrame.sum(level=...) was removed in
# pandas 2.0; groupby(level=...) is the replacement. sort=False keeps schools
# in first-appearance order, and selecting the score columns explicitly keeps
# the string 'name' column out of the result.
df.groupby(level='school',sort=False)[['preTestScore','postTestScore']].sum()
Out[64]:
preTestScore postTestScore
school
Nighthawks 61 238
Dragoons 62 246
Scouts 10 264
# Select all rows for school 'Dragoons'; .loc replaces the removed .ix.
df.loc['Dragoons']
Out[65]:
name preTestScore postTestScore
class
1st Cooze 3 70
1st Jacon 4 25
2nd Ryaner 24 94
2nd Sone 31 57
#交换两级索引的层级顺序
df.swaplevel('class','school')
Out[66]:
name preTestScore postTestScore
class school
1st Nighthawks Miller 4 25
Nighthawks Jacobson 24 94
2nd Nighthawks Ali 31 57
Nighthawks Milner 2 62
1st Dragoons Cooze 3 70
Dragoons Jacon 4 25
2nd Dragoons Ryaner 24 94
Dragoons Sone 31 57
1st Scouts Sloan 2 62
Scouts Piger 3 70
2nd Scouts Riani 2 62
Scouts Ali 3 70
df = pd.DataFrame(np.random.random((4,4)))
df.columns = pd.MultiIndex.from_product([[1,2],['A','B']])
df
Out[67]:
1 2
A B A B
0 0.008531 0.159603 0.865589 0.526770
1 0.433940 0.792874 0.036432 0.509819
2 0.211002 0.400251 0.261920 0.491058
3 0.416693 0.371853 0.287564 0.225101
#选择特定的列
df.iloc[:, df.columns.get_level_values(1)=='A']
Out[68]:
1 2
A A
0 0.008531 0.865589
1 0.433940 0.036432
2 0.211002 0.261920
3 0.416693 0.287564
#对数据进行排序
eu12.sort_values(['Red Cards', 'Yellow Cards'], ascending = False)[['Red Cards', 'Yellow Cards']]
Out[69]:
Red Cards Yellow Cards
Team
Greece 1 9
Poland 1 7
Republic of Ireland 1 6
Italy 0 16
Portugal 0 12
Spain 0 11
Croatia 0 9
Czech Republic 0 7
Sweden 0 7
France 0 6
Russia 0 6
England 0 5
Netherlands 0 5
Ukraine 0 5
Denmark 0 4
Germany 0 4
eu12.sort_values(['Red Cards', 'Yellow Cards'], ascending = True)[['Red Cards', 'Yellow Cards']]
Out[70]:
Red Cards Yellow Cards
Team
Denmark 0 4
Germany 0 4
England 0 5
Netherlands 0 5
Ukraine 0 5
France 0 6
Russia 0 6
Czech Republic 0 7
Sweden 0 7
Croatia 0 9
Spain 0 11
Portugal 0 12
Italy 0 16
Republic of Ireland 1 6
Poland 1 7
Greece 1 9
#按照索引进行降序
sp500.sort_index(ascending=False)
Out[71]:
Symbol ... SEC Filings
499 ZTS ... http://www.sec.gov/cgi-bin/browse-edgar?action...
498 ZION ... http://www.sec.gov/cgi-bin/browse-edgar?action...
497 ZMH ... http://www.sec.gov/cgi-bin/browse-edgar?action...
496 YUM ... http://www.sec.gov/cgi-bin/browse-edgar?action...
495 YHOO ... http://www.sec.gov/cgi-bin/browse-edgar?action...
...
4 ACE ... http://www.sec.gov/cgi-bin/browse-edgar?action...
3 ACN ... http://www.sec.gov/cgi-bin/browse-edgar?action...
2 ABBV ... http://www.sec.gov/cgi-bin/browse-edgar?action...
1 ABT ... http://www.sec.gov/cgi-bin/browse-edgar?action...
0 MMM ... http://www.sec.gov/cgi-bin/browse-edgar?action...
[500 rows x 15 columns]
#### 1.8 数据的描述 ####
one_mon_hist = pd.read_csv('H:/0date/omh.csv')
one_mon_hist
Out[73]:
Date MSFT AAPL
0 2014-12-01 48.62 115.07
1 2014-12-02 48.46 114.63
2 2014-12-03 48.08 115.93
3 2014-12-04 48.84 115.49
4 2014-12-05 48.42 115.00
5 2014-12-08 47.70 112.40
6 2014-12-09 47.59 114.12
7 2014-12-10 46.90 111.95
8 2014-12-11 47.17 111.62
9 2014-12-12 46.95 109.73
10 2014-12-15 46.67 108.23
11 2014-12-16 45.16 106.75
12 2014-12-17 45.74 109.41
13 2014-12-18 47.52 112.65
14 2014-12-19 47.66 111.78
15 2014-12-22 47.98 112.94
16 2014-12-23 48.45 112.54
17 2014-12-24 48.14 112.01
18 2014-12-26 47.88 113.99
19 2014-12-29 47.45 113.91
20 2014-12-30 47.02 112.52
21 2014-12-31 46.45 110.38
# Mean of each numeric column. numeric_only=True skips the non-numeric Date
# column, which pandas 2.0+ no longer drops silently.
one_mon_hist.mean(numeric_only=True)
Out[74]:
MSFT 47.493182
AAPL 112.411364
dtype: float64
# Mean of each row across the numeric columns (axis=1); numeric_only=True
# leaves the non-numeric Date column out of the calculation.
one_mon_hist.mean(axis=1,numeric_only=True)
Out[75]:
0 81.845
1 81.545
2 82.005
3 82.165
4 81.710
5 80.050
6 80.855
7 79.425
8 79.395
9 78.340
10 77.450
11 75.955
12 77.575
13 80.085
14 79.720
15 80.460
16 80.495
17 80.075
18 80.935
19 80.680
20 79.770
21 78.415
dtype: float64
# Variance of each numeric column; numeric_only=True skips the Date column.
one_mon_hist.var(numeric_only=True)
Out[76]:
MSFT 0.870632
AAPL 5.706231
dtype: float64
# Median of each numeric column; numeric_only=True skips the Date column.
one_mon_hist.median(numeric_only=True)
Out[77]:
MSFT 47.625
AAPL 112.530
dtype: float64
#每列的最小值、最大值
one_mon_hist[['MSFT', 'AAPL']].min()
Out[78]:
MSFT 45.16
AAPL 106.75
dtype: float64
one_mon_hist[['MSFT', 'AAPL']].max()
Out[79]:
MSFT 48.84
AAPL 115.93
dtype: float64
#每列的最小值、最大值所在的行索引
one_mon_hist[['MSFT', 'AAPL']].idxmin()
Out[80]:
MSFT 11
AAPL 11
dtype: int64
one_mon_hist[['MSFT', 'AAPL']].idxmax()
Out[81]:
MSFT 3
AAPL 2
dtype: int64
#整体描述
one_mon_hist.describe()
Out[82]:
MSFT AAPL
count 22.000000 22.000000
mean 47.493182 112.411364
std 0.933077 2.388772
min 45.160000 106.750000
25% 46.967500 111.660000
50% 47.625000 112.530000
75% 48.125000 114.087500
max 48.840000 115.930000
# String data with one missing value. np.nan is used because the np.NaN alias
# was removed in NumPy 2.0.
s = pd.Series(['a', 'a', 'b', 'c', np.nan])
# For an object Series, describe() reports count/unique/top/freq.
s.describe()
Out[84]:
count 4
unique 3
top a
freq 2
dtype: object
#数量统计
s.count()
Out[85]: 4
#唯一性统计
s.unique()
Out[86]: array(['a', 'b', 'c', nan], dtype=object)