Python中dataframe数据的基本操作、索引的变换、行列选择、简单描述

关注微信号：小程在线

关注CSDN博客：程志伟的博客

import numpy as np

import pandas as pd

#### 1.1简单数据操作 ####

#生成数据

df1=pd.DataFrame(np.array([[10,20],[30,40]]))

df1

Out[2]:

0 1

0 10 20

1 30 40

df2 = pd.DataFrame([pd.Series(np.arange(1, 8)),pd.Series(np.arange(11, 18))])

df2

Out[3]:

0 1 2 3 4 5 6

0 1 2 3 4 5 6 7

1 11 12 13 14 15 16 17

#查看数据的维度

df1.shape

Out[4]: (2, 2)

df2.shape

Out[5]: (2, 7)

#增加索引名称

df3=pd.DataFrame(np.array([[10,20],[30,40]]),index=['a','b'],columns=['c1','c2'])

df3

Out[6]:

c1 c2

a 10 20

b 30 40

#查看列的名字

df3.columns

Out[7]: Index(['c1', 'c2'], dtype='object')

df3.columns[1]

Out[8]: 'c2'

#对列名重新定义

df3.columns=['column1','column2']

df3

Out[9]:

column1 column2

a 10 20

b 30 40

#查看索引的名字

df3.index

Out[10]: Index(['a', 'b'], dtype='object')

#重新创建数据

s1 = pd.Series(np.arange(1, 9, 2))

s2 = pd.Series(np.arange(2, 10, 2))

df4=pd.DataFrame({'c1': s1, 'c2': s2})

df4

Out[11]:

c1 c2

0 1 2

1 3 4

2 5 6

3 7 8

#按照相同索引进行数据合并

s3 = pd.Series(np.arange(5, 7), index=[1, 2])

df5 = pd.DataFrame({'c1': s1, 'c2': s2, 'c3': s3})

df5

Out[12]:

c1 c2 c3

0 1 2 NaN

1 3 4 5.0

2 5 6 6.0

3 7 8 NaN

#查看df5的信息

df5.info()

<class 'pandas.core.frame.DataFrame'>

Int64Index: 4 entries, 0 to 3

Data columns (total 3 columns):

c1 4 non-null int32

c2 4 non-null int32

c3 2 non-null float64

dtypes: float64(1), int32(2)

memory usage: 96.0 bytes

[图片转存失败]

##### 1.2 read_csv读取数据 ####

eu12=pd.read_csv('H:/0date/Eueo2012.csv',index_col='Team')

eu12

Out[14]:

Goals Shots on target ... Subs off Players Used

Team ...

Croatia 4 13 ... 9 16

Czech Republic 4 13 ... 11 19

Denmark 4 10 ... 7 15

England 5 11 ... 11 16

France 3 22 ... 11 19

Germany 10 32 ... 15 17

Greece 5 8 ... 12 20

Italy 6 34 ... 18 19

Netherlands 2 12 ... 7 15

Poland 2 15 ... 7 17

Portugal 6 22 ... 14 16

Republic of Ireland 1 7 ... 10 17

Russia 5 9 ... 7 16

Spain 12 42 ... 17 18

Sweden 5 17 ... 9 18

Ukraine 2 7 ... 9 18

[16 rows x 34 columns]

#查看数据的前五行

eu12.head(5)

Out[15]:

Goals Shots on target ... Subs off Players Used

Team ...

Croatia 4 13 ... 9 16

Czech Republic 4 13 ... 11 19

Denmark 4 10 ... 7 15

England 5 11 ... 11 16

France 3 22 ... 11 19

[5 rows x 34 columns]

#查看数据的后五行

eu12.tail(5)

Out[16]:

Goals Shots on target ... Subs off Players Used

Team ...

Republic of Ireland 1 7 ... 10 17

Russia 5 9 ... 7 16

Spain 12 42 ... 17 18

Sweden 5 17 ... 9 18

Ukraine 2 7 ... 9 18

[5 rows x 34 columns]

#查看数据的行数

len(eu12)

Out[17]: 16

#查看数据的维度

eu12.shape

Out[18]: (16, 34)

#查看数据的行数

eu12.shape[0]

Out[19]: 16

#查看数据的索引

eu12.index

Out[20]:

Index(['Croatia', 'Czech Republic', 'Denmark', 'England', 'France', 'Germany',

'Greece', 'Italy', 'Netherlands', 'Poland', 'Portugal',

'Republic of Ireland', 'Russia', 'Spain', 'Sweden', 'Ukraine'],

dtype='object', name='Team')

#查看数据的列名

eu12.columns

Out[21]:

Index(['Goals', 'Shots on target', 'Shots off target', 'Shooting Accuracy',

'% Goals-to-shots', 'Total shots (inc. Blocked)', 'Hit Woodwork',

'Penalty goals', 'Penalties not scored', 'Headed goals', 'Passes',

'Passes completed', 'Passing Accuracy', 'Touches', 'Crosses',

'Dribbles', 'Corners Taken', 'Tackles', 'Clearances', 'Interceptions',

'Clearances off line', 'Clean Sheets', 'Blocks', 'Goals conceded',

'Saves made', 'Saves-to-shots ratio', 'Fouls Won', 'Fouls Conceded',

'Offsides', 'Yellow Cards', 'Red Cards', 'Subs on', 'Subs off',

'Players Used'],

dtype='object')

#查看指定列的前五行

eu12[['Shots on target','Shots off target']].head()

Out[22]:

Shots on target Shots off target

Team

Croatia 13 12

Czech Republic 13 18

Denmark 10 10

England 11 18

France 22 24

#查看列的位置

eu12.columns.get_loc('Red Cards')

Out[23]: 30

#数据的复制

eu12_goals=eu12.copy()

eu12_goals

Out[24]:

Goals Shots on target ... Subs off Players Used

Team ...

Croatia 4 13 ... 9 16

Czech Republic 4 13 ... 11 19

Denmark 4 10 ... 7 15

England 5 11 ... 11 16

France 3 22 ... 11 19

Germany 10 32 ... 15 17

Greece 5 8 ... 12 20

Italy 6 34 ... 18 19

Netherlands 2 12 ... 7 15

Poland 2 15 ... 7 17

Portugal 6 22 ... 14 16

Republic of Ireland 1 7 ... 10 17

Russia 5 9 ... 7 16

Spain 12 42 ... 17 18

Sweden 5 17 ... 9 18

Ukraine 2 7 ... 9 18

[16 rows x 34 columns]

#数据的前五行

eu12[:5]

Out[25]:

Goals Shots on target ... Subs off Players Used

Team ...

Croatia 4 13 ... 9 16

Czech Republic 4 13 ... 11 19

Denmark 4 10 ... 7 15

England 5 11 ... 11 16

France 3 22 ... 11 19

[5 rows x 34 columns]

#数据的特定行

eu12['Croatia':'Denmark']

Out[26]:

Goals Shots on target ... Subs off Players Used

Team ...

Croatia 4 13 ... 9 16

Czech Republic 4 13 ... 11 19

Denmark 4 10 ... 7 15

[3 rows x 34 columns]

#使用loc查看数据的特定行

eu12.loc['Denmark']

Out[27]:

Goals 4

Shots on target 10

Shots off target 10

Shooting Accuracy 50.0%

% Goals-to-shots 20.0%

Total shots (inc. Blocked) 27

Hit Woodwork 1

Penalty goals 0

Penalties not scored 0

Headed goals 3

Passes 1298

Passes completed 1082

Passing Accuracy 83.3%

Touches 1873

Crosses 43

Dribbles 32

Corners Taken 16

Tackles 40

Clearances 61

Interceptions 59

Clearances off line 0

Clean Sheets 1

Blocks 10

Goals conceded 5

Saves made 10

Saves-to-shots ratio 66.7%

Fouls Won 25

Fouls Conceded 38

Offsides 8

Yellow Cards 4

Red Cards 0

Subs on 7

Subs off 7

Players Used 15

Name: Denmark, dtype: object

eu12.loc[['Denmark','France']]

Out[28]:

Goals Shots on target ... Subs off Players Used

Team ...

Denmark 4 10 ... 7 15

France 3 22 ... 11 19

[2 rows x 34 columns]

#使用iloc查看数据的特定行

eu12.iloc[[2,4]]

Out[29]:

Goals Shots on target ... Subs off Players Used

Team ...

Denmark 4 10 ... 7 15

France 3 22 ... 11 19

[2 rows x 34 columns]

#查看特定索引的位置

eu12.index.get_loc('France')

Out[30]: 4

#使用ix查看数据的特定行，可以使用位置序号或索引标签（注意：ix 自 pandas 0.20 起已弃用，1.0 起已移除，新版本请改用 loc/iloc）

eu12.ix[[2,4]]

Out[31]:

Goals Shots on target ... Subs off Players Used

Team ...

Denmark 4 10 ... 7 15

France 3 22 ... 11 19

[2 rows x 34 columns]

eu12.ix[['Denmark','France']]

Out[32]:

Goals Shots on target ... Subs off Players Used

Team ...

Denmark 4 10 ... 7 15

France 3 22 ... 11 19

[2 rows x 34 columns]

#使用at查看数据数值

eu12.at['France','Goals']

Out[33]: 3

eu12[2:4][['Goals','Shots on target']]

Out[34]:

Goals Shots on target

Team

Denmark 4 10

England 5 11

#布尔值判断

eu12.Goals>4

Out[35]:

Team

Croatia False

Czech Republic False

Denmark False

England True

France False

Germany True

Greece True

Italy True

Netherlands False

Poland False

Portugal True

Republic of Ireland False

Russia True

Spain True

Sweden True

Ukraine False

Name: Goals, dtype: bool

#布尔值选择

eu12[eu12.Goals>4]

Out[36]:

Goals Shots on target ... Subs off Players Used

Team ...

England 5 11 ... 11 16

Germany 10 32 ... 15 17

Greece 5 8 ... 12 20

Italy 6 34 ... 18 19

Portugal 6 22 ... 14 16

Russia 5 9 ... 7 16

Spain 12 42 ... 17 18

Sweden 5 17 ... 9 18

[8 rows x 34 columns]

#布尔值选择（与）

eu12_goals[(eu12_goals.Goals>4)&(eu12_goals.Touches>2000 )]

Out[37]:

Goals Shots on target ... Subs off Players Used

Team ...

England 5 11 ... 11 16

Germany 10 32 ... 15 17

Greece 5 8 ... 12 20

Italy 6 34 ... 18 19

Portugal 6 22 ... 14 16

Russia 5 9 ... 7 16

Spain 12 42 ... 17 18

[7 rows x 34 columns]

#### 1.3 修改dataframe ###

#使用rename对列名重命名，返回新的数据

eu12_goals.rename(columns={'Goals':'goals'})

Out[38]:

goals Shots on target ... Subs off Players Used

Team ...

Croatia 4 13 ... 9 16

Czech Republic 4 13 ... 11 19

Denmark 4 10 ... 7 15

England 5 11 ... 11 16

France 3 22 ... 11 19

Germany 10 32 ... 15 17

Greece 5 8 ... 12 20

Italy 6 34 ... 18 19

Netherlands 2 12 ... 7 15

Poland 2 15 ... 7 17

Portugal 6 22 ... 14 16

Republic of Ireland 1 7 ... 10 17

Russia 5 9 ... 7 16

Spain 12 42 ... 17 18

Sweden 5 17 ... 9 18

Ukraine 2 7 ... 9 18

[16 rows x 34 columns]

#使用inplace改变原来的数据集

eu12_goals.rename(columns={'Goals':'goals'},inplace=True)

eu12_goals

Out[39]:

goals Shots on target ... Subs off Players Used

Team ...

Croatia 4 13 ... 9 16

Czech Republic 4 13 ... 11 19

Denmark 4 10 ... 7 15

England 5 11 ... 11 16

France 3 22 ... 11 19

Germany 10 32 ... 15 17

Greece 5 8 ... 12 20

Italy 6 34 ... 18 19

Netherlands 2 12 ... 7 15

Poland 2 15 ... 7 17

Portugal 6 22 ... 14 16

Republic of Ireland 1 7 ... 10 17

Russia 5 9 ... 7 16

Spain 12 42 ... 17 18

Sweden 5 17 ... 9 18

Ukraine 2 7 ... 9 18

[16 rows x 34 columns]

#使用insert插入新的列

eu12_goals.insert(1,'on_target_percent',eu12_goals.Touches/100)

eu12_goals

Out[40]:

goals on_target_percent ... Subs off Players Used

Team ...

Croatia 4 17.06 ... 9 16

Czech Republic 4 23.58 ... 11 19

Denmark 4 18.73 ... 7 15

England 5 24.40 ... 11 16

France 3 29.09 ... 11 19

Germany 10 37.61 ... 15 17

Greece 5 20.16 ... 12 20

Italy 6 43.63 ... 18 19

Netherlands 2 21.63 ... 7 15

Poland 2 17.24 ... 7 17

Portugal 6 29.58 ... 14 16

Republic of Ireland 1 14.33 ... 10 17

Russia 5 22.78 ... 7 16

Spain 12 55.85 ... 17 18

Sweden 5 18.06 ... 9 18

Ukraine 2 18.94 ... 9 18

[16 rows x 35 columns]

#在最后一列生成新的数据

eu12_goals['on_target_ratio']=eu12_goals.Touches/(eu12_goals.Touches+eu12_goals.Touches)

eu12_goals

Out[41]:

goals on_target_percent ... Players Used on_target_ratio

Team ...

Croatia 4 17.06 ... 16 0.5

Czech Republic 4 23.58 ... 19 0.5

Denmark 4 18.73 ... 15 0.5

England 5 24.40 ... 16 0.5

France 3 29.09 ... 19 0.5

Germany 10 37.61 ... 17 0.5

Greece 5 20.16 ... 20 0.5

Italy 6 43.63 ... 19 0.5

Netherlands 2 21.63 ... 15 0.5

Poland 2 17.24 ... 17 0.5

Portugal 6 29.58 ... 16 0.5

Republic of Ireland 1 14.33 ... 17 0.5

Russia 5 22.78 ... 16 0.5

Spain 12 55.85 ... 18 0.5

Sweden 5 18.06 ... 18 0.5

Ukraine 2 18.94 ... 18 0.5

[16 rows x 36 columns]

#使用del删除数据

del eu12_goals['on_target_ratio']

eu12_goals

Out[42]:

goals on_target_percent ... Subs off Players Used

Team ...

Croatia 4 17.06 ... 9 16

Czech Republic 4 23.58 ... 11 19

Denmark 4 18.73 ... 7 15

England 5 24.40 ... 11 16

France 3 29.09 ... 11 19

Germany 10 37.61 ... 15 17

Greece 5 20.16 ... 12 20

Italy 6 43.63 ... 18 19

Netherlands 2 21.63 ... 7 15

Poland 2 17.24 ... 7 17

Portugal 6 29.58 ... 14 16

Republic of Ireland 1 14.33 ... 10 17

Russia 5 22.78 ... 7 16

Spain 12 55.85 ... 17 18

Sweden 5 18.06 ... 9 18

Ukraine 2 18.94 ... 9 18

[16 rows x 35 columns]

#pop函数，显示删掉的数据

eu12_goals.pop('Players Used')

Out[43]:

Team

Croatia 16

Czech Republic 19

Denmark 15

England 16

France 19

Germany 17

Greece 20

Italy 19

Netherlands 15

Poland 17

Portugal 16

Republic of Ireland 17

Russia 16

Spain 18

Sweden 18

Ukraine 18

Name: Players Used, dtype: int64

#drop删除数据，不改变原来的数据

eu12_goals.drop(['on_target_percent'],axis=1)

Out[44]:

goals Shots on target ... Subs on Subs off

Team ...

Croatia 4 13 ... 9 9

Czech Republic 4 13 ... 11 11

Denmark 4 10 ... 7 7

England 5 11 ... 11 11

France 3 22 ... 11 11

Germany 10 32 ... 15 15

Greece 5 8 ... 12 12

Italy 6 34 ... 18 18

Netherlands 2 12 ... 7 7

Poland 2 15 ... 7 7

Portugal 6 22 ... 14 14

Republic of Ireland 1 7 ... 10 10

Russia 5 9 ... 7 7

Spain 12 42 ... 17 17

Sweden 5 17 ... 9 9

Ukraine 2 7 ... 9 9

[16 rows x 33 columns]

#### 1.4 修改dataframe的行 ####

df1=eu12_goals.iloc[:5]

print(df1)

goals on_target_percent ... Subs on Subs off

Team ...

Croatia 4 17.06 ... 9 9

Czech Republic 4 23.58 ... 11 11

Denmark 4 18.73 ... 7 7

England 5 24.40 ... 11 11

France 3 29.09 ... 11 11

[5 rows x 34 columns]

df2=eu12_goals.iloc[[10,11,12]]

print(df2)

goals on_target_percent ... Subs on Subs off

Team ...

Portugal 6 29.58 ... 14 14

Republic of Ireland 1 14.33 ... 10 10

Russia 5 22.78 ... 7 7

[3 rows x 34 columns]

#行叠加

df3=df1.append(df2)

df3

Out[47]:

goals on_target_percent ... Subs on Subs off

Team ...

Croatia 4 17.06 ... 9 9

Czech Republic 4 23.58 ... 11 11

Denmark 4 18.73 ... 7 7

England 5 24.40 ... 11 11

France 3 29.09 ... 11 11

Portugal 6 29.58 ... 14 14

Republic of Ireland 1 14.33 ... 10 10

Russia 5 22.78 ... 7 7

[8 rows x 34 columns]

df4=pd.DataFrame(2,index=df1.index,columns=['changes'])

df1.append(df4)

Out[48]:

% Goals-to-shots Blocks ... goals on_target_percent

Team ...

Croatia 16.0% 10.0 ... 4.0 17.06

Czech Republic 12.9% 10.0 ... 4.0 23.58

Denmark 20.0% 10.0 ... 4.0 18.73

England 17.2% 29.0 ... 5.0 24.40

France 6.5% 7.0 ... 3.0 29.09

Croatia NaN NaN ... NaN NaN

Czech Republic NaN NaN ... NaN NaN

Denmark NaN NaN ... NaN NaN

England NaN NaN ... NaN NaN

France NaN NaN ... NaN NaN

[10 rows x 35 columns]

df1.append(df4,ignore_index=True) #忽略index,直接添加

Out[49]:

% Goals-to-shots Blocks Clean Sheets ... changes goals on_target_percent

0 16.0% 10.0 0.0 ... NaN 4.0 17.06

1 12.9% 10.0 1.0 ... NaN 4.0 23.58

2 20.0% 10.0 1.0 ... NaN 4.0 18.73

3 17.2% 29.0 2.0 ... NaN 5.0 24.40

4 6.5% 7.0 1.0 ... NaN 3.0 29.09

5 NaN NaN NaN ... 2.0 NaN NaN

6 NaN NaN NaN ... 2.0 NaN NaN

7 NaN NaN NaN ... 2.0 NaN NaN

8 NaN NaN NaN ... 2.0 NaN NaN

9 NaN NaN NaN ... 2.0 NaN NaN

[10 rows x 35 columns]

#concat连接

pd.concat([df1,df2])

Out[50]:

goals on_target_percent ... Subs on Subs off

Team ...

Croatia 4 17.06 ... 9 9

Czech Republic 4 23.58 ... 11 11

Denmark 4 18.73 ... 7 7

England 5 24.40 ... 11 11

France 3 29.09 ... 11 11

Portugal 6 29.58 ... 14 14

Republic of Ireland 1 14.33 ... 10 10

Russia 5 22.78 ... 7 7

[8 rows x 34 columns]

#删除行

df1.drop(['France'])

Out[51]:

goals on_target_percent ... Subs on Subs off

Team ...

Croatia 4 17.06 ... 9 9

Czech Republic 4 23.58 ... 11 11

Denmark 4 18.73 ... 7 7

England 5 24.40 ... 11 11

[4 rows x 34 columns]

#### 1.5 修改dataframe的index ####

sp500=pd.read_csv('H:/0date/sp500.csv')

#修改索引

sp500=sp500.set_index('Symbol')

sp500.head()

Out[53]:

Name ... SEC Filings

Symbol ...

MMM 3M Co. ... http://www.sec.gov/cgi-bin/browse-edgar?action...

ABT Abbott Laboratories ... http://www.sec.gov/cgi-bin/browse-edgar?action...

ABBV AbbVie Inc. ... http://www.sec.gov/cgi-bin/browse-edgar?action...

ACN Accenture ... http://www.sec.gov/cgi-bin/browse-edgar?action...

ACE ACE Limited ... http://www.sec.gov/cgi-bin/browse-edgar?action...

[5 rows x 14 columns]

#查看ACE行数据

sp500.loc['ACE']

Out[54]:

Name ACE Limited

Sector Financials

Price 102.91

Dividend Yield 2.21

Price/Earnings 10

Earnings/Share 10.293

Book Value 86.897

52 week low 84.73

52 week high 104.07

Market Cap 34.753

EBITDA 4.275

Price/Sales 1.79

Price/Book 1.18

SEC Filings http://www.sec.gov/cgi-bin/browse-edgar?action...

Name: ACE, dtype: object

#还原index

sp500=sp500.reset_index()

sp500.head()

Out[55]:

Symbol ... SEC Filings

0 MMM ... http://www.sec.gov/cgi-bin/browse-edgar?action...

1 ABT ... http://www.sec.gov/cgi-bin/browse-edgar?action...

2 ABBV ... http://www.sec.gov/cgi-bin/browse-edgar?action...

3 ACN ... http://www.sec.gov/cgi-bin/browse-edgar?action...

4 ACE ... http://www.sec.gov/cgi-bin/browse-edgar?action...

[5 rows x 15 columns]

#查看ACE的行数据

sp500.iloc[4]

Out[56]:

Symbol ACE

Name ACE Limited

Sector Financials

Price 102.91

Dividend Yield 2.21

Price/Earnings 10

Earnings/Share 10.293

Book Value 86.897

52 week low 84.73

52 week high 104.07

Market Cap 34.753

EBITDA 4.275

Price/Sales 1.79

Price/Book 1.18

SEC Filings http://www.sec.gov/cgi-bin/browse-edgar?action...

Name: 4, dtype: object

#### 1.6 修改dataframe的多维index ####

arrays = [['one','one','one','two','two','two'],[1,2,3,1,2,3]]

df = pd.DataFrame(np.random.randn(6,2),index=pd.MultiIndex.from_tuples(list(zip(*arrays))),columns=['A','B'])

Out[57]:

A B

one 1 0.500047 -2.090758

2 0.087545 -0.609452

3 0.213795 -0.680843

two 1 -1.436421 -0.957349

2 -0.819979 1.782266

3 -0.526769 -1.879333

#使用ix查看数据的特定行

df.ix["one"]

Out[58]:

A B

1 0.500047 -2.090758

2 0.087545 -0.609452

3 0.213795 -0.680843

df.ix["one"].ix[1]

Out[59]:

A 0.500047

B -2.090758

Name: 1, dtype: float64

#使用xs查看数据的特定行

df.xs("one")

Out[60]:

A B

1 0.500047 -2.090758

2 0.087545 -0.609452

3 0.213795 -0.680843

#取单列

df.xs("B",axis=1)

Out[61]:

one 1 -2.090758

2 -0.609452

3 -0.680843

two 1 -0.957349

2 1.782266

3 -1.879333

Name: B, dtype: float64

#### 1.7 多维数据多维索引 ####

raw_data = {'school': ['Nighthawks', 'Nighthawks', 'Nighthawks', 'Nighthawks', 'Dragoons', 'Dragoons', 'Dragoons', 'Dragoons', 'Scouts', 'Scouts', 'Scouts', 'Scouts'],

'class': ['1st', '1st', '2nd', '2nd', '1st', '1st', '2nd', '2nd','1st', '1st', '2nd', '2nd'],

'name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze', 'Jacon', 'Ryaner', 'Sone', 'Sloan', 'Piger', 'Riani', 'Ali'],

'preTestScore': [4, 24, 31, 2, 3, 4, 24, 31, 2, 3, 2, 3],

'postTestScore': [25, 94, 57, 62, 70, 25, 94, 57, 62, 70, 62, 70]}

df = pd.DataFrame(raw_data, columns = ['school', 'class', 'name', 'preTestScore', 'postTestScore'])

Out[62]:

school class name preTestScore postTestScore

0 Nighthawks 1st Miller 4 25

1 Nighthawks 1st Jacobson 24 94

2 Nighthawks 2nd Ali 31 57

3 Nighthawks 2nd Milner 2 62

4 Dragoons 1st Cooze 3 70

5 Dragoons 1st Jacon 4 25

6 Dragoons 2nd Ryaner 24 94

7 Dragoons 2nd Sone 31 57

8 Scouts 1st Sloan 2 62

9 Scouts 1st Piger 3 70

10 Scouts 2nd Riani 2 62

11 Scouts 2nd Ali 3 70

#设置多索引

df=df.set_index(['school','class'])

df.index

Out[63]:

MultiIndex(levels=[['Dragoons', 'Nighthawks', 'Scouts'], ['1st', '2nd']],

codes=[[1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 2, 2], [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]],

names=['school', 'class'])

#对数据求和

df.sum(level='school')

Out[64]:

preTestScore postTestScore

school

Nighthawks 61 238

Dragoons 62 246

Scouts 10 264

df.ix['Dragoons']

Out[65]:

name preTestScore postTestScore

class

1st Cooze 3 70

1st Jacon 4 25

2nd Ryaner 24 94

2nd Sone 31 57

#交换多层索引的层级顺序

df.swaplevel('class','school')

Out[66]:

name preTestScore postTestScore

class school

1st Nighthawks Miller 4 25

Nighthawks Jacobson 24 94

2nd Nighthawks Ali 31 57

Nighthawks Milner 2 62

1st Dragoons Cooze 3 70

Dragoons Jacon 4 25

2nd Dragoons Ryaner 24 94

Dragoons Sone 31 57

1st Scouts Sloan 2 62

Scouts Piger 3 70

2nd Scouts Riani 2 62

Scouts Ali 3 70

df = pd.DataFrame(np.random.random((4,4)))

df.columns = pd.MultiIndex.from_product([[1,2],['A','B']])

Out[67]:

1 2

A B A B

0 0.008531 0.159603 0.865589 0.526770

1 0.433940 0.792874 0.036432 0.509819

2 0.211002 0.400251 0.261920 0.491058

3 0.416693 0.371853 0.287564 0.225101

#选择特定的列

df.iloc[:, df.columns.get_level_values(1)=='A']

Out[68]:

1 2

A A

0 0.008531 0.865589

1 0.433940 0.036432

2 0.211002 0.261920

3 0.416693 0.287564

#对数据进行排序

eu12.sort_values(['Red Cards', 'Yellow Cards'], ascending = False)[['Red Cards', 'Yellow Cards']]

Out[69]:

Red Cards Yellow Cards

Team

Greece 1 9

Poland 1 7

Republic of Ireland 1 6

Italy 0 16

Portugal 0 12

Spain 0 11

Croatia 0 9

Czech Republic 0 7

Sweden 0 7

France 0 6

Russia 0 6

England 0 5

Netherlands 0 5

Ukraine 0 5

Denmark 0 4

Germany 0 4

eu12.sort_values(['Red Cards', 'Yellow Cards'], ascending = True)[['Red Cards', 'Yellow Cards']]

Out[70]:

Red Cards Yellow Cards

Team

Denmark 0 4

Germany 0 4

England 0 5

Netherlands 0 5

Ukraine 0 5

France 0 6

Russia 0 6

Czech Republic 0 7

Sweden 0 7

Croatia 0 9

Spain 0 11

Portugal 0 12

Italy 0 16

Republic of Ireland 1 6

Poland 1 7

Greece 1 9

#按照索引进行降序

sp500.sort_index(ascending=False)

Out[71]:

Symbol ... SEC Filings

499 ZTS ... http://www.sec.gov/cgi-bin/browse-edgar?action...

498 ZION ... http://www.sec.gov/cgi-bin/browse-edgar?action...

497 ZMH ... http://www.sec.gov/cgi-bin/browse-edgar?action...

496 YUM ... http://www.sec.gov/cgi-bin/browse-edgar?action...

495 YHOO ... http://www.sec.gov/cgi-bin/browse-edgar?action...

...

4 ACE ... http://www.sec.gov/cgi-bin/browse-edgar?action...

3 ACN ... http://www.sec.gov/cgi-bin/browse-edgar?action...

2 ABBV ... http://www.sec.gov/cgi-bin/browse-edgar?action...

1 ABT ... http://www.sec.gov/cgi-bin/browse-edgar?action...

0 MMM ... http://www.sec.gov/cgi-bin/browse-edgar?action...

[500 rows x 15 columns]

#### 1.8 数据的描述 ####

one_mon_hist = pd.read_csv('H:/0date/omh.csv')

one_mon_hist

Out[73]:

Date MSFT AAPL

0 2014-12-01 48.62 115.07

1 2014-12-02 48.46 114.63

2 2014-12-03 48.08 115.93

3 2014-12-04 48.84 115.49

4 2014-12-05 48.42 115.00

5 2014-12-08 47.70 112.40

6 2014-12-09 47.59 114.12

7 2014-12-10 46.90 111.95

8 2014-12-11 47.17 111.62

9 2014-12-12 46.95 109.73

10 2014-12-15 46.67 108.23

11 2014-12-16 45.16 106.75

12 2014-12-17 45.74 109.41

13 2014-12-18 47.52 112.65

14 2014-12-19 47.66 111.78

15 2014-12-22 47.98 112.94

16 2014-12-23 48.45 112.54

17 2014-12-24 48.14 112.01

18 2014-12-26 47.88 113.99

19 2014-12-29 47.45 113.91

20 2014-12-30 47.02 112.52

21 2014-12-31 46.45 110.38

#每列的平均值

one_mon_hist.mean()

Out[74]:

MSFT 47.493182

AAPL 112.411364

dtype: float64

#每行的平均值

one_mon_hist.mean(axis=1) # row

Out[75]:

0 81.845

1 81.545

2 82.005

3 82.165

4 81.710

5 80.050

6 80.855

7 79.425

8 79.395

9 78.340

10 77.450

11 75.955

12 77.575

13 80.085

14 79.720

15 80.460

16 80.495

17 80.075

18 80.935

19 80.680

20 79.770

21 78.415

dtype: float64

#方差

one_mon_hist.var()

Out[76]:

MSFT 0.870632

AAPL 5.706231

dtype: float64

#中位数

one_mon_hist.median()

Out[77]:

MSFT 47.625

AAPL 112.530

dtype: float64

#每列的最小值、最大值

one_mon_hist[['MSFT', 'AAPL']].min()

Out[78]:

MSFT 45.16

AAPL 106.75

dtype: float64

one_mon_hist[['MSFT', 'AAPL']].max()

Out[79]:

MSFT 48.84

AAPL 115.93

dtype: float64

#每列的最小值、最大值所在的索引位置

one_mon_hist[['MSFT', 'AAPL']].idxmin()

Out[80]:

MSFT 11

AAPL 11

dtype: int64

one_mon_hist[['MSFT', 'AAPL']].idxmax()

Out[81]:

MSFT 3

AAPL 2

dtype: int64

#整体描述

one_mon_hist.describe()

Out[82]:

MSFT AAPL

count 22.000000 22.000000

mean 47.493182 112.411364

std 0.933077 2.388772

min 45.160000 106.750000

25% 46.967500 111.660000

50% 47.625000 112.530000

75% 48.125000 114.087500

max 48.840000 115.930000

#字符串数据

s = pd.Series(['a', 'a', 'b', 'c', np.NaN])

s.describe()

Out[84]:

count 4

unique 3

top a

freq 2

dtype: object

#数量统计

s.count()

Out[85]: 4

#唯一性统计

s.unique()

Out[86]: array(['a', 'b', 'c', nan], dtype=object)

程志伟

发布了89 篇原创文章 · 获赞 109 · 访问量 37万+

私信关注

Python中dataframe数据的基本操作、索引的变换、行列选择、简单描述

猜你喜欢