import pandas as pd
from pandas import Series,DataFrame
import numpy as np
1. Series
1.1 Series的创建
# Build a Series from a plain Python list; no index is passed here.
obj = Series(data=[4, 7, -5, 3])
obj
0 4
1 7
2 -5
3 3
dtype: int64
1.2 Series的索引切片
obj[2]
-5
obj[1:3]
1 7
2 -5
dtype: int64
obj.index
RangeIndex(start=0, stop=4, step=1)
obj.values
array([ 4, 7, -5, 3], dtype=int64)
# Series with explicit string labels 'a'..'e' supplied through `index`.
ships = Series(
    data=['企业', '拉菲', '独角兽', '长门', '光辉'],
    index=list('abcde'),
)
ships
a 企业
b 拉菲
c 独角兽
d 长门
e 光辉
dtype: object
ships['c']
'独角兽'
ships[['a','c','e']]
a 企业
c 独角兽
e 光辉
dtype: object
# A dict converts directly to a Series: its keys become the index labels
# and its values become the data.
students = {
    '01': 'Rachel',
    '02': 'Zack',
    '03': 'Daniel',
}
obj2 = Series(data=students)
obj2
01 Rachel
02 Zack
03 Daniel
dtype: object
# Same dict, but with an explicit index: '04' is not a key of `students`.
obj2_new = Series(students,index=['01','02','04'])
obj2_new
'''
在字典中逐个查找是否有与索引列表(Index)相对应的Key值。
如果有,则取出该Key对应的Value;如果没有,则该位置的值为NaN。
'''
01 Rachel
02 Zack
04 NaN
dtype: object
# Integers 0..3 labelled 'a'..'d'.
obj = Series(data=np.arange(4), index=['a', 'b', 'c', 'd'])
obj
a 0
b 1
c 2
d 3
dtype: int32
# Element-wise comparison: one boolean per entry of obj.
mask =obj<2
mask
# Boolean indexing keeps only the entries where the mask is True.
obj[mask]
a 0
b 1
dtype: int32
2. DataFrame
2.1 DataFrame的创建
# 4x3 array of random samples wrapped in a DataFrame; no row or column
# labels are passed.
d2 = np.random.randn(4, 3)
df = DataFrame(data=d2)
df
|  | 0 | 1 | 2 |
|---|---|---|---|
| 0 | -0.038979 | -0.241431 | 0.355413 |
| 1 | 0.742920 | -1.306156 | 0.254369 |
| 2 | -1.010694 | 1.096730 | -1.198117 |
| 3 | -1.936400 | 1.464292 | 0.081845 |
# Column-oriented source data: each key is a column label and each list
# holds that column's values.
data = {
    "地区": ['上海', '北京', '江苏', '浙江'],
    "年份": [2015, 2016, 2017, 2018],
    "人口": [0.8, 1.2, 1.3, 1.6],
}
# '面积' is listed in `columns` but is not a key of `data`, so that column
# comes out as NaN.
pop = DataFrame(data=data, columns=["地区", "年份", "人口", '面积'])
pop
|  | 地区 | 年份 | 人口 | 面积 |
|---|---|---|---|---|
| 0 | 上海 | 2015 | 0.8 | NaN |
| 1 | 北京 | 2016 | 1.2 | NaN |
| 2 | 江苏 | 2017 | 1.3 | NaN |
| 3 | 浙江 | 2018 | 1.6 | NaN |
2.2 DataFrame的索引,切片
# Selecting a single column by its label yields a Series named after that column.
area = pop['地区']
area
0 上海
1 北京
2 江苏
3 浙江
Name: 地区, dtype: object
pop.人口
0 0.8
1 1.2
2 1.3
3 1.6
Name: 人口, dtype: float64
pop[['地区','人口']]
|  | 地区 | 人口 |
|---|---|---|
| 0 | 上海 | 0.8 |
| 1 | 北京 | 1.2 |
| 2 | 江苏 | 1.3 |
| 3 | 浙江 | 1.6 |
pop[1:3]
|  | 地区 | 年份 | 人口 | 面积 |
|---|---|---|---|---|
| 1 | 北京 | 2016 | 1.2 | NaN |
| 2 | 江苏 | 2017 | 1.3 | NaN |
# Assigning a scalar to a new column label creates the column and fills
# every row with that value.
pop['debt'] = 16.5
pop
|  | 地区 | 年份 | 人口 | 面积 | debt |
|---|---|---|---|---|---|
| 0 | 上海 | 2015 | 0.8 | NaN | 16.5 |
| 1 | 北京 | 2016 | 1.2 | NaN | 16.5 |
| 2 | 江苏 | 2017 | 1.3 | NaN | 16.5 |
| 3 | 浙江 | 2018 | 1.6 | NaN | 16.5 |
# Overwrite the existing 'debt' column with the array 0..3, one value per row.
pop['debt'] = np.arange(4)
pop
|  | 地区 | 年份 | 人口 | 面积 | debt |
|---|---|---|---|---|---|
| 0 | 上海 | 2015 | 0.8 | NaN | 0 |
| 1 | 北京 | 2016 | 1.2 | NaN | 1 |
| 2 | 江苏 | 2017 | 1.3 | NaN | 2 |
| 3 | 浙江 | 2018 | 1.6 | NaN | 3 |
# Source records for a second, independent population table.
data1 = {
    "地区": ['上海', '北京', '江苏', '浙江'],
    "年份": [2015, 2016, 2017, 2018],
    "人口": [0.8, 1.2, 1.3, 1.6],
}
# Build the frame from data1 itself so this cell does not depend on the
# earlier `data` variable and data1 is actually used.
# '面积' is requested in `columns` but is not a key of data1, so that
# column is filled with NaN.
pop1 = DataFrame(data1, columns=["地区", "年份", "人口", '面积'])
pop1
|  | 地区 | 年份 | 人口 | 面积 |
|---|---|---|---|---|
| 0 | 上海 | 2015 | 0.8 | NaN |
| 1 | 北京 | 2016 | 1.2 | NaN |
| 2 | 江苏 | 2017 | 1.3 | NaN |
| 3 | 浙江 | 2018 | 1.6 | NaN |
# Note: this rebinds `area`, which earlier held the '地区' column of pop.
area = Series([21,34,56,23],index=[0,1,2,3])
# Fill the '面积' column from the Series; its index 0..3 matches pop's row labels.
pop['面积'] = area
pop
|  | 地区 | 年份 | 人口 | 面积 | debt |
|---|---|---|---|---|---|
| 0 | 上海 | 2015 | 0.8 | 21 | 0 |
| 1 | 北京 | 2016 | 1.2 | 34 | 1 |
| 2 | 江苏 | 2017 | 1.3 | 56 | 2 |
| 3 | 浙江 | 2018 | 1.6 | 23 | 3 |
# Add a boolean column that is True where '人口' is greater than 1.
# NOTE(review): the column name says "greater than 100 million", so '人口'
# is presumably expressed in units of 亿 (100 million) — confirm.
pop['人口是否大于一亿'] = pop['人口']>1
pop
|  | 地区 | 年份 | 人口 | 面积 | debt | 人口是否大于一亿 |
|---|---|---|---|---|---|---|
| 0 | 上海 | 2015 | 0.8 | 21 | 0 | False |
| 1 | 北京 | 2016 | 1.2 | 34 | 1 | True |
| 2 | 江苏 | 2017 | 1.3 | 56 | 2 | True |
| 3 | 浙江 | 2018 | 1.6 | 23 | 3 | True |
# `del` removes the column from pop itself (in place).
del pop['人口是否大于一亿']
pop
|  | 地区 | 年份 | 人口 | 面积 | debt |
|---|---|---|---|---|---|
| 0 | 上海 | 2015 | 0.8 | 21 | 0 |
| 1 | 北京 | 2016 | 1.2 | 34 | 1 |
| 2 | 江苏 | 2017 | 1.3 | 56 | 2 |
| 3 | 浙江 | 2018 | 1.6 | 23 | 3 |
pop.columns
Index(['地区', '年份', '人口', '面积', 'debt'], dtype='object')
2.3 嵌套字典创建DataFrame
# Nested dict: the outer keys become the column labels and the inner keys
# become the row labels.
# NOTE: this reuses the name `pop`, which now refers to a plain dict.
pop = {
    'mars': {2001: 2.4, 2002: 2.9},
    'onio': {2000: 1.5, 1999: 3.6},
}
frame1 = DataFrame(data=pop)
frame1
|  | mars | onio |
|---|---|---|
| 1999 | NaN | 3.6 |
| 2000 | NaN | 1.5 |
| 2001 | 2.4 | NaN |
| 2002 | 2.9 | NaN |
# 4x4 frame holding 0..15 row by row; rows are labelled by province and no
# column labels are passed.
values = np.arange(16).reshape((4, 4))
df = DataFrame(data=values, index=['安徽', '湖南', '广东', '河南'])
del values  # keep the cell's visible names limited to `df`
df
|  | 0 | 1 | 2 | 3 |
|---|---|---|---|---|
| 安徽 | 0 | 1 | 2 | 3 |
| 湖南 | 4 | 5 | 6 | 7 |
| 广东 | 8 | 9 | 10 | 11 |
| 河南 | 12 | 13 | 14 | 15 |
'''
df.drop(labels,axis)
labels 单个名称或列表 要删除的行或者列的名称,可以是单个名称,也可以是名称组成的列表
axis 整型 0,删除行;1,删除列
此方法不修改原数据,而是返回删除后的新对象
'''
# Drop the row labelled '河南' (axis=0 selects rows); the result is bound to df1.
df1 = df.drop('河南',axis = 0)
df1
|  | 0 | 1 | 2 | 3 |
|---|---|---|---|---|
| 安徽 | 0 | 1 | 2 | 3 |
| 湖南 | 4 | 5 | 6 | 7 |
| 广东 | 8 | 9 | 10 | 11 |
# Drop the columns labelled 0 and 3 (axis=1 selects columns).
# df itself was not changed by the earlier drop, so df2 still has the '河南' row.
df2 = df.drop([0,3],axis=1)
df2
|  | 1 | 2 |
|---|---|---|
| 安徽 | 1 | 2 |
| 湖南 | 5 | 6 |
| 广东 | 9 | 10 |
| 河南 | 13 | 14 |
obj['b':'c']
b 1
c 2
dtype: int32
3. Series和DataFrame中的数据运算
# Two Series whose indexes only partly overlap: 'a', 'c' and 'e' are shared,
# 'd' exists only in s1, and 'f' and 'g' exist only in s2.
s1 = Series(data=[7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = Series(data=[-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])
print('s1:\n', s1)
print('s2:\n', s2)
s1:
a 7.3
c -2.5
d 3.4
e 1.5
dtype: float64
s2:
a -2.1
c 3.6
e -1.5
f 4.0
g 3.1
dtype: float64
s1 + s2
a 5.2
c 1.1
d NaN
e 0.0
f NaN
g NaN
dtype: float64
# Two frames that share only some row labels ('ohio', 'texas') and only
# some column labels ('b', 'd').
df1 = DataFrame(
    data=np.arange(9).reshape((3, 3)),
    index=['ohio', 'texas', 'Colorado'],
    columns=['b', 'c', 'd'],
)
df2 = DataFrame(
    data=np.arange(12).reshape((4, 3)),
    index=['utah', 'ohio', 'texas', 'Oregon'],
    columns=['b', 'd', 'e'],
)
df1
|  | b | c | d |
|---|---|---|---|
| ohio | 0 | 1 | 2 |
| texas | 3 | 4 | 5 |
| Colorado | 6 | 7 | 8 |
df2
|  | b | d | e |
|---|---|---|---|
| utah | 0 | 1 | 2 |
| ohio | 3 | 4 | 5 |
| texas | 6 | 7 | 8 |
| Oregon | 9 | 10 | 11 |
df1 + df2
|  | b | c | d | e |
|---|---|---|---|---|
| Colorado | NaN | NaN | NaN | NaN |
| Oregon | NaN | NaN | NaN | NaN |
| ohio | 3.0 | NaN | 6.0 | NaN |
| texas | 9.0 | NaN | 12.0 | NaN |
| utah | NaN | NaN | NaN | NaN |