2.9 - 3.2 pandas - 代码天地

2.9 结构化数据

import numpy as np

# 使用复合数据类型（compound dtype）创建结构化数组

data = np.zeros(4, dtype={'names': ('name', 'age', 'weight'),

                         'formats': ('U10', 'i4', 'f8')})

data.dtype

dtype([('name', '<U10'), ('age', '<i4'), ('weight', '<f8')])

# 填入数据

name = ['Alice', 'Bob', 'Cathy', 'Doug']

age = [25, 45, 34, 19]

weight = [55.0, 86.5, 68, 61.5]

data['name'] = name

data['age'] = age

data['weight'] = weight

data

array([('Alice', 25, 55. ), ('Bob', 45, 86.5), ('Cathy', 34, 68. ),
       ('Doug', 19, 61.5)],
      dtype=[('name', '<U10'), ('age', '<i4'), ('weight', '<f8')])

# 访问

# 获取所有名字

data['name']

array(['Alice', 'Bob', 'Cathy', 'Doug'], dtype='<U10')

# 获取第一行

data[0]

('Alice', 25, 55.)

# 获取最后一行的名字

data[-1]['name']

'Doug'

# 利用掩码筛选

# 年龄小于30的数据行的名字字段

data[data['age'] < 30]['name']

array(['Alice', 'Doug'], dtype='<U10')

3.2 Pandas 对象

import pandas as pd

pd.__version__

'0.23.3'

Pandas 的 Series 对象

Series 是由带索引的数据构成的一维数组，可以用一个数组来创建。

data = pd.Series([0.25, 0.5, 0.75, 1.0])

data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

data.values

array([0.25, 0.5 , 0.75, 1.  ])

data.index

RangeIndex(start=0, stop=4, step=1)

# 索引访问

data[1]

0.5

data[1:3]

1    0.50
2    0.75
dtype: float64

# 索引可以显式定义

data = pd.Series([0.25, 0.5, 0.75, 1], index=['a', 'b', 'c', 'd'])

data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

# Series 是特殊的字典

# 可以用python的字典创建Series对象

popu_dict={'e': 6622, 'b': 5644, 'c': 9022, 'd': 1222111}

popu = pd.Series(popu_dict)

popu

e       6622
b       5644
c       9022
d    1222111
dtype: int64

# 与一般字典不同，Series支持切片

popu['b':'d']

b       5644
c       9022
d    1222111
dtype: int64

# 创建Series对象的其他方法

# 值为标量时，该标量会被重复填充到每个索引上

pd.Series(5, index=[100, 200, 10])

100    5
200    5
10     5
dtype: int64

# 值为字典，索引默认为字典键（Python 3.6+ 且 pandas 0.23+ 时保持键的插入顺序，不再排序）

pd.Series({2: 'a', 1: 'b', 3: 'c'})

2    a
1    b
3    c
dtype: object

# 每种形式都可以显式指定索引，从而筛选需要的结果

pd.Series({2: 'a', 1: 'b', 3: 'c'}, index=[3, 2])

3    c
2    a
dtype: object

Pandas 的 DataFrame 对象

既可以作为通用的 Numpy 数组，又可以作为特殊的 Python 字典来看待

# DF 是既有灵活行索引，又有灵活列名的二维数组

area_dict = {'e': 50, 'b': 46, 'c': 66, 'd': 211}

area = pd.Series(area_dict)

area

e     50
b     46
c     66
d    211
dtype: int64

# 结合上边的popu和area创建一个DF对象

# 用字典作为参数创建

states = pd.DataFrame({'popu': popu, 'area': area})

states

	popu	area
e	6622	50
b	5644	46
c	9022	66
d	1222111	211

# DF的行索引

states.index

Index(['e', 'b', 'c', 'd'], dtype='object')

# DF的列名

states.columns

Index(['popu', 'area'], dtype='object')

# DF可看作字典，用列名当键进行索引，返回该列对应的 Series 对象

states['area']

e     50
b     46
c     66
d    211
Name: area, dtype: int64

创建DF对象的多个方法：

# 1 通过单个Series对象创建单列DF

pd.DataFrame(popu, columns=['population'])

	population
e	6622
b	5644
c	9022
d	1222111

# 2 通过字典列表创建。任何元素是字典的列表都可以变为DF

# 行索引未指定，默认为整数

data = [{'a': i+5, 'b': 2*(i+5)} for i in range(3)]

data

[{'a': 5, 'b': 10}, {'a': 6, 'b': 12}, {'a': 7, 'b': 14}]

pd.DataFrame(data)

	a	b
0	5	10
1	6	12
2	7	14

# 即使某些字典缺少部分键，DF 也会用 NaN（Not a Number，非数字）填充缺失值

pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}])

	a	b	c
0	1.0	2	NaN
1	NaN	3	4.0

# 3 通过Series对象字典创建DF，如前

pd.DataFrame({'population': popu, 'area': area})

	population	area
e	6622	50
b	5644	46
c	9022	66
d	1222111	211

# 4 通过Numpy二维数组创建。

# 如不显式指定行列的索引值，则默认均为整数索引值

pd.DataFrame(np.random.rand(3, 2), columns=['foo', 'bar'], index=['a','b','c'])

	foo	bar
a	0.761191	0.647341
b	0.147634	0.271593
c	0.438096	0.308335

# 5 通过Numpy结构化数组创建。

A = np.zeros(3, dtype=[('A', 'i8'), ('B', 'f8')])

array([(0, 0.), (0, 0.), (0, 0.)], dtype=[('A', '<i8'), ('B', '<f8')])

pd.DataFrame(A)

	A	B
0	0	0.0
1	0	0.0
2	0	0.0

Pandas 的 Index 对象

可看作是不可变数组或有序集合（严格来说是多重集合 multiset，因为值可以重复）

ind = pd.Index([2, 3, 5, 7, 11])

ind

Int64Index([2, 3, 5, 7, 11], dtype='int64')

# 可索引可切片

ind[3]

ind[::2]

Int64Index([2, 5, 11], dtype='int64')

# 属性与np数组相似

print(ind.size, ind.shape, ind.ndim, ind.dtype)

5 (5,) 1 int64

# index对象的值是不可更改的，如下句会出错：

ind[0] = 8

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-41-11ff608b5603> in <module>()
      1 # index对象的值是不可更改的，如下句会出错：
----> 2 ind[0] = 8

c:\program files\python36-32\lib\site-packages\pandas\core\indexes\base.py in __setitem__(self, key, value)
   2063 
   2064     def __setitem__(self, key, value):
-> 2065         raise TypeError("Index does not support mutable operations")
   2066 
   2067     def __getitem__(self, key):

TypeError: Index does not support mutable operations

# Index 对象的集合操作（注意：&、|、^ 运算符写法自 pandas 1.1 起已弃用，新版请改用 intersection()、union()、symmetric_difference() 方法）

indA = pd.Index([1, 3, 5, 7, 9])

indB = pd.Index([2, 3, 5, 7, 11])

# 交集

indA & indB

Int64Index([3, 5, 7], dtype='int64')

# 并集

indA | indB

Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

# 异或

indA ^ indB

Int64Index([1, 2, 9, 11], dtype='int64')

2.9 - 3.2 pandas

2.9 结构化数据

3.2 Pandas 对象

Pandas 的 Series 对象

Pandas 的 DataFrame 对象

Pandas 的 Index 对象

猜你喜欢