一、numpy

练习代码

##########numpy提供了进行数组和矩阵的运算的方法

import numpy as np
# Build a one-dimensional ndarray from a plain Python list.
a = [1, 2, 3, 4]
b = np.array(a)

# Basic ndarray attributes. These bare expressions are only echoed when run
# as notebook cells; in a plain script they have no visible effect.
b.size    # total number of elements

b.shape   # length along each axis

b.ndim    # number of axes (dimensions)

b.dtype   # element data type

# 10 x 10 array filled with ones.
array_one = np.ones((10, 10))

array_one

# 10 x 10 array filled with zeros.
array_zero = np.zeros((10, 10))

array_zero

# np.array copies its input by default, so `ones` is an independent copy
# of `array_one`.
ones = np.array(array_one)

ones

# np.asarray hands back an existing ndarray unchanged (no copy is made),
# so `one2` is the very same object as `array_one`.
one2 = np.asarray(array_one)

one2

# ---- Creating random arrays ----

np.random.rand(10, 10)      # 10 x 10 array of uniform samples in [0, 1)

np.random.randint(0, 100)   # a single random integer in [0, 100)

np.random.uniform(0, 100)   # a single random float in [0, 100)

# normal(loc, scale, size): mean 1.75, standard deviation 0.1 (the second
# argument is the standard deviation, not the variance), shape (2, 3).
np.random.normal(1.75, 0.1, (2, 3))

arr = np.random.normal(1.75, 0.1, size=(4, 5))
print(arr)

# Two-dimensional slicing: rows 1-2 and columns 2-3.
after_arr = arr[1:3, 2:4]
print(after_arr)

# Twenty ones in a flat array ...
one_20 = np.ones(20)
print(one_20)

# ... rearranged into 4 rows x 5 columns (same 20 elements, new shape).
after_one20 = one_20.reshape((4, 5))
print(after_one20)

# ---- Computing with numpy (important) ----

# Five students (rows), two scores each (columns).
stus_score = np.array([[80, 88], [82, 81], [84, 75], [86, 83], [75, 81]])
print(stus_score)

# -- Conditional operations --

stus_score > 80                     # element-wise boolean mask

np.where(stus_score < 80, 0, 90)    # 0 where the score is below 80, otherwise 90

# -- Statistics along an axis --

# axis=0 reduces down the rows: one value per column.
result = stus_score.max(axis=0)
print(result)

# axis=1 reduces across the columns: one value per row.
result = stus_score.max(axis=1)
print(result)

print(stus_score.mean(axis=1))   # per-row mean
print(stus_score.min(axis=1))    # per-row minimum
print(stus_score.std(axis=0))    # per-column standard deviation

# ---- Array arithmetic ----

# The array is created as float on purpose: writing the result of `* 0.5`
# back into an integer array would silently truncate it (42.5 -> 42).
stus_score = np.array(
    [[80, 88], [82, 81], [84, 75], [86, 83], [75, 81]],
    dtype=float,
)
print(stus_score)

# Addition: add 5 to every value in the first column.
stus_score[:, 0] += 5
print(stus_score)

# Multiplication: halve every value in the first column.
stus_score[:, 0] *= 0.5
print(stus_score)

# ---- Matrix operations (important) ----

stus_score = np.array([[80, 88], [82, 81], [84, 75], [86, 83], [75, 81]])

# Column vector of weights: 0.4 for the first score, 0.6 for the second.
q = np.array([[0.4], [0.6]])

# (5 x 2) dot (2 x 1) -> (5 x 1): the weighted score of each student.
result = stus_score.dot(q)
print(result)

# Two 2 x 6 nested lists used as the inputs to stack.
v1 = [
    [0, 1, 2, 3, 4, 5],
    [6, 7, 8, 9, 10, 11],
]
print("v1为:", v1, sep="\n")

v2 = [
    [12, 13, 14, 15, 16, 17],
    [18, 19, 20, 21, 22, 23],
]
print("v2为:", v2, sep="\n")

# Vertical stacking: rows of v2 are appended below v1 -> shape (4, 6).
result = np.vstack((v1, v2))
print(result)

# Horizontal stacking: columns of v2 are appended to the right of v1 -> shape (2, 12).
print(np.hstack((v1, v2)))

# Load comma-separated data from the file '2.csv' into an array.
result = np.genfromtxt(fname='2.csv', delimiter=',')
print(result)

二、pandas

练习代码

######Pandas是基于Numpy开发出的,专门用于数据分析的开源Python库
import pandas as pd
import numpy as np

# ---- The two core pandas data structures ----

# -- Series: one-dimensional labelled data --

pd.Series(np.arange(4, 10))   # built from a numpy array

# Built from a Python list; index labels are allowed to repeat.
pd.Series([10, 20, 30], index=['beijing', 'beijing', 'shanghai'])

# Built from a Python dict: keys become the index.
pd.Series({'haha': 10, 'keke': 20, 'kk': 70})

# -- DataFrame: tabular data with both a row index and a column index --

# The values 10..21 laid out as 3 rows x 4 columns.
data34 = pd.DataFrame(data=np.arange(10, 22).reshape((3, 4)))
print(data34)

# A slice inside [] selects rows: here only the first row.
print(data34[0:1])

# A single label inside [] selects a column: here the column labelled 0.
print(data34[0])

# ---- DataFrame attributes and reading data ----

# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='skip'` is its replacement and skips malformed rows.
result = pd.read_csv('222.csv', on_bad_lines='skip')

print(result)

# Other attributes worth inspecting:
#   result.shape, result.dtypes, result.ndim, result.index, result.columns
result.values   # the underlying data as a numpy array

# ---- Looking at the data as a whole ----

print(result.head(5))   # first five rows

print(result.tail(5))   # last five rows

print(result['age'].describe())   # summary statistics of the 'age' column

print(result['name'][0:5])   # first five entries of the 'name' column

print(type(result))

# Boolean-mask filtering: keep only the rows where age is above 30.
print(result[result['age'] > 30])

# ---- Some basic analysis ----
# The expressions below return new objects that are not stored; they are
# only echoed when run as notebook cells.

result.sort_values(by='age', ascending=False)   # sort by age, oldest first

result[result['age'] == result['age'].max()]   # row(s) with the highest age

result[result['age'] == result['age'].min()]   # row(s) with the lowest age

result['age'].mean()   # average age

# ---- Handling missing data ----

# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='skip'` is its replacement and skips malformed rows.
result = pd.read_csv('222.csv', on_bad_lines='skip')

print(result)

# dropna() returns a new DataFrame and leaves `result` untouched, so the
# returned value is what has to be printed to see the effect.
print(result.dropna())

# fillna(0) likewise returns a new Series in which missing ages are 0;
# `result` itself still holds the missing values.
print(result['age'].fillna(0))

# Keep only the rows without missing values from here on.
result = result.dropna()

print(result)

# Small example: breast-cancer data preprocessing. The data is fetched
# online and the dataset's missing-value marker '?' is replaced with the
# standard marker np.nan.

# The column names are supplied via `names=`.
bcw = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data",
    names=[
        "Sample code number",
        "Clump Thickness",
        "Uniformity of Cell Size",
        "Uniformity of Cell Shape",
        "Marginal Adhesion",
        "Single Epithelial Cell Size",
        "Bare Nuclei",
        "Bland Chromatin",
        "Normal Nucleoli",
        "Mitoses",
        "Class:",
    ],
)

print(bcw.head(50))

# replace() returns a new DataFrame; the result must be assigned back,
# otherwise `bcw` keeps its '?' entries.
bcw = bcw.replace(to_replace='?', value=np.nan)

# ---- Small example: converting timestamps to dates ----

# Only the first 10 rows of the file are loaded.
testtime = pd.read_csv('time.csv', nrows=10)

testtime

# Interpret the numeric 'time' column as seconds since the Unix epoch.
testtime['time'] = pd.to_datetime(testtime['time'], unit='s')

testtime

# Build the DatetimeIndex once and derive every calendar field from it.
stamps = pd.DatetimeIndex(testtime['time'])

testtime['year'] = stamps.year
testtime['month'] = stamps.month
# NOTE(review): the column is called 'day' but stores the weekday number
# (Monday=0 ... Sunday=6), not the day of the month - confirm the intent.
testtime['day'] = stamps.weekday

testtime

# ---- Merging tables ----

user = pd.read_csv('csv/user.csv')
order = pd.read_csv('csv/order.csv')
goods = pd.read_csv('csv/goods.csv')

user

order

goods

# Left join of users with their orders on the shared 'user_id' column,
# similar to a joined query in a relational database. The key is given
# once; listing the same column twice in `on` is redundant.
uo = pd.merge(user, order, how='left', on='user_id')
uo

# Attach the goods details to every user/order row via 'goods_name'.
uog = pd.merge(uo, goods, how='left', on='goods_name')
uog

# Cross table: one row per user name, one column per goods name, each
# cell counting how often that pair occurs.
user_goods = pd.crosstab(uog["姓名"], uog["goods_name"])
user_goods

# ---- Grouping and aggregation in pandas (important) ----

# Number of non-missing entries per (age, user_id) group for each column.
uog.groupby(['age', 'user_id']).count()

三、matplotlib

练习代码

###############

#折线图

###############

import matplotlib.pyplot as plt
import random
# 保证生成的图片在浏览器内显示
%matplotlib inline
# Font that can render the Chinese labels (Mac).
plt.rcParams['font.family'] = ['Arial Unicode MS']

# Canvas settings: size in inches and resolution.
plt.figure(figsize=(10, 8), dpi=100)

# --- Hainan: simulated temperatures over one day ---
hainan_x = list(range(24))                                 # the 24 hours on the x axis
hainan_y = [random.randint(15, 25) for _ in range(24)]     # random integers in [15, 25]
plt.plot(hainan_x, hainan_y, label="海南")

# --- Beijing: simulated temperatures over one day ---
beijing_x = list(range(24))
beijing_y = [random.randint(5, 10) for _ in range(24)]     # random integers in [5, 10]
plt.plot(beijing_x, beijing_y, label="北京")

# --- Hebei: simulated temperatures over one day ---
hebei_x = beijing_x                                        # shares Beijing's x values
hebei_y = [random.randint(1, 5) for _ in range(24)]        # random integers in [1, 5]
# Custom line properties: color (e.g. "#0c8ac5"), linestyle (one of "-",
# "--", "-.", ":"), linewidth, and alpha (opacity).
plt.plot(
    hebei_x,
    hebei_y,
    label="河北",
    color="#823384",
    linestyle=":",
    linewidth=3,
    alpha=0.3,
)

# --- Axis ticks ---

# One x tick per hour, each labelled "<hour>时".
x_ = list(range(24))
x_desc = ["{}时".format(hour) for hour in x_]
plt.xticks(x_, x_desc)

# One y tick every 2 degrees from 0 to 28, each labelled "<value>℃".
y_ = list(range(0, 30, 2))
y_desc = ["{}℃".format(degree) for degree in y_]
plt.yticks(y_, y_desc)

# Axis names and chart title.
plt.xlabel("时间")
plt.ylabel("温度")
plt.title("一天内温度的变化")

# Legend, placed wherever matplotlib judges best.
plt.legend(loc="best")

# Write the chart to the current directory, then display it.
plt.savefig("./t.png")
plt.show()

################

###条形图

#################

import matplotlib.pyplot as plt
import random
# 保证生成的图片在浏览器内显示
%matplotlib inline
# Font that can render the Chinese labels (Mac).
plt.rcParams['font.family'] = ['Arial Unicode MS']

# Bar chart: ages of the main Detective Conan characters.
role_list = ["柯南", "毛利兰", "灰原哀", "琴酒", "贝尔摩德", "伏特加", "赤井秀一", "目暮十三"]
role_age = [7, 17, 7, 34, 32, 30, 27, 46]
# The characters' true ages.
role_true_age = [18, 17, 18, 34, 45, 30, 27, 46]

# x positions 1..N for the first series; the second series is shifted
# right by 0.3 so the two bars of a character sit side by side.
x = list(range(1, len(role_list) + 1))
x2 = [pos + 0.3 for pos in x]

y = role_age
y2 = role_true_age

# Canvas settings: size in inches and resolution.
plt.figure(figsize=(15, 8), dpi=100)

# NOTE(review): with matplotlib's default center alignment a negative width
# should draw the same bar as a positive one; the sign only moves the bar
# to the other side of x when align='edge' is used - confirm against the
# matplotlib bar() documentation.
plt.bar(x, y, width=-0.3, label="现实年龄", color="#509839")
plt.bar(x2, y2, width=0.3, label="实际年龄", color="#c03035")

# x ticks at 0..N; position 0 gets an empty label, 1..N the role names.
x_ = list(range(len(role_list) + 1))
x_desc = [""] + list(role_list)

# y ticks every 5 years from 0 to 45, each labelled "<value>岁".
y_ = range(0, 50, 5)
y_desc = ["{}岁".format(age) for age in y_]

# Tick positions together with their labels.
plt.xticks(x_, x_desc)
plt.yticks(y_, y_desc)

plt.xlabel("角色姓名")
plt.ylabel("年龄")
plt.title("名侦探柯南主要角色年龄(部分)")
plt.legend(loc="best")
plt.savefig("./mzt.png")
plt.show()

#################

###直方图

##################

import matplotlib.pyplot as plt
import random

# Font that can render the Chinese title.
plt.rcParams['font.family'] = ['Arial Unicode MS']

# Running-time data.
time = [131,  98, 125, 131, 124, 139, 131, 117, 128, 108, 135, 138, 131, 102, 107, 114, 119, 128, 121, 142, 127, 130, 124, 101, 110, 116, 117, 110, 128, 128, 115,  99, 136, 126, 134,  95, 138, 117, 111,78, 132, 124, 113, 150, 110, 117,  86,  95, 144, 105, 126, 130,126, 130, 126, 116, 123, 106, 112, 138, 123,  86, 101,  99, 136,123, 117, 119, 105, 137, 123, 128, 125, 104, 109, 134, 125, 127,105, 120, 107, 129, 116, 108, 132, 103, 136, 118, 102, 120, 114,105, 115, 132, 145, 119, 121, 112, 139, 125, 138, 109, 132, 134,156, 106, 117, 127, 144, 139, 139, 119, 140,  83, 110, 102,123,107, 143, 115, 136, 118, 139, 123, 112, 118, 125, 109, 119, 133,112, 114, 122, 109, 106, 123, 116, 131, 127, 115, 118, 112, 135,115, 146, 137, 116, 103, 144,  83, 123, 111, 110, 111, 100, 154,136, 100, 118, 119, 133, 134, 106, 129, 126, 110, 111, 109, 141,120, 117, 106, 149, 122, 122, 110, 118, 127, 121, 114, 125, 126,114, 140, 103, 130, 141, 117, 106, 114, 121, 114, 133, 137,  92,121, 112, 146,  97, 137, 105,  98, 117, 112,  81,  97, 139, 113,134, 106, 144, 110, 137, 137, 111, 104, 117, 100, 111, 101, 110,105, 129, 137, 112, 120, 113, 133, 112,  83,  94, 146, 133, 101,131, 116, 111,  84, 137, 115, 122, 106, 144, 109, 123, 116, 111,111, 133, 150]
max_time = max(time)
min_time = min(time)
# Spacing between x-axis ticks (used below as the slice step; it is not
# the bin width).
width = 10
# Number of bins: one bin per 3 units of the data range.
num_bins = int((max_time - min_time) / 3)

# Canvas settings: size in inches and resolution.
plt.figure(figsize=(15, 8), dpi=80)

# Draw the histogram. `normed` was removed from hist() in matplotlib 3.1;
# `density=True` is its replacement and normalizes the bars so that the
# total area is 1.
plt.hist(time, num_bins, color="#509839", density=True)

# Show one x tick every `width` units between the minimum and the maximum.
x_ = list(range(min_time, max_time + 1))
plt.xticks(x_[::width])

# Dashed, half-transparent grid.
plt.grid(True, linestyle="--", alpha=0.5)

# Chart title, then save to the current directory and display.
plt.title("Top250的IMDB电影时长统计")
plt.savefig("./IMDB.png")
plt.show()

####################

####饼图

#####################


import matplotlib.pyplot as plt
import random

# Font that can render the Chinese title (Mac).
plt.rcParams['font.family'] = ['Arial Unicode MS']
plt.figure(figsize=(15, 8), dpi=80)

# How the study time is split between languages.
pro_name = ["C++", "Python", "Java", "Go", "Swift"]
pro_time = [10, 15, 5, 3, 1]

# One colour per wedge, in the same order as `pro_name`.
wedge_colors = ["#ea6f5a", "#509839", "#0c8ac5", "#d29922", "#fdf6e3"]

# Draw the pie; every wedge is annotated with its percentage (2 decimals).
plt.pie(
    pro_time,
    labels=pro_name,
    autopct="%3.2f%%",
    colors=wedge_colors,
)

# Chart title.
plt.title("学习时间分配")

# Equal axis scaling so the pie is drawn as a circle rather than an ellipse.
plt.axis("equal")

# Legend, then save to the current directory and display.
plt.legend(loc="best")
plt.savefig("./pro_learn.png")
plt.show()

PYTHON拓展（numpy | pandas | matplotlib）

一、numpy

二、pandas

三、matplotlib

猜你喜欢