# 一、numpy
##########numpy提供了进行数组和矩阵的运算的方法
import numpy as np
# Build a 1-D ndarray from a plain Python list.
a = [1, 2, 3, 4]
b = np.array(a)

# Basic ndarray attributes (bare expressions: only displayed when run
# interactively, e.g. as the last line of a notebook cell).
b.size   # total number of elements
b.shape  # length along each dimension
b.ndim   # number of dimensions
b.dtype  # element type

# 10x10 array filled with ones.
array_one = np.ones((10, 10))
array_one
# 10x10 array filled with zeros.
array_zero = np.zeros((10, 10))
array_zero

# np.array() copies its input by default, so `ones` owns independent data.
ones = np.array(array_one, copy=True)
ones
# np.asarray() does NOT copy when the input is already an ndarray:
# `one2` is the very same object as `array_one`.
one2 = np.asarray(array_one)
one2
# --- Creating random arrays ---
np.random.rand(10, 10)       # 10x10 array of floats drawn from [0, 1)
np.random.randint(0, 100)    # one random integer from the given range
np.random.uniform(0, 100)    # one random float from the given range
# Normal distribution: `loc` is the mean, `scale` is the standard deviation
# (not the variance) and `size` is the output shape.
np.random.normal(loc=1.75, scale=0.1, size=(2, 3))

arr = np.random.normal(loc=1.75, scale=0.1, size=(4, 5))
print(arr)
# 2-D slicing: rows 1-2 and columns 2-3.
after_arr = arr[1:3, 2:4]
print(after_arr)

one_20 = np.ones(20)
print(one_20)
# reshape changes the dimensions while keeping the same 20 elements.
after_one20 = one_20.reshape((4, 5))
print(after_one20)
# --- NumPy computations (important) ---
# One row per student, one column per score.
stus_score = np.array([
    [80, 88],
    [82, 81],
    [84, 75],
    [86, 83],
    [75, 81],
])
print(stus_score)

# Conditional operations.
stus_score > 80                      # element-wise boolean mask
np.where(stus_score < 80, 0, 90)     # 0 where the score is below 80, else 90

# Statistics along an axis: axis=0 reduces over rows (one value per column),
# axis=1 reduces over columns (one value per row).
result = stus_score.max(axis=0)
print(result)
result = stus_score.max(axis=1)
print(result)
print(stus_score.mean(axis=1))
print(stus_score.min(axis=1))
print(stus_score.std(axis=0))
# --- Array arithmetic ---
stus_score = np.array([[80, 88], [82, 81], [84, 75], [86, 83], [75, 81]])
print(stus_score)

# Addition: add 5 points to every value in the first column.
stus_score[:, 0] += 5
print(stus_score)

# Multiplication: halve the first column.
# The array was created with an integer dtype, and writing float results back
# into an integer array silently truncates them (42.5 would become 42), so
# convert to float before scaling.
stus_score = stus_score.astype(float)
stus_score[:, 0] *= 0.5
print(stus_score)
# --- Matrix operations (important) ---
stus_score = np.array([
    [80, 88],
    [82, 81],
    [84, 75],
    [86, 83],
    [75, 81],
])
# Column vector of weights: 40% for the first score, 60% for the second.
q = np.array([[0.4], [0.6]])
# (5, 2) dot (2, 1) -> (5, 1): one weighted score per student.
result = stus_score.dot(q)
print(result)
# Two 2x6 nested lists used to demonstrate stacking.
print("v1为:")
v1 = [list(range(0, 6)), list(range(6, 12))]
print(v1)
print("v2为:")
v2 = [list(range(12, 18)), list(range(18, 24))]
print(v2)

# Vertical stacking: rows of v2 are appended below v1 -> shape (4, 6).
result = np.vstack([v1, v2])
print(result)
# Horizontal stacking: columns of v2 are appended to the right -> shape (2, 12).
print(np.hstack([v1, v2]))
# Load comma-separated data from a CSV file into an ndarray.
result = np.genfromtxt(fname='2.csv', delimiter=',')
print(result)
# 二、pandas
######Pandas是基于Numpy开发出的,专门用于数据分析的开源Python库
import pandas as pd
import numpy as np
# --- The two core pandas data structures ---

# Series: one-dimensional labelled data.
pd.Series(data=np.arange(4, 10))  # built from a NumPy array
# Built from a Python list; index labels are allowed to repeat.
pd.Series(data=[10, 20, 30], index=['beijing', 'beijing', 'shanghai'])
# Built from a Python dict: keys become the index.
pd.Series(data={'haha': 10, 'keke': 20, 'kk': 70})

# DataFrame: tabular data with both a row index and a column index.
data34 = pd.DataFrame(np.arange(10, 22).reshape((3, 4)))
print(data34)
print(data34.iloc[0:1])   # slice out the first row
print(data34.loc[:, 0])   # select the first column (label 0)
# --- DataFrame attributes and reading data ---
# `error_bad_lines` was deprecated in pandas 1.3 and removed in pandas 2.0.
# `on_bad_lines='warn'` is the replacement for the old
# `error_bad_lines=False` usage: malformed rows are skipped and a warning is
# emitted for each of them instead of raising.
result = pd.read_csv('222.csv', on_bad_lines='warn')
print(result)
# Other attributes worth inspecting interactively:
#   result.shape, result.dtypes, result.ndim, result.index, result.columns
result.values  # the underlying data as a NumPy array

# --- Whole-table queries ---
print(result.head(5))              # first five rows
print(result.tail(5))              # last five rows
print(result['age'].describe())    # summary statistics of the 'age' column
print(result['name'][0:5])         # first five entries of the 'name' column
print(type(result))
print(result[result['age'] > 30])  # boolean-mask filtering

# --- Basic analysis ---
# These are bare expressions: their values are only displayed when run
# interactively (e.g. as the last line of a notebook cell).
result.sort_values(by='age', ascending=False)   # sort by age, descending
result[result['age'] == result['age'].max()]    # row(s) with the maximum age
result[result['age'] == result['age'].min()]    # row(s) with the minimum age
result['age'].mean()                            # average age
# --- Handling missing data ---
# `error_bad_lines=False` no longer exists in pandas 2.0; `on_bad_lines='warn'`
# skips malformed rows and emits a warning for each one.
result = pd.read_csv('222.csv', on_bad_lines='warn')
print(result)

# dropna() and fillna() return NEW objects and leave `result` untouched.
# Their return values must be captured, otherwise the print that follows
# just shows the unchanged data again.
without_na = result.dropna()
print(without_na)

# Fill missing ages with 0 (only the 'age' column is filled).
age_filled = result.assign(age=result['age'].fillna(0))
print(age_filled)

# Finally keep only the rows without any missing value.
result = result.dropna()
print(result)
# Mini case study: breast-cancer data preprocessing.
# Fetch the data online, name the columns explicitly via `names`, then replace
# the dataset's '?' missing-value marker with the standard np.nan.
bcw = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data",
    names=[
        "Sample code number",
        "Clump Thickness",
        "Uniformity of Cell Size",
        "Uniformity of Cell Shape",
        "Marginal Adhesion",
        "Single Epithelial Cell Size",
        "Bare Nuclei",
        "Bland Chromatin",
        "Normal Nucleoli",
        "Mitoses",
        "Class:",
    ],
)
print(bcw.head(50))
# replace() returns a new DataFrame; the result must be assigned back,
# otherwise the '?' markers stay in `bcw`.
bcw = bcw.replace(to_replace='?', value=np.nan)
# Mini case study: converting timestamps.
# Only the first 10 rows of the file are loaded.
testtime = pd.read_csv('time.csv', nrows=10)
testtime

# Interpret the 'time' column as seconds since the Unix epoch.
testtime['time'] = pd.to_datetime(testtime['time'], unit='s')
testtime

# Split the timestamp into separate calendar features.
timestamps = testtime['time'].dt
testtime['year'] = timestamps.year
testtime['month'] = timestamps.month
# NOTE(review): the column is called 'day' but stores the weekday number
# (Monday=0), not the day of the month - confirm this is intended.
testtime['day'] = timestamps.weekday
testtime
# --- Merging tables ---
user = pd.read_csv('csv/user.csv')
order = pd.read_csv('csv/order.csv')
goods = pd.read_csv('csv/goods.csv')
user
order
goods

# Left joins, similar to a relational JOIN in SQL. The join key only needs to
# be named once (the original listed the same column twice).
uo = pd.merge(user, order, how='left', on='user_id')
uo
uog = pd.merge(uo, goods, how='left', on='goods_name')
uog

# Cross-tabulation: how many times each user ("姓名") appears with each goods_name.
user_goods = pd.crosstab(uog["姓名"], uog["goods_name"])
user_goods

# --- Grouping and aggregation (important) ---
# Count the non-null values of every remaining column per (age, user_id) group.
uog.groupby(['age', 'user_id']).count()
# 三、matplotlib
###############
#折线图
###############
import matplotlib.pyplot as plt
import random
# 保证生成的图片在浏览器内显示
%matplotlib inline
# Select a font able to render the Chinese labels (original note: for Mac).
plt.rcParams['font.family'] = ['Arial Unicode MS']

# Simulated temperature over one day in Hainan:
# x is the 24 hours, y is a random integer temperature in [15, 25].
hainan_x = list(range(24))
hainan_y = [random.randint(15, 25) for _ in range(24)]

# Create the canvas, then draw the Hainan series on it.
plt.figure(figsize=(10, 8), dpi=100)
plt.plot(hainan_x, hainan_y, label="海南")

# Simulated temperature over one day in Beijing: random integers in [5, 10].
beijing_x = list(range(24))
beijing_y = [random.randint(5, 10) for _ in range(24)]
plt.plot(beijing_x, beijing_y, label="北京")

# Simulated temperature over one day in Hebei: random integers in [1, 5].
hebei_x = beijing_x
hebei_y = [random.randint(1, 5) for _ in range(24)]
# Custom line styling: color, linestyle ("-", "--", "-.", ":"),
# linewidth and alpha (transparency).
plt.plot(
    hebei_x,
    hebei_y,
    label="河北",
    color="#823384",
    linestyle=":",
    linewidth=3,
    alpha=0.3,
)

# Axis ticks: one tick per hour on x, labelled "<hour>时".
x_ = list(range(24))
x_desc = list(map("{}时".format, x_))
plt.xticks(x_, x_desc)

# y ticks every 2 degrees from 0 to 28, labelled "<value>℃".
y_ = list(range(0, 30, 2))
y_desc = list(map("{}℃".format, y_))
plt.yticks(y_, y_desc)

# Axis names, title and legend.
plt.xlabel("时间")
plt.ylabel("温度")
plt.title("一天内温度的变化")
plt.legend(loc="best")

# Save the figure into the current directory, then display it.
plt.savefig("./t.png")
plt.show()
################
###条形图
#################
import matplotlib.pyplot as plt
import random
# 保证生成的图片在浏览器内显示
%matplotlib inline
# Select a font able to render the Chinese labels (original note: for Mac).
plt.rcParams['font.family'] = ['Arial Unicode MS']

# Bar chart of the ages of the main Detective Conan characters.
role_list = [
    "柯南", "毛利兰", "灰原哀", "琴酒",
    "贝尔摩德", "伏特加", "赤井秀一", "目暮十三",
]
role_age = [7, 17, 7, 34, 32, 30, 27, 46]
# The characters' real ages (variable name kept as in the original, sic).
role_ture_age = [18, 17, 18, 34, 45, 30, 27, 46]

# Bar positions: the second series sits 0.3 to the right of the first.
x = list(range(1, len(role_list) + 1))
x2 = [pos + 0.3 for pos in x]
y = role_age
y2 = role_ture_age

# Create the canvas.
plt.figure(figsize=(15, 8), dpi=100)

# NOTE(review): the original comment says the sign of `width` picks the side
# of x the bar extends to; that presumably only holds with align="edge",
# which is not passed here - the two series are separated by the x2 offset.
plt.bar(x, y, width=-0.3, label="现实年龄", color="#509839")
plt.bar(x2, y2, width=0.3, label="实际年龄", color="#c03035")

# x ticks 0..len(role_list); tick 0 gets an empty label, the rest the names.
x_ = list(range(len(role_list) + 1))
x_desc = ["", *map("{}".format, role_list)]

# y ticks every 5 years from 0 to 45, labelled "<value>岁".
y_ = range(0, 50, 5)
y_desc = ["{}岁".format(age) for age in y_]

# Tick positions and their labels.
plt.xticks(x_, x_desc)
plt.yticks(y_, y_desc)

plt.xlabel("角色姓名")
plt.ylabel("年龄")
plt.title("名侦探柯南主要角色年龄(部分)")
plt.legend(loc="best")
plt.savefig("./mzt.png")
plt.show()
#################
###直方图
##################
import matplotlib.pyplot as plt
import random
# Select a font able to render the Chinese title.
plt.rcParams['font.family'] = ['Arial Unicode MS']

# Running-time data.
time = [131, 98, 125, 131, 124, 139, 131, 117, 128, 108, 135, 138, 131, 102, 107, 114, 119, 128, 121, 142, 127, 130, 124, 101, 110, 116, 117, 110, 128, 128, 115, 99, 136, 126, 134, 95, 138, 117, 111,78, 132, 124, 113, 150, 110, 117, 86, 95, 144, 105, 126, 130,126, 130, 126, 116, 123, 106, 112, 138, 123, 86, 101, 99, 136,123, 117, 119, 105, 137, 123, 128, 125, 104, 109, 134, 125, 127,105, 120, 107, 129, 116, 108, 132, 103, 136, 118, 102, 120, 114,105, 115, 132, 145, 119, 121, 112, 139, 125, 138, 109, 132, 134,156, 106, 117, 127, 144, 139, 139, 119, 140, 83, 110, 102,123,107, 143, 115, 136, 118, 139, 123, 112, 118, 125, 109, 119, 133,112, 114, 122, 109, 106, 123, 116, 131, 127, 115, 118, 112, 135,115, 146, 137, 116, 103, 144, 83, 123, 111, 110, 111, 100, 154,136, 100, 118, 119, 133, 134, 106, 129, 126, 110, 111, 109, 141,120, 117, 106, 149, 122, 122, 110, 118, 127, 121, 114, 125, 126,114, 140, 103, 130, 141, 117, 106, 114, 121, 114, 133, 137, 92,121, 112, 146, 97, 137, 105, 98, 117, 112, 81, 97, 139, 113,134, 106, 144, 110, 137, 137, 111, 104, 117, 100, 111, 101, 110,105, 129, 137, 112, 120, 113, 133, 112, 83, 94, 146, 133, 101,131, 116, 111, 84, 137, 115, 122, 106, 144, 109, 123, 116, 111,111, 133, 150]
max_time = max(time)
min_time = min(time)

# Spacing between the x-axis ticks drawn below.
# NOTE(review): the original comment calls this the bin width, but the number
# of bins is derived from a hard-coded 3, not from `width` - confirm intent.
width = 10
# Number of bins: one bin per 3 units of the data range.
num_bins = int((max_time - min_time) / 3)

# Create the canvas.
plt.figure(figsize=(15, 8), dpi=80)

# Draw the histogram, normalized so that the bar areas sum to 1.
# The `normed` keyword was removed in Matplotlib 3.1; `density=True` is its
# replacement.
plt.hist(time, num_bins, color="#509839", density=True)

# One x tick every `width` units, starting at the minimum value.
x_ = list(range(min_time, max_time + 1))
plt.xticks(x_[::width])

# Dashed, half-transparent grid.
plt.grid(True, linestyle="--", alpha=0.5)

# Title, save to the current directory, display.
plt.title("Top250的IMDB电影时长统计")
plt.savefig("./IMDB.png")
plt.show()
####################
####饼图
#####################
import matplotlib.pyplot as plt
import random
# Select a font able to render the Chinese title (original note: for Mac).
plt.rcParams['font.family'] = ['Arial Unicode MS']

# Study-time allocation: one wedge per language.
pro_name = ["C++", "Python", "Java", "Go", "Swift"]
pro_time = [10, 15, 5, 3, 1]

# Create the canvas.
plt.figure(figsize=(15, 8), dpi=80)

# Draw the pie; each wedge is annotated with its percentage (two decimals).
plt.pie(
    pro_time,
    labels=pro_name,
    autopct="%3.2f%%",
    colors=["#ea6f5a", "#509839", "#0c8ac5", "#d29922", "#fdf6e3"],
)

plt.title("学习时间分配")
# Equal axis scaling keeps the pie a true circle.
plt.axis("equal")
plt.legend(loc="best")
plt.savefig("./pro_learn.png")
plt.show()