【机器学习】python有用工具函数汇总---永久追加更新

1.等差数列样本集

1.1代码

# Arithmetic progressions built with numpy.arange
import numpy

# The same progression 1, 3, 5, 7, 9 is generated twice:
# first with integer elements, then with float elements.
for dtype in (int, float):
    x = numpy.arange(start=1, stop=10, step=2, dtype=dtype)
    print(type(x))
    print(x)

1.2结果


2.聚类数据样本集

2.1代码

    详见: sklearn.datasets.make_blobs

2.2结果


3.sigmoid函数绘图

3.1代码

第一种方法:

    numpy.frompyfunc(func, nin, nout)

第二种方法:

    for e in x:

        list.append()

# -*- coding: utf-8 -*-
"""
@author: tom
Talk is cheap, show me the code
"""

import numpy
import math
import matplotlib.pyplot as plt

#sigmoid function
def sigmoid(x):
    """Return the logistic sigmoid 1 / (1 + e**-x) of the scalar x.

    The computation branches on the sign of x so that math.exp is only
    ever called with a non-positive argument. Very negative inputs
    therefore return 0.0 instead of raising OverflowError.
    """
    if x >= 0:
        return 1.0/(1.0 + math.exp(-x))
    exp_x = math.exp(x)  # x < 0 here, so exp_x lies in [0, 1) and cannot overflow
    return exp_x/(1.0 + exp_x)
#way1: element-wise sigmoid for an ndarray/list, built with numpy.frompyfunc
def sigmoid_func(X):
    """Apply the scalar ``sigmoid`` to every element of X.

    numpy.frompyfunc(sigmoid, 1, 1) wraps the one-input, one-output
    Python function so it can be called on array-like data.
    """
    return numpy.frompyfunc(sigmoid, 1, 1)(X)

# Way 1: sample x on [-10, 10) in steps of 0.1 and plot the frompyfunc result
x = numpy.arange(-10, 10, 0.1)
y = sigmoid_func(x)
plt.figure(1)
plt.subplot(211)  # upper panel of a 2-row, 1-column grid
plt.title('sigmoid function way1')
plt.plot(x, y, c='red')
plt.xlabel('X')
plt.ylabel('Y')
# No plt.show() here: the way-2 curve is drawn into subplot(212) of this same
# figure further down, and the plt.show() after it displays both panels.


#way2: element-wise sigmoid for an ndarray/list, using an explicit loop
def sigmoid_way2(X):
    """Return a list with the logistic sigmoid of every element of X.

    Args:
        X: iterable of real numbers (list, 1-D ndarray, ...).

    Returns:
        list of floats, one per element of X, in the same order.

    Each element is evaluated in a sign-split form so that math.exp only
    receives non-positive arguments; very negative elements give 0.0
    instead of raising OverflowError.
    """
    a = []
    for item in X:
        if item >= 0:
            a.append(1.0/(1.0 + math.exp(-item)))
        else:
            exp_item = math.exp(item)  # item < 0, so this cannot overflow
            a.append(exp_item/(1.0 + exp_item))
    return a
# Way 2: sample x on [-10, 10) in steps of 0.1 and evaluate it with the
# explicit-loop implementation sigmoid_way2
x = numpy.arange(-10, 10, 0.1)
y = sigmoid_way2(x)

plt.subplot(212)  # lower panel of a 2-row, 1-column grid
plt.title('sigmoid function way2')
plt.plot(x, y, c='blue')
plt.xlabel('X')
plt.ylabel('Y')
plt.show()

3.2结果



4.各个元素小数点后都留3位

4.1代码

# -*- coding: utf-8 -*-
"""
@author: tom
Talk is cheap, show me the code
"""
import numpy

def _float(x):
    return float('%.3f' % x)
def float_func(X):
    """Apply ``_float`` to every element of X through a numpy.frompyfunc wrapper (1 input, 1 output)."""
    return numpy.frompyfunc(_float, 1, 1)(X)

if __name__=='__main__':
    # Demo: trim every element of a plain list to three decimals and print the result
    x = [1.51111111, 2.511111, 3.51111, 4.51111]
    y = float_func(x)
    print(y)

4.2结果


5.每个元素依次开根号

5.1代码

# -*- coding: utf-8 -*-
"""
@author: tom
Talk is cheap, show me the code
"""

import numpy
import math
import matplotlib.pyplot as plt

def _sqrt(x):
    return math.sqrt(x)
#element-wise square root for an array/list
def sqrt_func(X):
    """Apply ``_sqrt`` to every element of X through a numpy.frompyfunc wrapper (1 input, 1 output)."""
    return numpy.frompyfunc(_sqrt, 1, 1)(X)

def _float(x):
    return float('%.3f' % x)
def float_func(X):
    """Element-wise version of ``_float``: wraps it with numpy.frompyfunc (1 input, 1 output) and calls it on X."""
    return numpy.frompyfunc(_float, 1, 1)(X)
    
if __name__=='__main__':
    # Square-root every sample, then keep three decimals so the labels stay short
    x = [1.5, 2, 2.5, 3]
    y = float_func(sqrt_func(x))

    plt.figure(1, figsize=(8,6))
    plt.title('y=sqrt(x)')
    plt.xlabel('x')
    plt.ylabel('y')
    plt.scatter(x, y)
    # Annotate each point with its own coordinates, offset by (-20, 10) points
    for px, py in zip(x, y):
        plt.annotate("(%s, %s)" % (px, py), xy=(px, py), xytext=(-20, 10), textcoords='offset points')
    plt.show()

5.2结果


绘图函数参考:https://blog.csdn.net/ialexanderi/article/details/74567336

6.用字典来统计每个元素的次数

6.1代码

# -*- coding: utf-8 -*-
"""
@author: Tom
Talk is cheap, show me the code
Aim:用字典对总统候选的投票进行统计
"""

import numpy as np

#Build the label-frequency dictionary; each entry has the form "A": count
def counter(X):
    """Count how many times each element occurs in X.

    Args:
        X: any iterable of hashable items (list, tuple, string, generator, ...).

    Returns:
        dict mapping every distinct item to its number of occurrences.
    """
    counter_dict = {}
    # Iterate the items directly instead of indexing through range(len(X)),
    # so iterables without len()/indexing (e.g. generators) work as well.
    for key in X:
        counter_dict[key] = counter_dict.get(key, 0) + 1
    return counter_dict
            
if __name__=='__main__':
    # Ballots of the presidential election: one candidate label per vote
    label_list = ['A','A','B','B','C','A','B','B','A','A','A','C','C']

    # Tally the votes received by each candidate
    counter_dict = counter(label_list)
    print(counter_dict)
    

6.2结果


7.用字典找出出现次数最多者

7.1代码

# -*- coding: utf-8 -*-
"""
@author: Tom
Talk is cheap, show me the code
Aim:找到数列中次数最多的一个数,请使用python的字典完成
"""

import numpy as np

#Build the label-frequency dictionary; each entry has the form "A": count
def counter(X):
    """Count how many times each element occurs in X.

    Args:
        X: any iterable of hashable items (list, tuple, generator, ...).

    Returns:
        A plain dict mapping every distinct item to its number of
        occurrences, with keys in order of first appearance.
    """
    # Function-scope import keeps this snippet self-contained.
    from collections import Counter

    # Counter consumes any iterable, so inputs without len()/indexing
    # (e.g. generators) are accepted; dict() restores the plain-dict return type.
    return dict(Counter(X))

def max_cnt_label(counter_dict):
    """Return the key of counter_dict with the largest count.

    Only counts greater than 0 can win. On a tie the key met first during
    iteration is kept. An empty dict (or one with no positive count)
    yields the empty string ''.
    """
    best_label, best_count = '', 0
    for label in counter_dict:
        count = counter_dict[label]
        if count > best_count:
            best_label, best_count = label, count
    return best_label
            
if __name__=='__main__':
    # Ballots of the presidential election: one candidate label per vote
    label_list = ['A','A','B','B','C','A','B','B','A','A','A','C','C']

    # Tally the votes received by each candidate
    counter_dict = counter(label_list)
    print(counter_dict)
    
    # Pick the winner, i.e. the candidate with the most votes
    winner = max_cnt_label(counter_dict)
    print(winner)

7.2结果


8.numpy.tile(A, reps):以A为扩展单元进行行/列扩展

有一个测试数据点(1,2),需要和n个样本进行同样的一个数学表达式操作。

这时候需要生成n样本的测试数据集,每个测试数据都是(1,2),这个需求可以用numpy.tile()函数来完成~~

8.1代码

# -*- coding: utf-8 -*-
"""
@author: Tom
Talk is cheap, show me the code
Aim:numpy.tile(A, reps):以A为扩展单元进行行/列扩展
"""

import numpy as np

if __name__=='__main__':
    #case 1: an integer reps only adds columns
    x1_list = [1,2]
    y1_array = np.tile(A=x1_list, reps=2)
    print(x1_list)
    print(y1_array)

    x2_array = np.array([1,2,3])
    y2_array = np.tile(A=x2_array, reps=2)
    print(x2_array)
    print(y2_array)

    # np.asmatrix is used instead of np.mat: the np.mat alias was removed in
    # NumPy 2.0, while np.asmatrix exists in both old and new releases.
    x3_m = np.asmatrix([1,2])
    y3_m = np.tile(A=x3_m, reps=2)
    print(x3_m)
    print(y3_m)

    #case 2: reps=(5,1) only adds rows (5 copies of the row, columns unchanged)
    x4_list = [1,2]
    y4_array = np.tile(A=x4_list, reps=(5,1))
    print(y4_array)

    #case 3: reps=(5,2) adds rows and columns at the same time
    x5_list = [1,2]
    y5_array = np.tile(A=x5_list, reps=(5,2))
    # Print the result like the other cases do (it was computed but never shown)
    print(y5_array)
    

8.2结果

开始的需求,就是上面的y4_array,详细结果如下:


9.从txt文件提取data和feature

源数据文件.txt,内容如下:

40920	8.326976	0.953952	3
14488	7.153469	1.673904	2
26052	1.441871	0.805124	1
75136	13.147394	0.428964	1
38344	1.669788	0.134296	1
72993	10.141740	1.032955	1

9.1代码

# -*- coding: utf-8 -*-
"""
@author: Tom
Talk is cheap, show me the code
"""

import numpy as np

# Path of the sample data file that the __main__ section passes to both loaders
file_name = 'D:\\tom\\data\\file_2_matrix.txt'

'''
文件每行内部的数值之间用tab键隔开,如下所示,3个特征数值,1个标签数值:
40920	8.326976  0.953952	3
14488	7.153469  1.673904	2
26052	1.441871  0.805124	1
75136	13.147394 0.428964	1
38344	1.669788  0.134296	1
72993	10.141740 1.032955	1
'''
def file2matrix_way1(file_name, n_features, split_str):
    """Load a delimited text file into a feature matrix and a label list.

    This variant counts the lines first and then reads them one at a time
    with readline().

    Args:
        file_name: path of the text file, one sample per line.
        n_features: number of leading columns stored as numeric features.
        split_str: separator between the columns of a line (e.g. a tab).

    Returns:
        (ret_data_a, ret_label_list): a float ndarray of shape
        (number of lines, n_features) and a list holding the last column
        of every line as a string.

    Raises:
        ValueError: if a feature field cannot be parsed as a float.
    """
    # First pass: count the lines so the result matrix can be pre-allocated.
    with open(file_name) as f:
        n_datas = len(f.readlines())
    ret_data_a = np.zeros(shape=(n_datas, n_features))
    ret_label_list = []
    # Second pass: the with-block guarantees the handle is closed afterwards.
    with open(file_name) as f:
        for data_n in range(n_datas):
            line_list = f.readline().strip().split(split_str)
            # Slice by n_features (not a hard-coded 3) so the argument is honoured.
            ret_data_a[data_n, :] = [float(v) for v in line_list[:n_features]]
            ret_label_list.append(line_list[-1])
    return ret_data_a, ret_label_list

def file2matrix_way2(file_name, n_features, split_str):
    """Load a delimited text file into a feature matrix and a label list.

    This variant reads all lines once with readlines() and iterates over them.

    Args:
        file_name: path of the text file, one sample per line.
        n_features: number of leading columns stored as numeric features.
        split_str: separator between the columns of a line (e.g. a tab).

    Returns:
        (ret_data_a, ret_label_list): a float ndarray of shape
        (number of lines, n_features) and a list holding the last column
        of every line as a string.

    Raises:
        ValueError: if a feature field cannot be parsed as a float.
    """
    # A single read gives both the line count and the content; the
    # with-block closes the handle even if an error occurs.
    with open(file_name) as f:
        line_texts = f.readlines()
    ret_data_a = np.zeros(shape=(len(line_texts), n_features))
    ret_label_list = []
    for line_n, line_text in enumerate(line_texts):
        line_list = line_text.strip().split(split_str)
        # Slice by n_features (not a hard-coded 3) so the argument is honoured.
        ret_data_a[line_n, :] = [float(v) for v in line_list[:n_features]]
        ret_label_list.append(line_list[-1])
    return ret_data_a, ret_label_list

if __name__=='__main__':
    # Parse the same tab-separated file with both implementations (3 feature columns each)
    data_a_way1, label_list_way1 = file2matrix_way1(file_name, n_features=3, split_str='\t')
    data_a_way2, label_list_way2 = file2matrix_way2(file_name, n_features=3, split_str='\t')

9.2结果


10.将img.txt表示的图像,转储为array

源文件img.txt:

00000000000001100000000000000000
00000000000011111100000000000000
00000000000111111111000000000000
00000000011111111111000000000000
00000001111111111111100000000000
00000000111111100011110000000000
00000001111110000001110000000000
00000001111110000001110000000000
00000011111100000001110000000000
00000011111100000001111000000000
00000011111100000000011100000000
00000011111100000000011100000000
00000011111000000000001110000000
00000011111000000000001110000000
00000001111100000000000111000000
00000001111100000000000111000000
00000001111100000000000111000000
00000011111000000000000111000000
00000011111000000000000111000000
00000000111100000000000011100000
00000000111100000000000111100000
00000000111100000000000111100000
00000000111100000000001111100000
00000000011110000000000111110000
00000000011111000000001111100000
00000000011111000000011111100000
00000000011111000000111111000000
00000000011111100011111111000000
00000000000111111111111110000000
00000000000111111111111100000000
00000000000011111111110000000000
00000000000000111110000000000000

10.1代码

# -*- coding: utf-8 -*-
"""
@author: Tom
Talk is cheap, show me the code
Aim:将.txt表示的图像,转储为array
"""

import numpy as np

# Path of the text-encoded image that the __main__ section passes to img_2_array
file_name = 'D:\\tom\\data\\img.txt'

def img_2_array(file_name):
    """Flatten a text image of digit characters into a (1, 1024) array.

    The lines of the file are concatenated in order (line terminators
    removed) and each character is converted with int() into consecutive
    cells of the result. Cells beyond the end of the text stay 0.

    Args:
        file_name: path of the text file, e.g. 32 lines of 32 digits.

    Returns:
        float ndarray of shape (1, 1024).

    Raises:
        ValueError: if a character is not a digit.
        IndexError: if the file holds more than 1024 characters.
    """
    ret_img_a = np.zeros((1,1024)) #32 * 32 = 1024
    # The with-block closes the file, which was previously left open.
    with open(file_name) as f:
        # Strip only the line terminator instead of assuming every line ends
        # with '\n'; a last line without a trailing newline keeps its final pixel.
        pixels = ''.join(line_str.rstrip('\r\n') for line_str in f)
    for cnt, pixel in enumerate(pixels):
        ret_img_a[0, cnt] = int(pixel)
    return ret_img_a

if __name__=='__main__':
    # Convert the text image at file_name into its flattened array form
    img_a = img_2_array(file_name)
       

10.2结果


11.计算一个随机变量的香农熵

一个随机变量的香农熵基本概念如下:

熵是表示随机变量不确定性的度量,熵值越大,随机变量的不确定性也越大。

设X是一个取有限个值的离散随机变量,其概率分布为:

$$P(X = x_i) = p_i,\quad i = 1, 2, \dots, n$$

则随机变量X的熵定义为:

$$H(X) = -\sum_{i=1}^{n} p_i \log p_i$$

其中 $p_i = P(X = x_i)$,并约定 $0 \log 0 = 0$(下面的代码取自然对数)。

11.1代码

# -*- coding: utf-8 -*-
"""
@author: 蔚蓝的天空Tom
Talk is cheap, show me the code
Aim:计算香农熵,支持向量化计算
"""

import numpy as np
import math

#math.log(x) 就是数学表达式ln(x),底数为e的对数
#math.log10(x) 就是底数为10的对数
#香农熵越大,随机变量的不确定性就越大

#-p*log(p) for a single probability value
def shannon_entropy_ele(p):
    """Return the entropy term -p*ln(p) of one probability p.

    p == 0 returns 0.0, following the convention 0*log(0) = 0, instead of
    letting math.log(0) raise ValueError. For any other p the logarithm
    and the resulting term are printed before the term is returned.
    """
    if p == 0:
        return 0.0
    log_p = math.log(p)  # natural logarithm, computed once and reused
    term = (-1) * p * log_p
    print('\nlog',p,'=', log_p)
    print('-p*log(p)=', term)
    return term
#element-wise (vectorised) version of -p*log(p)
def shannon_entropy_ele_func(P):
    """Apply ``shannon_entropy_ele`` to every element of P through a numpy.frompyfunc wrapper (1 input, 1 output)."""
    return np.frompyfunc(shannon_entropy_ele, 1, 1)(P)
#Shannon entropy of a whole probability distribution
def shannon_entropy_func(P):
    """Return the sum of the -p*log(p) terms computed element-wise from P."""
    return sum(shannon_entropy_ele_func(P))

if __name__=='__main__':
    # Values of the random variable (not used in the computation below)
    X = [1,2,3,4]
    # Probability of each value of the random variable
    P = [0.1, 0.2, 0.3, 0.4]
    # Compute the entropy of the random variable
    shannon_entropy = shannon_entropy_func(P)

11.2运行日志结果

runfile('C:/Users/Administrator/calc_shannonEntropy.py', wdir='C:/Users/Administrator')

log 0.1 = -2.3025850929940455
-p*log(p)= 0.23025850929940456

log 0.2 = -1.6094379124341003
-p*log(p)= 0.3218875824868201

log 0.3 = -1.2039728043259361
-p*log(p)= 0.3611918412977808

log 0.4 = -0.916290731874155
-p*log(p)= 0.366516292749662

11.3中间变量值结果


12.两个随机变量x、y的复合熵

关于复合熵的概念可以参看:http://www.cnblogs.com/xiedan/archive/2010/04/03/1703722.html

两个随机变量x、y的复合熵(联合熵)公式为:

$$H(X, Y) = -\sum_{i}\sum_{j} p(i, j) \log p(i, j)$$

其中p(i,j)是x_i和y_j的联合概率

12.1代码

# -*- coding: utf-8 -*-
"""
@author: 蔚蓝的天空Tom
Talk is cheap, show me the code
Aim:计算两个随机变量x, y的复合熵
"""

import numpy as np
import math

#-p*log(p) for a single joint probability
def composite_entropy_ele(p):
    """Return the entropy term -p*ln(p) of one probability p.

    p == 0 returns 0.0, following the convention 0*log(0) = 0, so a joint
    probability table with impossible outcomes does not make math.log(0)
    raise ValueError.
    """
    if p == 0:
        return 0.0
    return (-1) * p * math.log(p)

#element-wise (vectorised) version of -p*log(p)
def composite_entropy_ele_func(P):
    """Apply ``composite_entropy_ele`` to every element of P through a numpy.frompyfunc wrapper (1 input, 1 output)."""
    return np.frompyfunc(composite_entropy_ele, 1, 1)(P)

#-sum_i sum_j p*log(p): composite (joint) entropy of a joint probability table
def composite_entropy_func(P):
    """Return the composite (joint) entropy: the sum of -p*ln(p) over every cell of P.

    This is the joint entropy of the table, not a conditional entropy.

    Example: for the 3x3 table with eight cells of 0.1 and one cell of 0.2,
    the per-cell terms are 0.23025850929940456 (for 0.1) and
    0.3218875824868201 (for 0.2), and the total is about 2.1639556568820564.

    Args:
        P: joint probability table (nested list or ndarray); a single
            probability is accepted as well.

    Returns:
        The sum of the element-wise -p*ln(p) terms.
    """
    ele_array = composite_entropy_ele_func(P)
    # np.sum adds up all cells of the array and, unlike the .sum() method,
    # also works when P was a single scalar.
    composite_entropy = np.sum(ele_array)
    return composite_entropy

if __name__=='__main__':
    # Joint probability table p(x, y) of two discrete random variables x and y (3 rows, 3 columns)
    P = [[0.1, 0.1, 0.1], 
         [0.1, 0.1, 0.1], 
         [0.1, 0.1, 0.2]]
    
    # Compute the composite entropy of x and y from the joint distribution P(x, y)
    composite_entropy = composite_entropy_func(P)

12.2结果


enjoy it~

(end)

猜你喜欢

转载自blog.csdn.net/u012421852/article/details/79734627
今日推荐