Eigen, NumCpp, NumPy, CuPy计算时间成本对比

对于矩阵运算，著名的C++库有Eigen与C++版本的Numpy——NumCpp，Python库有NumPy与带GPU加速的Numpy版本CuPy。博客园|Eigen vs Numpy时间对比中对Eigen与Numpy的效率进行了比较，但是比较时忽略了C++的编译模式[DeBug与Release]对结果的影响，对比出的结果是不够准确的。本篇对Eigen、NumCpp、NumPy以及CuPy四种主流矩阵运算库的计算效率进行比较，以备实战时作为考量的一点依据。

文章目录

一、测试项目

本博客主要测试四种库对静态矩阵与动态矩阵运算的效率，主要分为两个项目：一）静态矩阵计算；二）动态矩阵计算。

二、静态矩阵计算效率测试

2.1 测试数据与函数

假定有向量集 $D=\{\vec{d}_{1}\in R^{M\times 1},...,\vec{d}_{N}\in R^{M\times 1}\}$ 与向量 $\vec{x}\in R^{M\times 1}$ ，测试计算以下函数为：

$y_{i}=\exp(\frac{-||\vec{x}-\vec{d}_{i}||^{2}}{2\delta^2})，i=1,2,...,2000$

测试数据总共分成三类：
1）小型矩阵与向量计算， $N = M = 20$ ；
2）中型矩阵与向量计算， $N = M = 200$ ;
3) 大型矩阵与向量计算， $N = M = 2000$ 。

2.2 测试程序

double型

#include <iostream>
#include <vector>
#include <fstream>
#include <cmath>
#include <sys/timeb.h>
#include "iomanip"
#include <Eigen/Dense>

#include "NumCpp.hpp"
#include "boost/filesystem.hpp"

using namespace Eigen;
using namespace std;

int M = 2000;  // 20 200 2000
int N = 2000;  // 20 200 2000

Eigen::RowVectorXd construct_Phi2eigen(Eigen::Vector2d x, Eigen::Matrix2Xd ALD_mat)
{
    
    
	double delta = 1.2;
	Eigen::Matrix2Xd dmat = ALD_mat.colwise() - x;
	Eigen::Matrix2Xd dmat2 = dmat.array().pow(2);
	Eigen::RowVectorXd dmat2sum = -dmat2.colwise().sum()/(2.0*delta*delta);
	Eigen::RowVectorXd y = dmat2sum.array().exp();
	return y;
}

nc::NdArray<double> contruct_Phi2numcpp(nc::NdArray<double> x, nc::NdArray<double> ALD_buf)
{
    
    
	double delta = 1.2;
	nc::NdArray<double> c = ALD_buf - nc::tile(x, {
    
    ALD_buf.shape().rows , 1});
	nc::NdArray<double> e = nc::exp(-nc::sum(nc::power(c, 2), nc::Axis::COL)/(2.0*delta*delta));	
	return e;
}

int main()
{
    
    
	Eigen::Matrix2Xd X_mat = Eigen::Matrix2Xd::Random(M, N);
	Eigen::Vector2d x = Eigen::Vector2d::Random(M, 1);	
	struct timeb startTime, endTime;
	
	std::cout<<"test the time cost for Eigen with matrix"<<std::endl;
	ftime(&startTime);  
	for (size_t i = 0; i < 10000; i++)
	{
    
    
		auto phi2 = construct_Phi2eigen(x, X_mat);
	}	
	ftime(&endTime);
	std::cout << "time cost is：" << (endTime.time-startTime.time)*1000 + (endTime.millitm - startTime.millitm) << "ms" << std::endl;


	
	nc::NdArray<double> D_arr = nc::random::rand<double>({
    
    N, M});
	nc::NdArray<double> x1 = nc::random::rand<double>({
    
    1, M});


	std::cout<<"test the time cost for numcpp"<<std::endl;
	ftime(&startTime);  
	for (size_t i = 0; i < 10000; i++)
	{
    
    
		auto phi3 = contruct_Phi2numcpp(x1, D_arr);
		// std::cout<<phi3<<std::endl;
	}
	ftime(&endTime);
	std::cout << "time cost is：" << (endTime.time-startTime.time)*1000 + (endTime.millitm - startTime.millitm) << "ms" << std::endl;
}

float型

#include <iostream>
#include <vector>
#include <fstream>
#include <cmath>
#include <sys/timeb.h>
#include"iomanip"
#include <Eigen/Dense>

#include"NumCpp.hpp"
#include"boost/filesystem.hpp"

using namespace Eigen;
using namespace std;

int M = 2;  // 2 20 200
int N = 2;  // 2 20 200

Eigen::RowVectorXf construct_Phi2eigen(Eigen::VectorXf x, Eigen::MatrixXf ALD_mat)
{
    
    
	float delta = 1.2;
	Eigen::MatrixXf dmat = ALD_mat.colwise() - x;
	Eigen::MatrixXf dmat2 = dmat.array().pow(2);
	Eigen::RowVectorXf dmat2sum = -dmat2.colwise().sum()/(2.0f*delta*delta);
	Eigen::RowVectorXf y = dmat2sum.array().exp();
	return y;
}

nc::NdArray<float> contruct_Phi2numcpp(nc::NdArray<float> x, nc::NdArray<float> ALD_buf)
{
    
    
	float delta = 1.2;
	nc::NdArray<float> c = ALD_buf - nc::tile(x, {
    
    ALD_buf.shape().rows , 1});
	nc::NdArray<float> e = nc::exp(-nc::sum(nc::power(c, 2), nc::Axis::COL)/(2.0f*delta*delta));	
	return e;
}

int main()
{
    
    
	Eigen::MatrixXf X_mat = Eigen::MatrixXf::Random(M, N);
	Eigen::VectorXf x = Eigen::VectorXf::Random(M, 1);

	struct timeb startTime, endTime;

	std::cout<<"test the time cost for Eigen with matrix"<<std::endl;
	ftime(&startTime);  
	for (size_t i = 0; i < 10000; i++)
	{
    
    
		auto phi2 = construct_Phi2eigen(x, X_mat);
	}	
	ftime(&endTime);
	std::cout << "time cost is：" << (endTime.time-startTime.time)*1000 + (endTime.millitm - startTime.millitm) << "ms" << std::endl;


	nc::NdArray<float> x1 = nc::random::rand<float>({
    
    1, M});
	nc::NdArray<float> D_arr = nc::random::rand<float>({
    
    N, M});


	std::cout<<"test the time cost for numcpp"<<std::endl;
	ftime(&startTime);  
	for (size_t i = 0; i < 10000; i++)
	{
    
    
		auto phi3 = contruct_Phi2numcpp(x1, D_arr);
		// std::cout<<phi3<<std::endl;
	}
	ftime(&endTime);
	std::cout << "time cost is：" << (endTime.time-startTime.time)*1000 + (endTime.millitm - startTime.millitm) << "ms" << std::endl;
}

python for numpy and cupy

import cupy as cp
import numpy as np
import time

delta = 1.2

M = 2 # 2 20 200
N = 2 # 2 20 200

x_cp = cp.random.rand(1,M)
D_arr_cp = cp.random.rand(N, M)

x_np = np.random.rand(1,M)
D_arr_np = np.random.rand(N, M)

def construct_Phi2_cp(x, ALD_buf):
    ALD_arr = cp.array(ALD_buf)
    x = cp.tile(x, (len(ALD_buf), 1))
    dX = ALD_arr - x
    return cp.exp(-cp.sum(cp.power(dX, 2), axis=1)/2*delta*delta)
        
def construct_Phi2_np(x, ALD_buf):
    ALD_arr = np.array(ALD_buf)
    x = np.tile(x, (len(ALD_buf), 1))
    dX = ALD_arr - x
    return np.exp(-np.sum(np.power(dX, 2), axis=1)/2*delta*delta)
    
    
def testtimecp():
    for i in range(1000):
        construct_Phi2_cp(x_cp, D_arr_cp)
        
        
def testtimenp():
    for i in range(1000):
        construct_Phi2_np(x_np, D_arr_np)
        
print("time cost for cupy:")        
%time testtimecp()

print("time cost for numpy")
%time testtimenp()

2.3、测试结果

矩阵大小	M=2, N=2	M=20, N=20	M=200, N=200	M=200, N=400	M=400, N=200
Eigen	f(6ms)、d(6ms)	f(22ms)、d(30ms)	f(685ms)、d(2.283s)	f(2.222s)、d(4.845s))	f(1.567s)、d(4.843s)
NumCpp	f(10ms)、d(11ms)	f(38ms)、d(32ms)	f(1.51s)、d(1.854)	f(3.035s)、d(3.619s)	f(3.255s)、d(7.278s)
NumPy	195ms	234ms	3.25s	6.32s	6.32s
CuPy	1.83s	1.75s	1.75s	1.71s	1.81s

三、动态矩阵计算测试

3.1 测试程序

float cpp

#include <iostream>
#include <vector>
#include <fstream>
#include <cmath>
#include <sys/timeb.h>
#include"iomanip"
#include <Eigen/Dense>

#include"NumCpp.hpp"
#include"boost/filesystem.hpp"

using namespace Eigen;
using namespace std;

int M = 200; // 2 20 200
int N = 200; // 2 20 200

int main()
{
    
    
	Eigen::Matrix2Xf X_mat = Eigen::Matrix2Xf::Random(M, N);
	Eigen::Vector2f x = Eigen::Vector2f::Random(M, 1);


	std::cout<<"test the time cost for Eigen with matrix"<<std::endl;
	struct timeb startTime, endTime;
	ftime(&startTime);  
	for (size_t i = 0; i < 10000; i++)
	{
    
    
		X_mat = (Eigen::MatrixXf(X_mat.rows(), X_mat.cols()+x.cols())<<X_mat, x).finished();
	}	
	ftime(&endTime);
	std::cout << "time cost is：" << (endTime.time-startTime.time)*1000 + (endTime.millitm - startTime.millitm) << "ms" << std::endl;


	nc::NdArray<float> x1 = nc::random::rand<float>({
    
    M, 1});
	nc::NdArray<float> D_arr = nc::random::rand<float>({
    
    M, N});


	std::cout<<"test the time cost for numcpp"<<std::endl;
	ftime(&startTime);  
	for (size_t i = 0; i < 10000; i++)
	{
    
    
		D_arr = nc::column_stack({
    
    D_arr, x1});
		// std::cout<<phi3<<std::endl;
	}
	ftime(&endTime);
	std::cout << "time cost is：" << (endTime.time-startTime.time)*1000 + (endTime.millitm - startTime.millitm) << "ms" << std::endl;
}

double cpp

#include <iostream>
#include <vector>
#include <fstream>
#include <cmath>
#include <sys/timeb.h>
#include"iomanip"
#include <Eigen/Dense>

#include"NumCpp.hpp"
#include"boost/filesystem.hpp"

using namespace Eigen;
using namespace std;

int M = 200; // 2 20 200
int N = 200; // 2 20 200

int main()
{
    
    
	Eigen::MatrixXd X_mat = Eigen::MatrixXd::Random(M, N);
	Eigen::MatrixXd x = Eigen::VectorXd::Random(M, 1);


	std::cout<<"test the time cost for Eigen with matrix"<<std::endl;
	struct timeb startTime, endTime;
	ftime(&startTime);  
	for (size_t i = 0; i < 10000; i++)
	{
    
    
		X_mat = (Eigen::MatrixXd(X_mat.rows(), X_mat.cols()+x.cols())<<X_mat, x).finished();
	}	
	ftime(&endTime);
	std::cout << "time cost is：" << (endTime.time-startTime.time)*1000 + (endTime.millitm - startTime.millitm) << "ms" << std::endl;


	nc::NdArray<double> x1 = nc::random::rand<double>({
    
    M, 1});
	nc::NdArray<double> D_arr = nc::random::rand<double>({
    
    M, N});


	std::cout<<"test the time cost for numcpp"<<std::endl;
	ftime(&startTime);  
	for (size_t i = 0; i < 10000; i++)
	{
    
    
		D_arr = nc::column_stack({
    
    D_arr, x1});
		// std::cout<<phi3<<std::endl;
	}
	ftime(&endTime);
	std::cout << "time cost is：" << (endTime.time-startTime.time)*1000 + (endTime.millitm - startTime.millitm) << "ms" << std::endl;
}

python for numpy and cupy

import cupy as cp
import numpy as np
import time

M = 2 # 2 20 200
N = 2 # 2 20 200

x1_np = np.random.rand(M, 1)
D_arr_np = np.random.rand(M, N)

x1_cp = cp.random.rand(M, 1)
D_arr_cp = cp.random.rand(M, N)

def testtimecp():
    global D_arr_cp
    for i in range(10000):
        D_arr_cp = cp.column_stack((D_arr_cp, x1_cp))
        
        
def testtimenp():
    global D_arr_np
    for i in range(10000):
        D_arr_np = np.column_stack((D_arr_np, x1_np))
        
print("time cost for cupy:")        
%time testtimecp()

print("time cost for numpy")
%time testtimenp()

3.2 测试结果

矩阵大小	M=2, N=2	M=20, N=20	M=200, N=200
Eigen	f(203ms)、d(377ms)	f(1.427s)、d(2.875s)	f(20.29s)、d(42.458s)
NumCpp	f(130ms)、d(144ms)	f(1.416s)、d(1.319s)	f(15.673s)、d(18.797s)
NumPy	54ms	324ms	8.36s
CuPy	308ms	312ms	3.03s

四、结论与建议

4.1 结论

矩阵计算效率由高到低：Eigen>NumCpp>NumPy ；
动态矩阵运算（此处不断增加一列）的效率由高到低：NumPy>NumCpp>Eigen；
CuPy在数据维数较低时，没有体现出加速，但是应对高维数据时，有明显的加速；
一个有意思的现象，此处试验结果中数据为float型时，所需的计算成本越低。（最开始，我一直认为32位的float要比64位的double要省时，但是网上有人说：新版C++会将float转为double再处理，等于用float会增加额外的转换步骤，计算效率更低，但是本博客的试验结果又验证回最初的直觉。不知所以然，希望大神路过能给予解答。）

4.2 建议

当不存在大量动态矩阵扩展的算法，C++使用Eigen；
当数据维数很高时，强烈建议GPU加速；
当算法存在动态矩阵扩展时，需要通过试验确定合适的库，可能组合各种库为较合适的方案。

4.3 源起

写了一个动态RBF网络，原来一直在python语言下用，采用NumPy实现。最近需要在C++环境中使用，于是用Eigen实现了。发现，Eigen实现版本的计算成本比NumPy还高。于是又实现了NumCpp版本，发现还是NumPy效率最高。经过以上试验，终于窥见一些缘由：算法里同时需要大量矩阵乘、加、减运算，同时又有大量动态矩阵运算（增加一行或一列）。矩阵乘、加、减的运算相差较各库的动态矩阵计算效率相差小。这也是为什么会产生与直觉不一样的结果。其中，double型计算更加耗时。。。注意和下面结果相矛盾！！！

最终，我没用Eigen,而是采用C++调用Pyhon(也即NumPy实现)的方案。

下面列出动态RBF网络算法利用各库实现的计算成本：

方法	Eigen	NumCpp	NumPy
time cost	f(2.575s)、d(1.355s)	f(12.048s)、d(5.707s)	770ms

其中，float型计算更加耗时。。。怎么和上面结果相矛盾！！！

扫描二维码关注公众号，回复： 13038330 查看本文章

注意事项

C++编译存在DeBug与Release两种方式，DeBug一般用于调试，编译的版本会存储大量调试信息，实际运行时并不能反应算法的真实计算成本。一定要利用Release方式编译测试代码。

这里要感谢同事小张的提醒。最初测试时，发现Eigen和NumCpp的效率都要比NumPy要差。今天午饭时跟小张抱怨这个疑惑时，小张便给出问题的所在。

在CMakeLists.txt文件里添加下面两行，启动Release编译方式，并启动最高效率编译模式。

SET(CMAKE_BUILD_TYPE "Release")

by windSeS
2020-10-30
2020-11-5修补