C/C++标准库的ceil()函数,能否加速一下?
在OpenCV中,看到了cvCeil()函数,它是用sse2加速的。
其实我现在用的PC连AVX2都已经支持了;SSE、AVX系列指令集是逐代递增支持的(支持AVX2的CPU必然也支持SSE4.1),所以用SSE4.1来优化一下,应该还是能在绝大多数机器上使用的。
本质上大多数人写的大多数程序,都是调用现成的包/API,无非是调包侠也有鄙视链而已。
今天做一回SSE4.1的调包侠:1亿次调用的耗时,从cvCeil()的约100ms优化到约25ms,比clang-8.0(实际上是它所用的标准库)里的ceil()(约30ms)还要再快约5ms:
#include <iostream>
#include <cmath>
#include <sstream>
#include <chrono>
//sse2
#if defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__&& !defined __APPLE__)
#include <emmintrin.h>
#endif
//sse 4.1
#include <smmintrin.h>
//sse2 optimized
/// Rounds a double up to the nearest integer and returns it as an int
/// (OpenCV-style cvCeil).
///
/// On x86-64 MSVC, or on non-Apple GCC/Clang builds with SSE2 enabled, the
/// SSE2 path is used; every other toolchain takes the portable fallback.
///
/// @param value  Number to round up. The result is only meaningful when the
///               ceiling of value is representable as int.
/// @return The smallest integer that is not less than value.
inline int cvCeil(double value)
{
#if defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__&& !defined __APPLE__)
    const __m128d t = _mm_set_sd(value);
    // The conversion rounds according to the current SSE rounding mode
    // (round-to-nearest unless the program changed it), so i may still be
    // one below the ceiling.
    const int i = _mm_cvtsd_si32(t);
    // The scalar compare fills the low lane with ones when i < value; the
    // upper lane stays +0.0 from _mm_set_sd, so the mask is exactly 0 or 1.
    return i + _mm_movemask_pd(_mm_cmplt_sd(_mm_cvtsi32_sd(t, i), t));
#else
    // Portable fallback: truncate toward zero, then add one when truncation
    // landed below value. This replaces a branch that depended on cvRound(),
    // which is not defined in this file.
    const int i = static_cast<int>(value);
    return i + (i < value);
#endif
}
//sse4 optimized
/// Rounds a double up to the nearest integer and returns it as an int,
/// using the SSE4.1 scalar rounding instruction when it is available.
///
/// The SSE4.1 path is taken on x86-64 MSVC, or on non-Apple GCC/Clang builds
/// compiled with SSE4.1 enabled (e.g. -msse4.1 / -msse4); every other
/// toolchain takes the portable fallback.
///
/// @param value  Number to round up. The result is only meaningful when the
///               ceiling of value is representable as int.
/// @return The smallest integer that is not less than value.
inline int myCeil(double value)
{
#if defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE4_1__ && !defined __APPLE__)
    const __m128d val = _mm_set_sd(value);
    // _mm_round_sd returns its result rather than writing through its first
    // argument, so the returned vector must be captured. The first argument
    // only supplies the upper lane, which is not used here.
    const __m128d rounded = _mm_round_sd(val, val, _MM_FROUND_CEIL);
    // The low lane already holds an integral value, so this conversion is
    // exact regardless of the current rounding mode.
    return _mm_cvtsd_si32(rounded);
#else
    // Portable fallback: truncate toward zero, then add one when truncation
    // landed below value.
    const int i = static_cast<int>(value);
    return i + (i < value);
#endif
}
/// Formats a chrono duration as a millisecond count followed by " ms".
///
/// The duration is first converted to whole microseconds (any finer part is
/// dropped by duration_cast), then scaled to milliseconds and written with
/// the stream's default floating-point formatting.
///
/// @param dt  Duration of any representation and period.
/// @return Text such as "1.5 ms".
template<typename T, typename P>
std::string toString(std::chrono::duration<T,P> dt)
{
    const auto micros =
        std::chrono::duration_cast<std::chrono::microseconds>(dt);
    std::ostringstream out;
    out << micros.count() * 1e-3;
    out << " ms";
    return out.str();
}
// Micro-benchmark: runs std::ceil, cvCeil and myCeil the same number of
// times on one input and prints the elapsed time of each loop, followed by
// the last result of each so the three can be compared.
int main()
{
    using Clock = std::chrono::steady_clock;
    constexpr int kIterations = 100000000;

    // volatile forces a real load of the input and a real store of the
    // result on every iteration, so the loops cannot be optimized away.
    volatile double x = 34.234;
    volatile double y1, y2, y3;

    const Clock::time_point startStd = Clock::now();
    for (int iter = 0; iter != kIterations; ++iter) {
        y1 = std::ceil(x);
    }

    const Clock::time_point startCv = Clock::now();
    for (int iter = 0; iter != kIterations; ++iter) {
        y2 = cvCeil(x);
    }

    const Clock::time_point startMy = Clock::now();
    for (int iter = 0; iter != kIterations; ++iter) {
        y3 = myCeil(x);
    }
    const Clock::time_point finish = Clock::now();

    const auto stdElapsed = startCv - startStd;
    const auto cvElapsed = startMy - startCv;
    const auto myElapsed = finish - startMy;

    std::cout << "std::ceil: " << toString(stdElapsed) << "\n";
    std::cout << "cvCeil : " << toString(cvElapsed) << "\n";
    std::cout << "myCeil : " << toString(myElapsed) << "\n";
    std::cout << "y1=" << y1 << ", y2=" << y2 << ", y3=" << y3 << std::endl;
    return 0;
}
编译指令:
clang++-8 main8.cpp -O3 -o a8 -std=c++11 -msse4
运行输出:
std::ceil: 30.347 ms
cvCeil : 106.99 ms
myCeil : 25.318 ms
y1=35, y2=35, y3=35
refs:
is cvCeil() faster than standard library?
MSDN - _mm_round_sd