版权声明:本文为博主原创文章,欢迎转载。 https://blog.csdn.net/samylee/article/details/88874899
| 头文件 | 指令集描述 |
| --- | --- |
| intrin.h | All Architectures |
| mmintrin.h | MMX |
| xmmintrin.h | SSE |
| emmintrin.h | SSE2 |
| pmmintrin.h | SSE3 |
| smmintrin.h | SSE4.1 |
| nmmintrin.h | SSE4.2 |
| immintrin.h | AVX |
math_function.h
#pragma once
#include <immintrin.h>
#include <stdio.h>
// Scalar reference: adds input[0..size-1] one element at a time, in index
// order, and returns the total.
float MathSum(const float *input, int size);
// SSE version: accumulates four floats per step, then adds the leftover
// (size % 4) elements one by one. Prints a message and returns -1 when
// input is null.
float SSESum(const float *input, int size);
// AVX version: accumulates eight floats per step, then adds the leftover
// (size % 8) elements one by one. Prints a message and returns -1 when
// input is null.
float AVXSum(const float *input, int size);
math_function.cpp
#include "math_function.h"
// Scalar baseline used to compare against the SIMD variants.
// Walks the array front to back and accumulates every element into a single
// float, so the rounding order is strictly input[0], input[1], ...
// A size of zero or less yields 0.
float MathSum(const float *input, int size)
{
    float total = 0.0f;
    const float *cursor = input;

    // Count down the number of elements still to be added.
    for (int remaining = size; remaining > 0; --remaining)
    {
        total += *cursor;
        ++cursor;
    }

    return total;
}
// Sums `size` floats from `input` using 128-bit SSE vectors (4 floats per step).
//
// input : array of at least `size` floats; no particular alignment is required.
// size  : number of elements to add; values <= 0 give a result of 0.
// Returns the sum, or -1 (after printing a message) when `input` is null.
//
// The four vector lanes are accumulated independently and combined at the end,
// so for inputs that are not exactly representable the result may differ in
// the last bits from a strictly sequential sum.
float SSESum(const float *input, int size)
{
    if (!input)
    {
        printf("input data is null\n");
        return -1;
    }

    const int nBlockWidth = 4;                 // floats per __m128
    const int cntBlock = size / nBlockWidth;   // number of full vectors
    const int cntRem = size % nBlockWidth;     // trailing scalars

    __m128 sumData = _mm_setzero_ps();
    const float *p = input;
    for (int i = 0; i < cntBlock; i++)
    {
        // Unaligned load: the caller's buffer is an arbitrary float pointer,
        // so the 16-byte alignment demanded by _mm_load_ps cannot be assumed.
        sumData = _mm_add_ps(sumData, _mm_loadu_ps(p));
        p += nBlockWidth;
    }

    // Horizontal reduction of the lanes (s0, s1, s2, s3) using SSE1 only.
    // Swap neighbours and add -> lane 0 holds s0+s1, lane 2 holds s2+s3.
    const __m128 swapped = _mm_shuffle_ps(sumData, sumData, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128 pairSum = _mm_add_ps(sumData, swapped);
    // Bring the upper pair down to lane 0 and add -> (s0+s1) + (s2+s3).
    const __m128 upperPair = _mm_movehl_ps(pairSum, pairSum);
    float output = _mm_cvtss_f32(_mm_add_ss(pairSum, upperPair));

    // p now points just past the last full vector; add the leftovers.
    for (int i = 0; i < cntRem; i++)
    {
        output += p[i];
    }
    return output;
}
// Sums `size` floats from `input` using 256-bit AVX vectors (8 floats per step).
//
// input : array of at least `size` floats; no particular alignment is required.
// size  : number of elements to add; values <= 0 give a result of 0.
// Returns the sum, or -1 (after printing a message) when `input` is null.
//
// The eight vector lanes are accumulated independently and combined at the
// end, so for inputs that are not exactly representable the result may differ
// in the last bits from a strictly sequential sum.
#if defined(__GNUC__) || defined(__clang__)
// Lets GCC/Clang generate AVX code for this one function without a global
// -mavx switch. The CPU must still support AVX at run time.
__attribute__((target("avx")))
#endif
float AVXSum(const float *input, int size)
{
    if (!input)
    {
        printf("input data is null\n");
        return -1;
    }

    const int nBlockWidth = 8;                 // floats per __m256
    const int cntBlock = size / nBlockWidth;   // number of full vectors
    const int cntRem = size % nBlockWidth;     // trailing scalars

    __m256 sumData = _mm256_setzero_ps();
    const float *p = input;
    for (int i = 0; i < cntBlock; i++)
    {
        // Unaligned load: a malloc'ed or arbitrary float buffer is not
        // guaranteed to meet the 32-byte alignment _mm256_load_ps requires.
        sumData = _mm256_add_ps(sumData, _mm256_loadu_ps(p));
        p += nBlockWidth;
    }

    // _mm256_hadd_ps works inside each 128-bit half separately.
    // With lanes (s0..s3 | s4..s7):
    //   1st hadd -> (s0+s1, s2+s3, s0+s1, s2+s3 | s4+s5, s6+s7, s4+s5, s6+s7)
    //   2nd hadd -> every lane of the low half  = s0+s1+s2+s3,
    //               every lane of the high half = s4+s5+s6+s7
    sumData = _mm256_hadd_ps(sumData, sumData);
    sumData = _mm256_hadd_ps(sumData, sumData);

    // Read lane 0 of each half through intrinsics instead of the MSVC-only
    // m256_f32 member so the code also builds with GCC and Clang.
    float output = _mm_cvtss_f32(_mm256_castps256_ps128(sumData));   // low half total
    output += _mm_cvtss_f32(_mm256_extractf128_ps(sumData, 1));      // high half total

    // p now points just past the last full vector; add the leftovers.
    for (int i = 0; i < cntRem; i++)
    {
        output += p[i];
    }
    return output;
}
main.cpp
#include "math_function.h"
#include <stdlib.h>
#include <time.h>
// Benchmark driver: fills a 27-element array with 0.0025, then calls each of
// MathSum, SSESum and AVXSum cntLoop times and prints the last result together
// with the elapsed clock() ticks for that variant.
// Returns 0 on success, 1 if the input buffer cannot be allocated.
int main(int argc, char* argv[])
{
    (void)argc;
    (void)argv;

    const int size = 27;   // not a multiple of 4 or 8, so the remainder loops run too
    float *input = (float *)malloc(sizeof(float) * size);
    if (!input)
    {
        printf("failed to allocate input buffer\n");
        return 1;
    }
    for (int i = 0; i < size; i++)
    {
        input[i] = 0.0025f;
    }

    const int cntLoop = 300000000;

    clock_t start_t = clock();
    float org = 0.0f;
    for (int i = 0; i < cntLoop; i++)
    {
        org = MathSum(input, size);
    }
    printf("org = %f\t", org);
    // clock_t has no printf conversion of its own; cast to long to match %ld.
    printf("cost time: %ld\n", (long)(clock() - start_t));

    start_t = clock();
    float sse = 0.0f;
    for (int i = 0; i < cntLoop; i++)
    {
        sse = SSESum(input, size);
    }
    printf("sse = %f\t", sse);
    printf("cost time: %ld\n", (long)(clock() - start_t));

    start_t = clock();
    float avx = 0.0f;
    for (int i = 0; i < cntLoop; i++)
    {
        avx = AVXSum(input, size);
    }
    printf("avx = %f\t", avx);
    printf("cost time: %ld\n", (long)(clock() - start_t));

    // Keep the console window open until a key is pressed.
    getchar();
    free(input);
    return 0;
}
运行结果
测试硬件:CPU-4790-4core
预处理器:_WINDOWS
命令行:/arch:AVX
任何问题请加唯一QQ2258205918(名称samylee)!