1.介绍
ARM® NEON™ 技术是适用于 ARM Cortex™-A 系列处理器的 SIMD(单指令多数据)架构扩展。 它可以使多媒体和信号处理算法提速,例如视频编码/解码、2D/3D 图形、游戏、音频和语音处理以及图像处理等。 随着 Ne10 的问世,出现了许多使用 NEON 并显著改善用户体验的多媒体应用程序。 有些应用程序开发人员可能不熟悉 NEON 汇编代码,因此 Ne10 库的创建可使开发人员从 ARMv7/NEON 中获得最大效益,而不必使用繁琐的汇编代码。
Ne10 库提供一组最为常用并且极为优化的函数。 这组函数最初于 2012 年 3 月发布。 库中的初始功能集着重于矩阵/矢量代数以及信号处理。 Ne10 将持续改进,以包含图像处理等多领域内的更多高计算量任务。
2.源码获取
Ne10的源码公开在github上面,其网站地址:https://github.com/projectNe10/Ne10 。
3.环境
3.1硬件环境
您需要准备ARM Cortex-A/R系列开发平台。如果没有硬件开发平台,也可使用仿真环境,如Google的Android Emulator。我现在使用的硬件开发板是ARM Cortex-A53平台,交叉编译环境是Ubuntu 16.04。
3.2软件环境
- 工具链:aarch64-linux-gnu-
- CMake (http://www.cmake.org/):跨平台的开源构建系统
4.编译和使用Ne10库
4.1编译Ne10
通过第2部分,获取源码后,进入源码目录,进行如下操作:
- 修改CMakeLists.txt.有二处修改,修改如下:
1. option(NE10_BUILD_UNIT_TEST "Build NE10 unit test" ON) //原先为OFF
2. option(NE10_PERFORMANCE_TEST "Run performance test" ON)//原先为OFF
此处打开源码中的测试程序,并选择performance test。关于smoke testing、regression testing、performance testing的区别如下:
- Conformance testing (also called smoke testing), to check if the library works correctly.
- Regression testing, which is similar to conformance testing but is aimed more specifically at testing whether the library still operates correctly after a change.
- Performance testing, which gives an indication of how quickly the library performs certain tasks.
2.修改GNUlinux_config.cmake
if(NOT DEFINED ENV{NE10_LINUX_TARGET_ARCH})
set(NE10_LINUX_TARGET_ARCH "aarch64")
else()
//直接将此处设置为,aarch64
3.编译
mkdir build
cd build
cmake -DCMAKE_TOOLCHAIN_FILE=../GNUlinux_config.cmake ../
make -j8
此处是静态的编译方式,可以看到在build/modules/下面生成了libNE10.a。
ccion@ubuntu:~/Ne10/build/modules$ ls
CMakeFiles cmake_install.cmake libNE10.a Makefile
4.2使用和结果分析
通过上面的步骤可以看到,在build目录下面生成了test文件,有两个应用程序。这里在我的开发板平台上面执行FFT的测试程序NE10_dsp_unit_test_static_performance。其重要部分源码如下:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include "NE10_dsp.h"
#include "NE10_macros.h"
#include "seatest.h"
#include "unit_test_common.h"
/*
 * Performance comparison of the single-precision complex-to-complex 1-D FFT.
 *
 * For every transform length from MIN_LENGTH_SAMPLES_CPX up to
 * TEST_LENGTH_SAMPLES (doubling each step) the C and the NEON routines are
 * timed, first for the forward transform and then for the inverse transform,
 * and one log line per direction is emitted with both timings, the time
 * savings in percent and the C:NEON ratio.
 *
 * Buffers, FFT configurations and timing variables are not declared here;
 * they come from the surrounding test file.
 */
void test_fft_c2c_1d_float32_performance()
{
    ne10_int32_t fftSize = 0;
    ne10_int32_t rep = 0;
    ne10_int32_t reps_per_size = 0;
    ne10_int32_t alloc_status = NE10_OK;

    fprintf (stdout, "----------%30s start\n", __FUNCTION__);
    fprintf (stdout, "%25s%20s%20s%20s%20s\n", "FFT Length", "C Time (micro-s)", "NEON Time (micro-s)", "Time Savings", "Performance Ratio");

    fftSize = MIN_LENGTH_SAMPLES_CPX;
    while (fftSize <= TEST_LENGTH_SAMPLES)
    {
        /* Each complex sample is two floats (real, imaginary). */
        const size_t copy_bytes = 2 * fftSize * sizeof (ne10_float32_t);

        fprintf (stdout, "FFT size %d\n", fftSize);

        /* ---- forward transform ---- */
        memcpy (in_c, testInput_f32, copy_bytes);
        memcpy (in_neon, testInput_f32, copy_bytes);

        alloc_status = test_c2c_alloc (fftSize);
        if (alloc_status == NE10_ERR)
        {
            return;
        }

        /* Fewer repetitions for longer transforms. */
        reps_per_size = TEST_COUNT / fftSize;

        GET_TIME
        (
            time_c,
            {
                for (rep = 0; rep < reps_per_size; rep++)
                {
                    ne10_fft_c2c_1d_float32_c ( (ne10_fft_cpx_float32_t*) out_c, (ne10_fft_cpx_float32_t*) in_c, cfg_c, 0);
                }
            }
        );
        GET_TIME
        (
            time_neon,
            {
                for (rep = 0; rep < reps_per_size; rep++)
                {
                    ne10_fft_c2c_1d_float32_neon ( (ne10_fft_cpx_float32_t*) out_neon, (ne10_fft_cpx_float32_t*) in_neon, cfg_neon, 0);
                }
            }
        );

        time_speedup = (ne10_float32_t) time_c / time_neon;
        time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
        ne10_log (__FUNCTION__, "Float FFT%21d%20lld%20lld%19.2f%%%18.2f:1\n", fftSize, time_c, time_neon, time_savings, time_speedup);

        /* ---- inverse transform ----
         * Both implementations are fed the forward output of the C routine. */
        memcpy (in_c, out_c, copy_bytes);
        memcpy (in_neon, out_c, copy_bytes);

        GET_TIME
        (
            time_c,
            {
                for (rep = 0; rep < reps_per_size; rep++)
                {
                    ne10_fft_c2c_1d_float32_c ( (ne10_fft_cpx_float32_t*) out_c, (ne10_fft_cpx_float32_t*) in_c, cfg_c, 1);
                }
            }
        );
        GET_TIME
        (
            time_neon,
            {
                for (rep = 0; rep < reps_per_size; rep++)
                {
                    ne10_fft_c2c_1d_float32_neon ( (ne10_fft_cpx_float32_t*) out_neon, (ne10_fft_cpx_float32_t*) in_neon, cfg_neon, 1);
                }
            }
        );

        time_speedup = (ne10_float32_t) time_c / time_neon;
        time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
        ne10_log (__FUNCTION__, "Float FFT%21d%20lld%20lld%19.2f%%%18.2f:1\n", fftSize, time_c, time_neon, time_savings, time_speedup);

        /* Release the per-length configurations before moving on. */
        NE10_FREE (cfg_c);
        NE10_FREE (cfg_neon);

        fftSize *= 2;
    }
}
执行结果:
可以看到,在FFT长度大于8之后,采用Ne10(NEON)版本比纯C版本效率高很多;但是在处理长度为2、4、8的FFT时,NEON版本的效率居然还没有C版本高。
再来看看处理图像的效率问题:执行NE10_imgproc_unit_test_static_performance。其重要源码如下:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include "NE10_imgproc.h"
#include "seatest.h"
#include "unit_test_common.h"
/*
 * Performance comparison of bilinear RGBA image resizing, C versus NEON.
 *
 * Four buffers of MEM_SIZE x MEM_SIZE 4-channel pixels are allocated (input
 * and output for each implementation) and both inputs are filled with the
 * same pseudo-random bytes. Then, for every square source size h x h and
 * every square destination size w x w (both from 16 up to, but excluding,
 * MEM_SIZE in steps of 4), each implementation is run TEST_COUNT times under
 * GET_TIME and the two timings are printed and logged.
 *
 * If any buffer cannot be allocated the benchmark is skipped after reporting
 * the failure on stderr; whatever was allocated is released.
 */
void test_resize_performance_case()
{
    ne10_int32_t srcw;
    ne10_int32_t srch;
    ne10_int32_t dstw;
    ne10_int32_t dsth;
    ne10_int32_t i;
    ne10_int32_t w, h;
    ne10_int32_t channels = 4;
    ne10_int32_t pic_size = MEM_SIZE * MEM_SIZE * channels * sizeof (ne10_uint8_t);
    ne10_int64_t time_c = 0;
    ne10_int64_t time_neon = 0;

    /* init input memory */
    in_c = NE10_MALLOC (pic_size);
    in_neon = NE10_MALLOC (pic_size);

    /* init dst memory */
    out_c = NE10_MALLOC (pic_size);
    out_neon = NE10_MALLOC (pic_size);

    /* Never touch the buffers unless every allocation succeeded. */
    if (in_c == NULL || in_neon == NULL || out_c == NULL || out_neon == NULL)
    {
        fprintf (stderr, "%s: failed to allocate image buffers (%d bytes each)\n", __FUNCTION__, (int) pic_size);
        goto cleanup;
    }

    /* Identical random content for both implementations. */
    for (i = 0; i < pic_size; i++)
    {
        in_c[i] = in_neon[i] = (rand() & 0xff);
    }

    for (h = 16; h < MEM_SIZE; h += 4)
    {
        for (w = 16; w < MEM_SIZE; w += 4)
        {
            /* Square source (h x h) resized to square destination (w x w). */
            srcw = h;
            srch = h;
            dstw = w;
            dsth = w;

            printf ("srcw X srch = %d X %d \n", srcw, srch);
            printf ("dstw X dsth = %d X %d \n", dstw, dsth);

            GET_TIME
            (
                time_c,
                {
                    for (i = 0; i < TEST_COUNT; i++)
                        ne10_img_resize_bilinear_rgba_c (out_c, dstw, dsth, in_c, srcw, srch, srcw);
                }
            );
            GET_TIME
            (
                time_neon,
                {
                    for (i = 0; i < TEST_COUNT; i++)
                        ne10_img_resize_bilinear_rgba_neon (out_neon, dstw, dsth, in_neon, srcw, srch, srcw);
                }
            );

            /* %lld requires long long; the cast keeps the call correct
             * whatever integer type ne10_int64_t is an alias for. */
            printf ("time c %lldus \n", (long long) time_c);
            printf ("time neon %lldus \n", (long long) time_neon);

            /* The savings and ratio columns are not computed in this test;
             * floating-point zeros are passed so the arguments match the
             * %f conversions in the format string. */
            ne10_log (__FUNCTION__, "IMAGERESIZE%20d%20lld%20lld%19.2f%%%18.2f:1\n", (h * MEM_SIZE + w), (long long) time_c, (long long) time_neon, 0.0, 0.0);
        }
    }

cleanup:
    /* NE10_FREE is defined outside this block, so NULL pointers (possible
     * only on the allocation-failure path) are skipped explicitly. */
    if (in_c != NULL)
    {
        NE10_FREE (in_c);
    }
    if (in_neon != NULL)
    {
        NE10_FREE (in_neon);
    }
    if (out_c != NULL)
    {
        NE10_FREE (out_c);
    }
    if (out_neon != NULL)
    {
        NE10_FREE (out_neon);
    }
}
执行结果:
很明显,做图像resize时,NEON版本要比C版本的效率高很多。