Ne10编译和介绍

1.介绍

ARM® NEON™ 技术是适用于 ARM Cortex™-A 系列处理器的 SIMD（单指令多数据）架构扩展。它可以使多媒体和信号处理算法提速，例如视频编码/解码、2D/3D 图形、游戏、音频和语音处理以及图像处理等。NEON 技术问世后，出现了许多使用 NEON 并显著改善用户体验的多媒体应用程序。但有些应用程序开发人员可能不熟悉 NEON 汇编代码，因此创建了 Ne10 库，使开发人员不必编写繁琐的汇编代码，也能从 ARMv7/NEON 中获得最大效益。

Ne10 库提供一组最为常用并且极为优化的函数。这组函数最初于 2012 年 3 月发布。库中的初始功能集着重于矩阵/矢量代数以及信号处理。 Ne10 将持续改进，以包含图像处理等多领域内的更多高计算量任务。

2.源码获取

Ne10的源码公开在github上面，其网站地址：https://github.com/projectNe10/Ne10 。

3.环境

3.1硬件环境

您需要准备 ARM Cortex-A/R 系列开发平台。如果没有硬件开发平台，也可使用仿真环境，如 Google 的 Android Emulator。本文使用的硬件开发板是 ARM Cortex-A53 平台，交叉编译主机为 Ubuntu 16.04。

3.2软件环境

工具链：aarch64-linux-gnu-

CMake (http://www.cmake.org/)：跨平台的开源构建系统

4.编译和使用Ne10库

4.1编译Ne10

通过第2部分，获取源码后，进入源码目录，进行如下操作：

修改CMakeLists.txt，共有两处修改，如下：

1. option(NE10_BUILD_UNIT_TEST "Build NE10 unit test" ON)  //原先为OFF
2. option(NE10_PERFORMANCE_TEST "Run performance test" ON)//原先为OFF

这两处修改分别打开了源码中单元测试程序的构建，并选择运行性能测试（performance test）。关于 smoke testing、regression testing、performance testing 的区别如下：

Conformance testing (also called smoke testing), to check if the library works correctly.
Regression testing, which is similar to conformance testing but is aimed more specifically at testing whether the library still operates correctly after a change.
Performance testing, which gives an indication of how quickly the library performs certain tasks.

2.修改GNUlinux_config.cmake

if(NOT DEFINED ENV{NE10_LINUX_TARGET_ARCH})
   set(NE10_LINUX_TARGET_ARCH "aarch64")
else()
//直接将此处设置为 aarch64

3.编译

mkdir build
cd build
cmake -DCMAKE_TOOLCHAIN_FILE=../GNUlinux_config.cmake ../
make -j8

此处采用静态库的编译方式，可以看到在 build/modules/ 下面生成了 libNE10.a：

ccion@ubuntu:~/Ne10/build/modules$ ls
CMakeFiles  cmake_install.cmake  libNE10.a  Makefile

4.2使用和结果分析

通过上面的步骤可以看到，在build目录下面生成了test文件，其中有两个应用程序。这里在我的开发板平台上执行FFT的测试程序 NE10_dsp_unit_test_static_performance。其重要部分源码如下：

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>

#include "NE10_dsp.h"
#include "NE10_macros.h"
#include "seatest.h"
#include "unit_test_common.h"
/*
 * Performance benchmark for the 1-D complex-to-complex float32 FFT.
 *
 * For every transform length, doubling from MIN_LENGTH_SAMPLES_CPX up to and
 * including TEST_LENGTH_SAMPLES, the forward transform and then the inverse
 * transform are each executed TEST_COUNT / fftSize times through both the C
 * and the NEON entry points. The elapsed times captured by GET_TIME, the
 * percentage of time saved and the C:NEON ratio are reported via ne10_log.
 *
 * The function returns early when test_c2c_alloc reports NE10_ERR.
 */
void test_fft_c2c_1d_float32_performance()
{
    ne10_int32_t fftSize;
    ne10_int32_t rep;
    ne10_int32_t reps_per_size;
    ne10_int32_t alloc_status;
    size_t cpx_bytes;

    fprintf (stdout, "----------%30s start\n", __FUNCTION__);
    fprintf (stdout, "%25s%20s%20s%20s%20s\n", "FFT Length", "C Time (micro-s)", "NEON Time (micro-s)", "Time Savings", "Performance Ratio");

    fftSize = MIN_LENGTH_SAMPLES_CPX;
    while (fftSize <= TEST_LENGTH_SAMPLES)
    {
        fprintf (stdout, "FFT size %d\n", fftSize);

        /* number of bytes copied for this length: 2 * fftSize float32 values */
        cpx_bytes = 2 * fftSize * sizeof (ne10_float32_t);

        /* ---- forward transform ---- */
        /* both implementations start from the same reference input */
        memcpy (in_c, testInput_f32, cpx_bytes);
        memcpy (in_neon, testInput_f32, cpx_bytes);

        alloc_status = test_c2c_alloc (fftSize);
        if (NE10_ERR == alloc_status)
        {
            return;
        }

        /* larger transforms are repeated fewer times */
        reps_per_size = TEST_COUNT / fftSize;

        GET_TIME
        (
            time_c,
        {
            for (rep = 0; rep < reps_per_size; rep++)
            {
                ne10_fft_c2c_1d_float32_c ( (ne10_fft_cpx_float32_t*) out_c, (ne10_fft_cpx_float32_t*) in_c, cfg_c, 0);
            }
        }
        );
        GET_TIME
        (
            time_neon,
        {
            for (rep = 0; rep < reps_per_size; rep++)
            {
                ne10_fft_c2c_1d_float32_neon ( (ne10_fft_cpx_float32_t*) out_neon, (ne10_fft_cpx_float32_t*) in_neon, cfg_neon, 0);
            }
        }
        );

        time_speedup = (ne10_float32_t) time_c / time_neon;
        time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
        ne10_log (__FUNCTION__, "Float FFT%21d%20lld%20lld%19.2f%%%18.2f:1\n", fftSize, time_c, time_neon, time_savings, time_speedup);

        /* ---- inverse transform ---- */
        /* both implementations are fed the C forward result */
        memcpy (in_c, out_c, cpx_bytes);
        memcpy (in_neon, out_c, cpx_bytes);

        GET_TIME
        (
            time_c,
        {
            for (rep = 0; rep < reps_per_size; rep++)
            {
                ne10_fft_c2c_1d_float32_c ( (ne10_fft_cpx_float32_t*) out_c, (ne10_fft_cpx_float32_t*) in_c, cfg_c, 1);
            }
        }
        );
        GET_TIME
        (
            time_neon,
        {
            for (rep = 0; rep < reps_per_size; rep++)
            {
                ne10_fft_c2c_1d_float32_neon ( (ne10_fft_cpx_float32_t*) out_neon, (ne10_fft_cpx_float32_t*) in_neon, cfg_neon, 1);
            }
        }
        );

        time_speedup = (ne10_float32_t) time_c / time_neon;
        time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
        ne10_log (__FUNCTION__, "Float FFT%21d%20lld%20lld%19.2f%%%18.2f:1\n", fftSize, time_c, time_neon, time_savings, time_speedup);

        /* release the per-length configurations before moving to the next size */
        NE10_FREE (cfg_c);
        NE10_FREE (cfg_neon);

        fftSize *= 2;
    }
}

执行结果：

可以看到，在 FFT 长度大于 8 之后，NEON 版本比纯 C 版本效率高很多；但是在 FFT 长度为 2、4、8 时，NEON 版本的效率居然还没有 C 版本高。

再来看看处理图像的效率问题：执行 NE10_imgproc_unit_test_static_performance。其重要源码如下：

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>

#include "NE10_imgproc.h"
#include "seatest.h"
#include "unit_test_common.h"

void test_resize_performance_case()
{
    ne10_int32_t srcw;
    ne10_int32_t srch;
    ne10_int32_t dstw;
    ne10_int32_t dsth;
    ne10_int32_t i;
    ne10_int32_t w, h;
    ne10_int32_t channels = 4;
    ne10_int32_t pic_size = MEM_SIZE * MEM_SIZE * channels * sizeof (ne10_uint8_t);
    ne10_int64_t time_c = 0;
    ne10_int64_t time_neon = 0;

    /* init input memory */
    in_c = NE10_MALLOC (pic_size);
    in_neon = NE10_MALLOC (pic_size);

    /* init dst memory */
    out_c = NE10_MALLOC (pic_size);
    out_neon = NE10_MALLOC (pic_size);

    for (i = 0; i < pic_size; i++)
    {
        in_c[i] = in_neon[i] = (rand() & 0xff);
    }

    for (h = 16; h < MEM_SIZE; h += 4)
    {
        for (w = 16; w < MEM_SIZE; w += 4)
        {
            srcw = h;
            srch = h;
            dstw = w;
            dsth = w;

            printf ("srcw X srch = %d X %d \n", srcw, srch);
            printf ("dstw X dsth = %d X %d \n", dstw, dsth);

            GET_TIME
            (
                time_c,
            {
                for (i = 0; i < TEST_COUNT; i++)
                    ne10_img_resize_bilinear_rgba_c (out_c, dstw, dsth, in_c, srcw, srch, srcw);
            }
            );

            GET_TIME
            (
                time_neon,
            {
                for (i = 0; i < TEST_COUNT; i++)
                    ne10_img_resize_bilinear_rgba_neon (out_neon, dstw, dsth, in_neon, srcw, srch, srcw);
            }
            );
            printf ("time c %lldus \n", time_c);
            printf ("time neon %lldus \n", time_neon);
            ne10_log (__FUNCTION__, "IMAGERESIZE%20d%20lld%20lld%19.2f%%%18.2f:1\n", (h * MEM_SIZE + w), time_c, time_neon, 0, 0);

        }
    }
    NE10_FREE (in_c);
    NE10_FREE (in_neon);
    NE10_FREE (out_c);
    NE10_FREE (out_neon);
}

执行结果：

很明显，做图像 resize 时，NEON 版本要比 C 版本的效率高很多。

1.介绍

2.源码获取

3.环境

4.编译和使用Ne10库

猜你喜欢