在neon_programmers_guide中有个Swapping color channels示例,展示了neon技术的加速效果。
这里我编程实现了下,观察下加速效果,在两个平台上做对比试验。
| 平台 | 普通arm | neon加速 |
| --- | --- | --- |
| nvidia tk1 | | |
| raspberry pi | | |
1 #include <stdlib.h> 2 #include <memory.h> 3 #include <fstream> 4 #include <sys/time.h> 5 6 #include "/usr/lib/gcc-cross/arm-linux-gnueabihf/5/include/arm_neon.h" 7 8 using namespace std; 9 10 int main(int argc, char **argv) 11 { 12 int width = 1920; 13 int height = 1080; 14 15 int pixel_number = width * height; 16 int image_size = width * height * 3; 17 18 struct timeval tstart, tend; 19 20 unsigned char *rgb_buffer = (unsigned char *)malloc(image_size); 21 unsigned char *rgb_buffer_result = (unsigned char *)malloc(image_size); 22 23 for (int i = 0; i < pixel_number; i++) 24 { 25 rgb_buffer[3 * i + 0] = 1; 26 rgb_buffer[3 * i + 1] = 2; 27 rgb_buffer[3 * i + 2] = 3; 28 } 29 30 fstream wfile("./origin.dat", ios::binary | ios::out); 31 wfile.write((char *)rgb_buffer, image_size); 32 wfile.close(); 33 34 gettimeofday(&tstart, NULL); 35 36 for (int i = 0; i < pixel_number; i++) 37 { 38 rgb_buffer_result[i] = rgb_buffer[i * 3]; 39 rgb_buffer_result[i + pixel_number] = rgb_buffer[i * 3 + 1]; 40 rgb_buffer_result[i + 2 * pixel_number] = rgb_buffer[i * 3 + 2]; 41 } 42 43 gettimeofday(&tend, NULL); 44 int timeuse = (1000000 * (tend.tv_sec - tstart.tv_sec) + (tend.tv_usec - tstart.tv_usec)) / 1000; 45 printf("tk1 de_inter with arm (1920*1080) cost time(ms) = %d\n", timeuse); 46 47 fstream wfile1("./arm.dat", ios::binary | ios::out); 48 wfile1.write((char *)rgb_buffer_result, image_size); 49 wfile1.close(); 50 51 memset(rgb_buffer_result, 0 , image_size); 52 53 uint8x16x3_t neon_1; 54 55 gettimeofday(&tstart, NULL); 56 57 for (int i = 0; i < image_size / 48; i++) 58 { 59 neon_1 = vld3q_u8((uint8_t *)rgb_buffer + i * 48); 60 vst1q_u8(rgb_buffer_result + i * 16, neon_1.val[0]); 61 vst1q_u8(rgb_buffer_result + pixel_number + i * 16, neon_1.val[1]); 62 vst1q_u8(rgb_buffer_result + 2 * pixel_number + i * 16, neon_1.val[2]); 63 } 64 65 gettimeofday(&tend, NULL); 66 timeuse = (1000000 * (tend.tv_sec - tstart.tv_sec) + (tend.tv_usec - tstart.tv_usec)) / 1000; 67 printf("tk1 de_inter 
with neon (1920*1080) cost time(ms) = %d\n", timeuse); 68 69 fstream wfile2("./neon.dat", ios::binary | ios::out); 70 wfile2.write((char *)rgb_buffer_result, image_size); 71 wfile2.close(); 72 73 return 0; 74 }
// Makefile
arm-linux-gnueabihf-g++ de_inter.cpp -o de_inter.out -mfpu=neon -mfloat-abi=hard
tk1 de_inter with arm (1920*1080) cost time(ms) = 43
tk1 de_inter with neon (1920*1080) cost time(ms) = 15