sobel3*3算子的计算过程
// Computes both 3x3 Sobel responses of an 8-bit, single-channel, row-major
// image of w*h pixels.
//
// The filter is applied separably: one pass over the columns produces two
// 16-bit intermediate images, then a row pass turns each of them into an
// 8-bit result.
//
//   in    - source image, w*h bytes.
//   out_v - receives the (1,2,1) column sum followed by the (1,0,-1) row
//           difference, i.e. the horizontal derivative.
//   out_h - receives the (1,0,-1) column difference followed by the (1,2,1)
//           row sum, i.e. the vertical derivative.
//   w, h  - image size in pixels; the helpers assert that w is a multiple
//           of 16.
//
// Nothing is written when the image has fewer than three rows (no pixel has
// a complete 3x3 neighbourhood) or when a temporary buffer cannot be
// allocated.
void sobel3x3( const uint8_t* in, uint8_t* out_v, uint8_t* out_h, int w, int h ) {
  if( w <= 0 || h < 3 ) {
    return;
  }

  // Size the buffers in size_t so that large images cannot overflow int.
  const size_t row_len = static_cast<size_t>( w );
  const size_t pixel_count = row_len * static_cast<size_t>( h );
  const size_t buffer_bytes = pixel_count * sizeof( int16_t );

  int16_t* temp_h = static_cast<int16_t*>( _mm_malloc( buffer_bytes, 16 ) );
  int16_t* temp_v = static_cast<int16_t*>( _mm_malloc( buffer_bytes, 16 ) );

  if( temp_h && temp_v ) {
    // The column pass only fills rows 1..h-2 of the temporaries, while the
    // row passes read every element. Clear the two border rows so that no
    // uninitialised memory leaks into the output.
    const size_t last_row = pixel_count - row_len;
    for( size_t x = 0; x != row_len; ++x ) {
      temp_h[x] = 0;
      temp_v[x] = 0;
      temp_h[last_row + x] = 0;
      temp_v[last_row + x] = 0;
    }

    detail::convolve_cols_3x3( in, temp_v, temp_h, w, h );
    detail::convolve_101_row_3x3_16bit( temp_v, out_v, w, h );
    detail::convolve_121_row_3x3_16bit( temp_h, out_h, w, h );
  }

  // Release whichever buffers were actually obtained.
  if( temp_h ) {
    _mm_free( temp_h );
  }
  if( temp_v ) {
    _mm_free( temp_v );
  }
}
_mm_malloc()
_mm_malloc() 主要用于分配按指定边界对齐的内存,对齐后的数据可以用 SSE 的对齐加载/存储指令访问,运行速度更快;这里按 16 字节对齐
详情可以看:为什么要使用 _mm_malloc?(与 _aligned_malloc、aligned_alloc 或 posix_memalign 相比)
如何实现最高传输速率
convolve_cols_3x3 列卷积
// Column pass of the separable 3x3 Sobel filter.
//
// For every image row y in [1, h-2] and every column x this writes, at
// position (x, y) of the 16-bit outputs:
//   out_v = in(x, y-1) + 2*in(x, y) + in(x, y+1)   // (1,2,1) column kernel
//   out_h = in(x, y-1) - in(x, y+1)                // (1,0,-1) column kernel
// Rows 0 and h-1 of both outputs are left untouched.
//
//   in           - 8-bit source image, w*h bytes, row-major.
//   out_v, out_h - 16-bit destination images of w*h elements each.
//   w            - width in pixels; must be a multiple of 16 because one
//                  iteration consumes 16 pixels.
//   h            - height in pixels; nothing is written when h < 3.
void convolve_cols_3x3( const unsigned char* in, int16_t* out_v, int16_t* out_h, int w, int h ) {
  assert( w % 16 == 0 && "width must be multiple of 16!" );

  // With fewer than three rows there is no row that has both neighbours.
  // Without this guard the bottom-row pointer would start beyond the end
  // marker and the != loop test below would never become false.
  if( w <= 0 || h < 3 ) {
    return;
  }

  const int w_chunk = w/16;                      // 16-pixel groups per row
  const __m128i* i0 = (const __m128i*)( in );    // row y-1
  const __m128i* i1 = i0 + w_chunk;              // row y
  const __m128i* i2 = i0 + w_chunk*2;            // row y+1
  // 16 input pixels expand to two registers of eight 16-bit values, so one
  // output row spans 2*w_chunk registers; start at output row 1.
  __m128i* result_h = (__m128i*)( out_h ) + 2*w_chunk;
  __m128i* result_v = (__m128i*)( out_v ) + 2*w_chunk;
  const __m128i* const end_input = i0 + w_chunk*h;

  for( ; i2 != end_input; i0++, i1++, i2++, result_v+=2, result_h+=2 ) {
    // Unaligned loads/stores keep the function usable with buffers that are
    // not 16-byte aligned.
    __m128i top = _mm_loadu_si128( i0 );
    __m128i mid = _mm_loadu_si128( i1 );
    __m128i bot = _mm_loadu_si128( i2 );

    // Widen each group of 16 bytes into two registers of 16-bit values. The
    // two halves are kept in the order the helper returns them, which is the
    // order in which they are stored.
    __m128i top_a, top_b, mid_a, mid_b, bot_a, bot_b;
    unpack_8bit_to_16bit( top, top_a, top_b );
    unpack_8bit_to_16bit( mid, mid_a, mid_b );
    unpack_8bit_to_16bit( bot, bot_a, bot_b );

    // (1,0,-1): top - bottom
    _mm_storeu_si128( result_h,   _mm_sub_epi16( top_a, bot_a ) );
    _mm_storeu_si128( result_h+1, _mm_sub_epi16( top_b, bot_b ) );

    // (1,2,1): top + mid + mid + bottom
    const __m128i sum_a = _mm_add_epi16( _mm_add_epi16( top_a, mid_a ),
                                         _mm_add_epi16( mid_a, bot_a ) );
    const __m128i sum_b = _mm_add_epi16( _mm_add_epi16( top_b, mid_b ),
                                         _mm_add_epi16( mid_b, bot_b ) );
    _mm_storeu_si128( result_v,   sum_a );
    _mm_storeu_si128( result_v+1, sum_b );
  }
}
convolve_101_row_3x3 101的行卷积
// Applies a (1,0,-1) row kernel to a 16-bit image and stores an 8-bit result.
//
// The image is processed as one flat array of w*h elements. For every index
// j in [1, w*h-2]:
//   out[j] = saturate_to_uint8( ((in[j-1] - in[j+1]) >> 2) + 128 )
// i.e. the difference is scaled by 1/4 and biased by 128 so that zero maps to
// mid-grey. out[0] and out[w*h-1] are not written. Because rows are not
// treated separately, the first and last column of each row mix values from
// the neighbouring row.
//
//   in  - 16-bit source, w*h elements.
//   out - 8-bit destination, w*h bytes.
//   w   - width; must be a multiple of 16.
//   h   - height.
void convolve_101_row_3x3_16bit( const int16_t* in, uint8_t* out, int w, int h ) {
  assert( w % 16 == 0 && "width must be multiple of 16!" );
  if( w <= 0 || h <= 0 ) {
    return;
  }

  // Count elements in size_t so that large images cannot overflow int.
  const size_t total = static_cast<size_t>( w ) * static_cast<size_t>( h );
  if( total < 3 ) {
    return;   // no element has both a left and a right neighbour
  }

  const int16_t* left = in;         // in[j-1]
  const int16_t* right = in + 2;    // in[j+1]
  uint8_t* result = out + 1;        // out[j]
  const int16_t* const end_input = in + total;
  // Every block yields 16 outputs; the remaining outputs are handled by the
  // scalar loop at the bottom.
  const size_t blocked_loops = ( total - 2 ) / 16;
  const __m128i offs = _mm_set1_epi16( 128 );

  for( size_t i = 0; i != blocked_loops; ++i ) {
    // first eight outputs of the block
    __m128i lo = _mm_sub_epi16( _mm_loadu_si128( (const __m128i*)( left ) ),
                                _mm_loadu_si128( (const __m128i*)( right ) ) );
    lo = _mm_add_epi16( _mm_srai_epi16( lo, 2 ), offs );
    left += 8;
    right += 8;

    // second eight outputs of the block
    __m128i hi = _mm_sub_epi16( _mm_loadu_si128( (const __m128i*)( left ) ),
                                _mm_loadu_si128( (const __m128i*)( right ) ) );
    hi = _mm_add_epi16( _mm_srai_epi16( hi, 2 ), offs );
    left += 8;
    right += 8;

    __m128i packed;
    pack_16bit_to_8bit_saturate( lo, hi, packed );
    _mm_storeu_si128( (__m128i*)( result ), packed );
    result += 16;
  }

  // Scalar tail. The value is clamped before the store so that the tail
  // agrees with the saturating pack of the vector path instead of wrapping
  // around on conversion to uint8_t.
  // NOTE(review): assumes pack_16bit_to_8bit_saturate saturates to the
  // unsigned range [0,255] - confirm against its definition.
  for( ; right < end_input; ++left, ++right, ++result ) {
    int value = ( ( *left - *right ) >> 2 ) + 128;
    if( value < 0 ) {
      value = 0;
    } else if( value > 255 ) {
      value = 255;
    }
    *result = static_cast<uint8_t>( value );
  }
}
convolve_121_row_3x3 121的行卷积
// Applies a (1,2,1) row kernel to a 16-bit image and stores an 8-bit result.
//
// The image is processed as one flat array of w*h elements. For every index
// j in [1, w*h-2]:
//   out[j] = saturate_to_uint8( ((in[j-1] + 2*in[j] + in[j+1]) >> 2) + 128 )
// i.e. the weighted sum is scaled by 1/4 and biased by 128 so that zero maps
// to mid-grey. out[0] and out[w*h-1] are not written. Because rows are not
// treated separately, the first and last column of each row mix values from
// the neighbouring row.
//
//   in  - 16-bit source, w*h elements.
//   out - 8-bit destination, w*h bytes.
//   w   - width; must be a multiple of 16.
//   h   - height.
void convolve_121_row_3x3_16bit( const int16_t* in, uint8_t* out, int w, int h ) {
  assert( w % 16 == 0 && "width must be multiple of 16!" );
  if( w <= 0 || h <= 0 ) {
    return;
  }

  // Count elements in size_t so that large images cannot overflow int.
  const size_t total = static_cast<size_t>( w ) * static_cast<size_t>( h );
  if( total < 3 ) {
    return;   // no element has both a left and a right neighbour
  }

  const int16_t* i0 = in;           // in[j-1]
  const int16_t* i1 = in + 1;       // in[j]
  const int16_t* i2 = in + 2;       // in[j+1]
  uint8_t* result = out + 1;        // out[j]
  const int16_t* const end_input = in + total;
  // Every block yields 16 outputs; the remaining outputs are handled by the
  // scalar loop at the bottom.
  const size_t blocked_loops = ( total - 2 ) / 16;
  const __m128i offs = _mm_set1_epi16( 128 );

  for( size_t i = 0; i != blocked_loops; ++i ) {
    // first eight outputs of the block
    __m128i centre = _mm_loadu_si128( (const __m128i*)( i1 ) );
    __m128i lo = _mm_add_epi16( _mm_loadu_si128( (const __m128i*)( i0 ) ),
                                _mm_loadu_si128( (const __m128i*)( i2 ) ) );
    lo = _mm_add_epi16( lo, _mm_add_epi16( centre, centre ) );
    lo = _mm_add_epi16( _mm_srai_epi16( lo, 2 ), offs );
    i0 += 8;
    i1 += 8;
    i2 += 8;

    // second eight outputs of the block
    centre = _mm_loadu_si128( (const __m128i*)( i1 ) );
    __m128i hi = _mm_add_epi16( _mm_loadu_si128( (const __m128i*)( i0 ) ),
                                _mm_loadu_si128( (const __m128i*)( i2 ) ) );
    hi = _mm_add_epi16( hi, _mm_add_epi16( centre, centre ) );
    hi = _mm_add_epi16( _mm_srai_epi16( hi, 2 ), offs );
    i0 += 8;
    i1 += 8;
    i2 += 8;

    __m128i packed;
    pack_16bit_to_8bit_saturate( lo, hi, packed );
    _mm_storeu_si128( (__m128i*)( result ), packed );
    result += 16;
  }

  // Scalar tail: the vector loop stops one block short of the end, so the
  // remaining outputs are produced here; without this loop they would never
  // be written. The value is clamped so that it agrees with the saturating
  // pack of the vector path.
  // NOTE(review): assumes pack_16bit_to_8bit_saturate saturates to the
  // unsigned range [0,255] - confirm against its definition.
  for( ; i2 < end_input; ++i0, ++i1, ++i2, ++result ) {
    int value = ( ( *i0 + 2 * *i1 + *i2 ) >> 2 ) + 128;
    if( value < 0 ) {
      value = 0;
    } else if( value > 255 ) {
      value = 255;
    }
    *result = static_cast<uint8_t>( value );
  }
}
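Sobel 算子的矩阵形式(符号约定与上面的代码一致):
$$G_x = \begin{bmatrix} 1 & 0 & -1 \\ 2 & 0 & -2 \\ 1 & 0 & -1 \end{bmatrix}, \qquad G_y = \begin{bmatrix} 1 & 2 & 1 \\ 0 & 0 & 0 \\ -1 & -2 & -1 \end{bmatrix}$$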
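Gx 相当于先给图像块做列卷积 $\begin{bmatrix} 1 & 2 & 1 \end{bmatrix}^{T}$,再进行行卷积 $\begin{bmatrix} 1 & 0 & -1 \end{bmatrix}$(对应代码中的 out_v)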
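Gy 相当于先给图像块做列卷积 $\begin{bmatrix} 1 & 0 & -1 \end{bmatrix}^{T}$,再进行行卷积 $\begin{bmatrix} 1 & 2 & 1 \end{bmatrix}$(对应代码中的 out_h)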