sobel3*3算子的计算过程

_mm_malloc()

convolve_cols_3x3 列卷积
convolve_101_row_3x3 101的行卷积
convolve_121_row_3x3 121的行卷积

sobel算子的矩阵形式

  void sobel3x3( const uint8_t* in, uint8_t* out_v, uint8_t* out_h, int w, int h ) {
    int16_t* temp_h = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) );
    int16_t* temp_v = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) );    
    detail::convolve_cols_3x3( in, temp_v, temp_h, w, h );
    detail::convolve_101_row_3x3_16bit( temp_v, out_v, w, h );
    detail::convolve_121_row_3x3_16bit( temp_h, out_h, w, h );
    _mm_free( temp_h );
    _mm_free( temp_v );
  }

_mm_malloc()

_mm_malloc()主要是为了内存对齐，运行速度快，这里以16位对齐
详情可以看为什么要使用_mm_malloc？ (与_aligned_malloc,alligned_alloc或posix_memalign相对)
如何实现最高传输速率

convolve_cols_3x3 列卷积

void convolve_cols_3x3( const unsigned char* in, int16_t* out_v, int16_t* out_h, int w, int h ) {
  using namespace std;
  assert( w % 16 == 0 && "width must be multiple of 16!" );
  const int w_chunk  = w/16;
  __m128i* 	i0       = (__m128i*)( in );
  __m128i* 	i1       = (__m128i*)( in ) + w_chunk*1;
  __m128i* 	i2       = (__m128i*)( in ) + w_chunk*2;
  __m128i* result_h  = (__m128i*)( out_h ) + 2*w_chunk;
  __m128i* result_v  = (__m128i*)( out_v ) + 2*w_chunk;
  __m128i* end_input = (__m128i*)( in ) + w_chunk*h;
  for( ; i2 != end_input; i0++, i1++, i2++, result_v+=2, result_h+=2 ) {
    *result_h     = _mm_setzero_si128();
    *(result_h+1) = _mm_setzero_si128();
    *result_v     = _mm_setzero_si128();
    *(result_v+1) = _mm_setzero_si128();
    __m128i ilo, ihi;
    unpack_8bit_to_16bit( *i0, ihi, ilo ); 
    unpack_8bit_to_16bit( *i0, ihi, ilo );
    *result_h     = _mm_add_epi16( ihi, *result_h );
    *(result_h+1) = _mm_add_epi16( ilo, *(result_h+1) );
    *result_v     = _mm_add_epi16( *result_v, ihi );
    *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
    unpack_8bit_to_16bit( *i1, ihi, ilo );
    *result_v     = _mm_add_epi16( *result_v, ihi );
    *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
    *result_v     = _mm_add_epi16( *result_v, ihi );
    *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
    unpack_8bit_to_16bit( *i2, ihi, ilo );
    *result_h     = _mm_sub_epi16( *result_h, ihi );
    *(result_h+1) = _mm_sub_epi16( *(result_h+1), ilo );
    *result_v     = _mm_add_epi16( *result_v, ihi );
    *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
  }
}

convolve_101_row_3x3 101的行卷积

 // convolve image with a (1,0,-1) row vector. Result is accumulated into output.
    // This one works on 16bit input and 8bit output.
    // output is scaled by 1/4, then clamped to [-128,128], and finally shifted to [0,255].
    void convolve_101_row_3x3_16bit( const int16_t* in, uint8_t* out, int w, int h ) {
      assert( w % 16 == 0 && "width must be multiple of 16!" );
      const __m128i*  i0 = (const __m128i*)(in);
      const int16_t* 	i2 = in+2;
      uint8_t* result    = out + 1;
      const int16_t* const end_input = in + w*h;
      const size_t blocked_loops = (w*h-2)/16;
      __m128i offs = _mm_set1_epi16( 128 );
      for( size_t i=0; i != blocked_loops; i++ ) {
        __m128i result_register_lo;
        __m128i result_register_hi;
        __m128i i2_register;

        i2_register = _mm_loadu_si128( (__m128i*)( i2 ) );
        result_register_lo  = *i0;
        result_register_lo  = _mm_sub_epi16( result_register_lo, i2_register );
        result_register_lo  = _mm_srai_epi16( result_register_lo, 2 );
        result_register_lo  = _mm_add_epi16( result_register_lo, offs );
 
        i0 += 1;
        i2 += 8;
        
        i2_register = _mm_loadu_si128( (__m128i*)( i2 ) );
        result_register_hi  = *i0;
        result_register_hi  = _mm_sub_epi16( result_register_hi, i2_register );
        result_register_hi  = _mm_srai_epi16( result_register_hi, 2 );
        result_register_hi  = _mm_add_epi16( result_register_hi, offs );

        i0 += 1;
        i2 += 8;
        
        pack_16bit_to_8bit_saturate( result_register_lo, result_register_hi, result_register_lo );
        _mm_storeu_si128( ((__m128i*)( result )), result_register_lo );

        result += 16;
      }

      for( ; i2 < end_input; i2++, result++) {
        *result = ((*(i2-2) - *i2)>>2)+128;
      }
    }

convolve_121_row_3x3 121的行卷积

 // convolve image with a (1,2,1) row vector. Result is accumulated into output.
    // This one works on 16bit input and 8bit output.
    // output is scaled by 1/4, then clamped to [-128,128], and finally shifted to [0,255].
    void convolve_121_row_3x3_16bit( const int16_t* in, uint8_t* out, int w, int h ) {
      assert( w % 16 == 0 && "width must be multiple of 16!" );
      const __m128i* i0 = (const __m128i*)(in);
      const int16_t* i1 = in+1;
      const int16_t* i2 = in+2;
      uint8_t* result   = out + 1;
      const int16_t* const end_input = in + w*h;
      const size_t blocked_loops = (w*h-2)/16;
      __m128i offs = _mm_set1_epi16( 128 );
      for( size_t i=0; i != blocked_loops; i++ ) {
        __m128i result_register_lo;
        __m128i result_register_hi;
        __m128i i1_register;
        __m128i i2_register;
        
        i1_register        = _mm_loadu_si128( (__m128i*)( i1 ) );
        i2_register        = _mm_loadu_si128( (__m128i*)( i2 ) );
        result_register_lo = *i0;
        i1_register        = _mm_add_epi16( i1_register, i1_register );
        result_register_lo = _mm_add_epi16( i1_register, result_register_lo );
        result_register_lo = _mm_add_epi16( i2_register, result_register_lo );
        result_register_lo = _mm_srai_epi16( result_register_lo, 2 );
        result_register_lo = _mm_add_epi16( result_register_lo, offs );

        i0++;
        i1+=8;
        i2+=8;

        i1_register        = _mm_loadu_si128( (__m128i*)( i1 ) );
        i2_register        = _mm_loadu_si128( (__m128i*)( i2 ) );
        result_register_hi = *i0;
        i1_register        = _mm_add_epi16( i1_register, i1_register );
        result_register_hi = _mm_add_epi16( i1_register, result_register_hi );
        result_register_hi = _mm_add_epi16( i2_register, result_register_hi );
        result_register_hi = _mm_srai_epi16( result_register_hi, 2 );
        result_register_hi = _mm_add_epi16( result_register_hi, offs );

        i0++;
        i1+=8;
        i2+=8;

        pack_16bit_to_8bit_saturate( result_register_lo, result_register_hi, result_register_lo );
        _mm_storeu_si128( ((__m128i*)( result )), result_register_lo );
      
        result += 16;
      }
    }

sobel算子的矩阵形式

$Gx=\begin{bmatrix} 1 & 0 & -1\\ 2 & 0 & -2\\ 1 & 0 & -1 \end{bmatrix} Gy=\begin{bmatrix} 1 & 2 & 1\\ 0 & 0 & 0\\ -1 & -2 & -1 \end{bmatrix}$

Gx相当于先给图像块做列卷积 $\begin{bmatrix} 1\\ 2\\ 1 \end{bmatrix}$ ,再进行行卷积 $\begin{bmatrix} 1 & 0 & -1 \end{bmatrix}$
Gy相当于先给图像块做列卷积 $\begin{bmatrix} 1\\ 0\\ -1 \end{bmatrix}$ ,再进行行卷积 $\begin{bmatrix} 1 & 2& 1 \end{bmatrix}$

elas源码赏析（二）sobel算子3*3行列分解快速卷积