CUDA编程：SSD的priorbox层

SSD中的priorbox层的实现：

// Stores one prior box for the current cell, if there is still room for it.
//
// The box is written as (xmin, ymin, xmax, ymax), each divided by the image
// size, at cell_out[4 * written .. 4 * written + 3], and `written` is then
// incremented. Nothing is stored once `written` has reached `num_priors`, so a
// cell can never spill into the output slots of the next cell. When `clip` is
// true every coordinate is clamped to [0, 1] before it is stored.
static __device__ __forceinline__ void PriKernelStoreBox(
    float *cell_out, int &written, const int num_priors,
    const float center_x, const float center_y,
    const float box_width, const float box_height,
    const float img_width, const float img_height, const bool clip)
{
  if (written >= num_priors) {
    return;
  }

  float xmin = (center_x - box_width * 0.5f) / img_width;
  float ymin = (center_y - box_height * 0.5f) / img_height;
  float xmax = (center_x + box_width * 0.5f) / img_width;
  float ymax = (center_y + box_height * 0.5f) / img_height;

  if (clip) {
    xmin = fminf(fmaxf(xmin, 0.0f), 1.0f);
    ymin = fminf(fmaxf(ymin, 0.0f), 1.0f);
    xmax = fminf(fmaxf(xmax, 0.0f), 1.0f);
    ymax = fminf(fmaxf(ymax, 0.0f), 1.0f);
  }

  float *out = cell_out + 4 * written;
  out[0] = xmin;
  out[1] = ymin;
  out[2] = xmax;
  out[3] = ymax;
  ++written;
}

// Generates the SSD prior (default) boxes for every cell of a feature map.
//
// Launch layout: 1D grid of 1D blocks; only the .x components are used. Any
// grid/block size is valid because each thread walks the cells with a stride
// of gridDim.x * blockDim.x. No shared memory is used.
//
// Parameters:
//   top_data      output buffer; cell (h, w) owns the num_priors * 4 floats
//                 starting at (h * layer_width + w) * num_priors * 4.
//   layer_height, layer_width   feature-map size in cells.
//   img_height, img_width       image size used to normalize the coordinates.
//   step_w, step_h              multiplier from cell index to image coordinate.
//   offset        added to the cell index before multiplying by the step.
//                 NOTE(review): declared `int`, so a fractional offset such as
//                 0.5 is truncated when the kernel is launched -- confirm
//                 whether this parameter should be `float`.
//   min_sizes / min_sizes_len   minimum box sizes, read by the kernel.
//   max_sizes / max_sizes_len   maximum box sizes; entry s is paired with
//                 min_sizes[s]. May be empty (max_sizes_len == 0).
//   aspts / aspts_len           aspect ratios; ratios equal to 1 are skipped
//                 because the square box is always emitted first.
//   clip          non-zero clamps every output coordinate to [0, 1].
//   num_priors    number of boxes reserved per cell in top_data.
//
// For each cell and each min size the boxes are emitted in this order:
// the square box of side min_size, then (when a paired max size exists) the
// square box of side sqrt(min_size * max_size), then one box per non-unit
// aspect ratio with width min_size * sqrt(ar) and height min_size / sqrt(ar).
__global__ void PriKernel(float *top_data,const int layer_height,const int layer_width,const int img_height, const int img_width,const float step_w,const float step_h, const int offset, float *min_sizes,const int min_sizes_len,float*max_sizes,const int max_sizes_len,float *aspts,const int aspts_len, const int clip,const int num_priors)
{
  const int num_cells = layer_height * layer_width;
  const int stride = gridDim.x * blockDim.x;
  const float img_w = static_cast<float>(img_width);
  const float img_h = static_cast<float>(img_height);
  const bool do_clip = (clip != 0);

  // The loop condition doubles as the bounds guard for surplus threads.
  for (int cell = blockIdx.x * blockDim.x + threadIdx.x; cell < num_cells; cell += stride) {
    const int h = cell / layer_width;
    const int w = cell - h * layer_width;

    // Map the feature-map cell back to image coordinates.
    const float center_x = (w + offset) * step_w;
    const float center_y = (h + offset) * step_h;

    float *cell_out = top_data + static_cast<size_t>(cell) * num_priors * 4;
    int written = 0;

    for (int s = 0; s < min_sizes_len; ++s) {
      // The size is truncated to an integer, as in the code this was ported from.
      const int min_size = static_cast<int>(min_sizes[s]);

      // First prior: aspect ratio 1, side = min_size.
      PriKernelStoreBox(cell_out, written, num_priors, center_x, center_y,
                        static_cast<float>(min_size), static_cast<float>(min_size),
                        img_w, img_h, do_clip);

      // Second prior: aspect ratio 1, side = sqrt(min_size * max_size).
      // Guarding on s (instead of asserting equal lengths) keeps the read of
      // max_sizes in bounds when the two arrays differ in length.
      if (s < max_sizes_len) {
        const int max_size = static_cast<int>(max_sizes[s]);
        const float side = sqrtf(static_cast<float>(min_size) * static_cast<float>(max_size));
        PriKernelStoreBox(cell_out, written, num_priors, center_x, center_y,
                          side, side, img_w, img_h, do_clip);
      }

      // Remaining priors: one per aspect ratio other than 1.
      for (int r = 0; r < aspts_len; ++r) {
        const float ar = aspts[r];
        if (fabsf(ar - 1.0f) < 1e-6f) {
          continue;
        }
        const float root = sqrtf(ar);
        PriKernelStoreBox(cell_out, written, num_priors, center_x, center_y,
                          min_size * root, min_size / root,
                          img_w, img_h, do_clip);
      }
    }
  }
}

并行设置方差（variance）：适当地修改一下参数，就可以实现GPU上的并行；

// Fills the output buffer with the per-coordinate variance values.
//
// Launch layout: 1D grid of 1D blocks; only the .x components are used. Any
// grid/block size is valid because each thread walks the output elements with
// a stride of gridDim.x * blockDim.x. No shared memory is used.
//
// Parameters:
//   top_data      output buffer of layer_height * layer_width * num_priors * 4
//                 floats; element i receives the variance for coordinate i % 4.
//   layer_height, layer_width   feature-map size in cells.
//   variance / variance_len     variance values. With 4 or more entries,
//                 coordinate j uses variance[j]; with fewer than 4, variance[0]
//                 is used for every coordinate so the read never goes out of
//                 bounds. With no entries the kernel writes nothing.
//   num_priors    number of boxes per cell.
__global__ void PriKernel2(float *top_data,const int layer_height,const int layer_width,
float *variance,const int variance_len,const int num_priors)
{
  if (variance_len <= 0) {
    return;
  }

  const size_t total = static_cast<size_t>(layer_height) * layer_width * num_priors * 4;
  const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
  const bool per_coordinate = (variance_len >= 4);

  // The loop condition doubles as the bounds guard for surplus threads.
  for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x; i < total; i += stride) {
    const int j = static_cast<int>(i & 3);  // coordinate index within the box
    top_data[i] = variance[per_coordinate ? j : 0];
  }
}

第三步，就是把本地的数据copy到GPU上；在Caffe中，如果某一层实现了GPU版本的代码，则默认使用GPU版本的代码；

CUDA编程：SSD的priorbox层

猜你喜欢