SVM中数据缩放(scale)源码理解

在进行 SVM 训练时，有一个参数 svm_scaling scaling，用于对数据进行归一化（缩放）操作，其原理在《A Practical Guide to Support Vector Classification》一文中有介绍（https://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf）。下面汇总缩放因子的生成、加载，以及对测试数据施加与训练数据相同缩放的源码实现。


//svm_wrapper.cpp
//生成缩放因子

void
pcl::SVMTrain::scaleFactors (std::vector<SVMData> training_set, svm_scaling &scaling)
{
  int max = 0;

  for (size_t i = 0; i < training_set.size() ; i++)
    for (size_t j = 0; j < training_set[i].SV.size() ; j++)
      if (training_set[i].SV[j].idx > max)
	max = training_set[i].SV[j].idx; // max number of features

  max += 1;

  scaling.obj = Malloc (struct svm_node, max + 1);
  scaling.max = max;
  scaling.obj[max].index = -1; // last index is -1

  for (int i = 0; i < max; i++) // Initialize values,scaling大小为最大特征向量
  {
    scaling.obj[i].index = 0;
    scaling.obj[i].value = 0;
  }

  for (size_t i = 0; i < training_set.size(); i++)
    for (size_t j = 0; j < training_set[i].SV.size(); j++)
      // save scaling factor finding the maximum value,保存每个idx位置对应的最大特征值为缩放尺度因子
      if (module (training_set[i].SV[j].value) > scaling.obj[ training_set[i].SV[j].idx ].value)
      {
	scaling.obj[ training_set[i].SV[j].idx ].index = 1;
	scaling.obj[ training_set[i].SV[j].idx ].value = module (training_set[i].SV[j].value);
      }
};

//svm.cpp
//加载缩放因子


// Parses the remainder of a "scaling" line of the model file: a list of
// `index:value` pairs separated by spaces/tabs. The result is stored in
// model->scaling as an array indexed by feature index, where listed features
// get index == 1 and their factor, skipped features get index == 0 and
// value == 0, and the element after the last listed feature has index == -1.
if (res > 0 && strcmp (cmd, "scaling") == 0)
    {
      char *idx, *val, buff[10000];
      int ii = 0, pre_ii = 0;
      //char delims[]="\t: ";
      model->scaling = Malloc (struct svm_node, 1);

      // Start from an empty string so that a failed read is parsed as
      // "no scaling factors" instead of tokenising uninitialised memory.
      buff[0] = '\0';

      // The field width does not count the terminating NUL, so it must be
      // one less than the buffer size.
      res = fscanf (fp, "%9999[^\n]", buff);
      idx = strtok (buff, ":");

      while (idx != NULL)
      {
        val = strtok (NULL, " \t");
        pre_ii = ii; // first slot that has not been written yet
        ii = atoi (idx);

        // A missing value, or an index that is negative or goes backwards,
        // would lead to atof (NULL) or to an out-of-range write: stop here
        // and keep what has been parsed so far.
        if (val == NULL || ii < pre_ii)
        {
          fprintf (stderr, "malformed scaling entry in model file: [%s]\n", idx);
          ii = pre_ii;
          break;
        }

        // Room for slots 0..ii plus the terminator.
        model->scaling = Realloc (model->scaling, struct svm_node, ii + 2);

        // Zero every slot skipped between the previous entry and this one.
        // pre_ii is already one past the previous entry, so the gap starts
        // exactly at pre_ii.
        for (int j = pre_ii; j < ii; j++)
        {
          model->scaling[j].index = 0;
          model->scaling[j].value = 0;
        }

        model->scaling[ii].index = 1;

        model->scaling[ii].value = atof (val);
        ++ii;
        idx = strtok (NULL, ":");
        //printf("%d e %f\n",model->scaling[ii-1].index,model->scaling[ii-1].value);
      }

      // Terminator right after the last slot written.
      model->scaling[ii].index = -1;
    }


//svm_wrapper.cpp
//对测试数据进行与训练数据同样的缩放

// Applies previously computed scaling factors to a problem in place.
//
// input:   problem whose input.l rows (input.x[i]) are each terminated by an
//          element with index == -1; element values are overwritten.
// scaling: factor table; scaling.obj[k] is used only when k < scaling.max and
//          the slot is flagged with index == 1.
//
// An element whose index is outside the table, or whose slot is not flagged,
// is left untouched. Asserts that scaling.max is non-zero.
void
pcl::SVMClassify::scaleProblem (svm_problem &input, svm_scaling scaling)
{
  assert (scaling.max != 0);

  for (int row = 0; row < input.l; ++row)
  {
    // Walk the row until its -1 terminator.
    for (int col = 0; input.x[row][col].index != -1; ++col)
    {
      const int feature = input.x[row][col].index;

      // No usable factor for this feature: keep the value as it is.
      if (feature >= scaling.max || scaling.obj[feature].index != 1)
        continue;

      input.x[row][col].value /= scaling.obj[feature].value;
    }
  }
}

猜你喜欢

转载自blog.csdn.net/qq_25244255/article/details/86624157