原理可以参看:白马负金羁的《导向滤波(Guided Filter)的解析与实现》 ,写得很清晰透彻
我实现的效果如下
原图(894*1080,由于笔记本屏幕不够大,截图有截断)
单通道的导向滤波
1.CPU 版的 Guided Filter 和 Fast Guided Filter (缩放倍数为 2,何凯明大神加了一个 resize 就成了新算法,大神就是大神!) 的耗时(循环 100 次取平均):
2.基于 OpenCV CUDA 编译库实现的 Guided Filter (实现了多流覆盖)的耗时
可见 OpenCV 实现的 CUDA 函数其实性能也是有很大优化空间的。由于 GPU 开辟设备内存相当耗时,一般 GPU 内存是重用的(比如视频流),上图的时间是 Guided Filter 循环 100 次计算的平均值。
3. CUDA 加速的 Guided Filter 耗时:
同上,所以内存开辟的时间只记录一次,Guided Filter 是循环 100 次计算的平均值。可见设备内存开辟的时间还是很多的,所以如果只处理少量图片,GPU 并不占优势,适合采用 CPU 版的 Fast Guided Filter 。
如果引导图像和原图是同一张图的话,还可以进一步减少计算时间,避免重复计算。
部分代码如下:
int main()
{
if (cuda::getCudaEnabledDeviceCount() == 0)
{
cerr << "此OpenCV编译的时候没有启用CUDA模块" << endl;
return -1;
}
cv::Mat srcI = cv::imread("./lady.jpg", 0);
cv::Mat srcP = cv::imread("./lady.jpg", 0);
const int cols = 1080.f / srcI.rows * srcI.cols;
const int rows = 1080;
cv::Mat origI, origP;
cv::resize(srcI, origI, cv::Size(cols, rows));
cv::resize(srcP, origP, cv::Size(cols, rows));
Timer t;
cudaStream_t stream[5];
for (int i = 0; i < 5; ++i)
cudaStreamCreate(&stream[i]);
// I
cuda::GpuMat devOrigI(rows, cols, CV_8UC1);
cuda::GpuMat devIntegralI(rows, cols, CV_32FC1);
cuda::GpuMat devCountI(rows, cols, CV_32SC1, cv::Scalar(1));
cuda::GpuMat devMeanI(rows, cols, CV_32FC1);
// P
cuda::GpuMat devOrigP(rows, cols, CV_8UC1);
cuda::GpuMat devIntegralP(rows, cols, CV_32FC1);
cuda::GpuMat devMeanP(rows, cols, CV_32FC1);
// I*I
cuda::GpuMat devIntegralII(rows, cols, CV_32FC1);
cuda::GpuMat devMeanII(rows, cols, CV_32FC1);
// P*P
cuda::GpuMat devIntegralPP(rows, cols, CV_32FC1);
cuda::GpuMat devMeanPP(rows, cols, CV_32FC1);
// I*P
cuda::GpuMat devIntegralIP(rows, cols, CV_32FC1);
cuda::GpuMat devMeanIP(rows, cols, CV_32FC1);
cuda::GpuMat devA(rows, cols, CV_32FC1);
cuda::GpuMat devMeanA(rows, cols, CV_32FC1);
cuda::GpuMat devB(rows, cols, CV_32FC1);
cuda::GpuMat devMeanB(rows, cols, CV_32FC1);
std::cout << "Mem init(ms): " << t.elapsed() << std::endl;
const int N = 100; // 100
int count = N;
const int radius = 2 * 10 + 1; // 窗口半径
const float eps = 1e-2;
t.restart();
do
{
devOrigI.upload(origI); // 可以考虑异步
devOrigP.upload(origP);
devOrigI.convertTo(devIntegralI, CV_32FC1, 1.f / 255);
devOrigP.convertTo(devIntegralP, CV_32FC1, 1.f / 255);
devCountI.setTo(cv::Scalar(1)); // 要是没有循环可以删去
// ----------------------------------------------------------------
// I*I P*P I*P
mul(devIntegralI, devIntegralI, devIntegralII, stream[0]);
mul(devIntegralP, devIntegralP, devIntegralPP, stream[1]);
mul(devIntegralI, devIntegralP, devIntegralIP, stream[2]);
//cudaStreamSynchronize()//同步单个流:等待该流上的命令都完成
//cudaDeviceSynchronize()//同步所有流同步:等待整个设备上流都完成
checkCudaErrors(cudaStreamSynchronize(stream[0]));
checkCudaErrors(cudaStreamSynchronize(stream[1]));
checkCudaErrors(cudaStreamSynchronize(stream[2]));
// ----------------------------------------------------------------
// I P I*I I*P P*P 的积分图
rowAdd(devIntegralI, devCountI, stream[0]);
rowAdd(devIntegralP, stream[1]);
rowAdd(devIntegralII, stream[2]);
rowAdd(devIntegralIP, stream[3]);
rowAdd(devIntegralPP, stream[4]);
checkCudaErrors(cudaStreamSynchronize(stream[0]));
checkCudaErrors(cudaStreamSynchronize(stream[1]));
checkCudaErrors(cudaStreamSynchronize(stream[2]));
checkCudaErrors(cudaStreamSynchronize(stream[3]));
checkCudaErrors(cudaStreamSynchronize(stream[4]));
colAdd(devIntegralI, devCountI, stream[0]);
colAdd(devIntegralP, stream[1]);
colAdd(devIntegralII, stream[2]);
colAdd(devIntegralIP, stream[3]);
colAdd(devIntegralPP, stream[4]);
checkCudaErrors(cudaStreamSynchronize(stream[0]));
checkCudaErrors(cudaStreamSynchronize(stream[1]));
checkCudaErrors(cudaStreamSynchronize(stream[2]));
checkCudaErrors(cudaStreamSynchronize(stream[3]));
checkCudaErrors(cudaStreamSynchronize(stream[4]));
// ----------------------------------------------------------------
// I P II IP PP 的均值
boxFilter<float>(devMeanI, devIntegralI, devCountI, radius, stream[0]);
boxFilter<float>(devMeanP, devIntegralP, devCountI, radius, stream[1]);
boxFilter<float>(devMeanII, devIntegralII, devCountI, radius, stream[2]);
boxFilter<float>(devMeanIP, devIntegralIP, devCountI, radius, stream[3]);
boxFilter<float>(devMeanPP, devIntegralPP, devCountI, radius, stream[4]);
checkCudaErrors(cudaStreamSynchronize(stream[0]));
checkCudaErrors(cudaStreamSynchronize(stream[1]));
checkCudaErrors(cudaStreamSynchronize(stream[2]));
checkCudaErrors(cudaStreamSynchronize(stream[3]));
checkCudaErrors(cudaStreamSynchronize(stream[4]));
// ----------------------------------------------------------------
// 计算 a, b
calcAB(devMeanI, devMeanP, devMeanII, devMeanIP, eps, devA, devB);
checkCudaErrors(cudaDeviceSynchronize());
// ----------------------------------------------------------------
// 计算 a, b 均值
rowAdd(devA, stream[0]);
rowAdd(devB, stream[1]);
checkCudaErrors(cudaStreamSynchronize(stream[0]));
checkCudaErrors(cudaStreamSynchronize(stream[1]));
colAdd(devA, stream[0]);
colAdd(devB, stream[1]);
checkCudaErrors(cudaStreamSynchronize(stream[0]));
checkCudaErrors(cudaStreamSynchronize(stream[1]));
boxFilter<float>(devMeanA, devA, devCountI, radius, stream[0]);
boxFilter<float>(devMeanB, devB, devCountI, radius, stream[1]);
checkCudaErrors(cudaStreamSynchronize(stream[0]));
checkCudaErrors(cudaStreamSynchronize(stream[1]));
// ----------------------------------------------------------------
// a * I + b
calcAMultiplyIPlusB(devMeanA, devMeanB, devOrigI, devOrigP); // 把 devOrigP 重复利用当做输出
checkCudaErrors(cudaDeviceSynchronize());
// ----------------------------------------------------------------
}
while (count--);
// ----------------------------------------------------------------
std::cout << "Guided Filter(ms): " << t.elapsed() / (N + 1) << std::endl;
// 从显存把数据下载到内存
Mat retI, retP;
devOrigP.download(retI);
// 显示
imshow("ResultI", retI);
imshow("OrigI", origI);
waitKey(0);
return 0;
}
ps: 如需源码请私信