Optimized code of guided filter (reduce by 20 percent)

t-taniai · Oct 13, 2017 · 3ae369c · 3ae369c
1 parent ff87db6
commit 3ae369c
Show file tree

Hide file tree

Showing 8 changed files with 166 additions and 35 deletions.
diff --git a/.gitignore b/.gitignore
@@ -11,5 +11,5 @@ x64/Release/LocalExpansionStereo.iobj
 *.idb
 LocalExpansionStereo/results/
 results/
-data/MiddV3/Adirondack/
-data/MiddV3/Vintage/
+data/MiddV3/trainingH/Adirondack
+data/MiddV3/trainingH/Vintage
diff --git a/LocalExpansionStereo/CostVolumeEnergy.h b/LocalExpansionStereo/CostVolumeEnergy.h
@@ -66,8 +66,13 @@ class CostVolumeEnergy :
 		}
 		else if (params.filterName == "GF")
 		{
-			filter[0] = std::make_unique<FastGuidedImageFilter<double>>(imL, params.windR / 2, params.filter_param1, 1.0/255);
-			filter[1] = std::make_unique<FastGuidedImageFilter<double>>(imR, params.windR / 2, params.filter_param1, 1.0/255);
+			filter[0] = std::make_unique<FastGuidedImageFilter<double>>(imL, params.windR / 2, params.filter_param1, 1.0 / 255);
+			filter[1] = std::make_unique<FastGuidedImageFilter<double>>(imR, params.windR / 2, params.filter_param1, 1.0 / 255);
+		}
+		else if (params.filterName == "GFfloat")
+		{
+			filter[0] = std::make_unique<FastGuidedImageFilter<float>>(imL, params.windR / 2, params.filter_param1, 1.0 / 255);
+			filter[1] = std::make_unique<FastGuidedImageFilter<float>>(imR, params.windR / 2, params.filter_param1, 1.0 / 255);
 		}
 		else //if (params.filterName == "")
 		{

diff --git a/LocalExpansionStereo/GuidedFilter.h b/LocalExpansionStereo/GuidedFilter.h
@@ -3,7 +3,6 @@
 // OpenCV
 #include <opencv2/opencv.hpp>
 #include <memory>
-//#include "Utilities.hpp"
 
 
 class IJointFilter
@@ -38,10 +37,10 @@ class GuidedImageFilter : public IJointFilter
 	cv::Mat N;
 	static const int DEPTH = cv::DataDepth<Type>::value;
 
-	cv::Mat boxfilter(cv::Mat I, int r) const
+	cv::Mat boxfilter(const cv::Mat& I) const
 	{
 		cv::Mat q;
-		cv::boxFilter(I, q, -1, cv::Size(2 * r + 1, 2 * r + 1), cv::Point(-1, -1), false, cv::BORDER_CONSTANT);
+		cv::boxFilter(I, q, -1, cv::Size(2 * R + 1, 2 * R + 1), cv::Point(-1, -1), false, cv::BORDER_CONSTANT);
 		return q;
 	}
 
@@ -67,22 +66,22 @@ class GuidedImageFilter : public IJointFilter
 
 		cv::split(realI, Ichannels);
 
-		N = boxfilter(cv::Mat_<Type>::ones(realI.size()), R);
-		mean_I_r = boxfilter(Ichannels[0], R) / N;
-		mean_I_g = boxfilter(Ichannels[1], R) / N;
-		mean_I_b = boxfilter(Ichannels[2], R) / N;
+		N = boxfilter(cv::Mat_<Type>::ones(realI.size()));
+		mean_I_r = boxfilter(Ichannels[0]) / N;
+		mean_I_g = boxfilter(Ichannels[1]) / N;
+		mean_I_b = boxfilter(Ichannels[2]) / N;
 
 		// variance of I in each local patch: the matrix Sigma in Eqn (14).
 		// Note the variance in each local patch is a 3x3 symmetric matrix:
 		//           rr, rg, rb
 		//   Sigma = rg, gg, gb
 		//           rb, gb, bb
-		cv::Mat var_I_rr = boxfilter(Ichannels[0].mul(Ichannels[0]), R) / N - mean_I_r.mul(mean_I_r) + eps;
-		cv::Mat var_I_rg = boxfilter(Ichannels[0].mul(Ichannels[1]), R) / N - mean_I_r.mul(mean_I_g);
-		cv::Mat var_I_rb = boxfilter(Ichannels[0].mul(Ichannels[2]), R) / N - mean_I_r.mul(mean_I_b);
-		cv::Mat var_I_gg = boxfilter(Ichannels[1].mul(Ichannels[1]), R) / N - mean_I_g.mul(mean_I_g) + eps;
-		cv::Mat var_I_gb = boxfilter(Ichannels[1].mul(Ichannels[2]), R) / N - mean_I_g.mul(mean_I_b);
-		cv::Mat var_I_bb = boxfilter(Ichannels[2].mul(Ichannels[2]), R) / N - mean_I_b.mul(mean_I_b) + eps;
+		cv::Mat var_I_rr = boxfilter(Ichannels[0].mul(Ichannels[0])) / N - mean_I_r.mul(mean_I_r) + eps;
+		cv::Mat var_I_rg = boxfilter(Ichannels[0].mul(Ichannels[1])) / N - mean_I_r.mul(mean_I_g);
+		cv::Mat var_I_rb = boxfilter(Ichannels[0].mul(Ichannels[2])) / N - mean_I_r.mul(mean_I_b);
+		cv::Mat var_I_gg = boxfilter(Ichannels[1].mul(Ichannels[1])) / N - mean_I_g.mul(mean_I_g) + eps;
+		cv::Mat var_I_gb = boxfilter(Ichannels[1].mul(Ichannels[2])) / N - mean_I_g.mul(mean_I_b);
+		cv::Mat var_I_bb = boxfilter(Ichannels[2].mul(Ichannels[2])) / N - mean_I_b.mul(mean_I_b) + eps;
 
 		// Inverse of Sigma + eps * I
 		invrr = var_I_gg.mul(var_I_bb) - var_I_gb.mul(var_I_gb);
@@ -108,17 +107,13 @@ class GuidedImageFilter : public IJointFilter
 		return std::make_shared<GuidedImageFilter>(realI(rect), R, eps);
 	}
 
-	cv::Mat filter(const cv::Mat& _p) const override
+	cv::Mat filter_mat(const cv::Mat& p) const
 	{
-		cv::Mat p;
-		if (_p.depth() != DEPTH) _p.convertTo(p, DEPTH);
-		else p = _p;
-
-		cv::Mat mean_p = boxfilter(p, R) / N;
+		cv::Mat mean_p = boxfilter(p) / N;
 
-		cv::Mat mean_Ip_r = boxfilter(Ichannels[0].mul(p), R) / N;
-		cv::Mat mean_Ip_g = boxfilter(Ichannels[1].mul(p), R) / N;
-		cv::Mat mean_Ip_b = boxfilter(Ichannels[2].mul(p), R) / N;
+		cv::Mat mean_Ip_r = boxfilter(Ichannels[0].mul(p)) / N;
+		cv::Mat mean_Ip_g = boxfilter(Ichannels[1].mul(p)) / N;
+		cv::Mat mean_Ip_b = boxfilter(Ichannels[2].mul(p)) / N;
 
 		// covariance of (I, p) in each local patch.
 		cv::Mat cov_Ip_r = mean_Ip_r - mean_I_r.mul(mean_p);
@@ -132,10 +127,133 @@ class GuidedImageFilter : public IJointFilter
 		cv::Mat b = mean_p - a_r.mul(mean_I_r) - a_g.mul(mean_I_g) - a_b.mul(mean_I_b); // Eqn. (15) in the paper;
 
 		cv::Mat q =
-			( boxfilter(a_r, R).mul(Ichannels[0])
-			+ boxfilter(a_g, R).mul(Ichannels[1])
-			+ boxfilter(a_b, R).mul(Ichannels[2])
-			+ boxfilter(b, R)) / N;  // Eqn. (16) in the paper;
+			(boxfilter(a_r).mul(Ichannels[0])
+				+ boxfilter(a_g).mul(Ichannels[1])
+				+ boxfilter(a_b).mul(Ichannels[2])
+				+ boxfilter(b)) / N;  // Eqn. (16) in the paper;
+		return q;
+	}
+
+	// Raw-pointer variant of filter_mat(): same steps, but fused into per-row loops to reduce redundant data access.
+	// Not explicitly vectorized; the hope is that the compiler auto-vectorizes the inner loops.
+	// Benchmark for Adirondack:
+	//   Desktop) 498 sec -> 408 sec.(18% reduction)
+	//   Laptop)  302 sec -> 237 sec.(22% reduction)
+	cv::Mat filter_raw(const cv::Mat& p) const // p is read via ptr<Type>; filter() converts it to DEPTH before calling. Assumes single-channel p -- TODO confirm.
+	{
+		int rows = p.rows, cols = p.cols;
+		cv::Mat mean_p = boxfilter(p); // unnormalized box sum (boxfilter passes normalize=false); divided by N per pixel below
+		// Scratch buffers: first hold the per-pixel products I_c * p, then their box sums.
+		cv::Mat mean_Ip_r(p.size(), p.depth());
+		cv::Mat mean_Ip_g(p.size(), p.depth());
+		cv::Mat mean_Ip_b(p.size(), p.depth());
+
+		for (int i = 0; i < rows; i++) // pass 1: I_c * p for the three guide channels
+		{
+			auto pp = p.ptr<Type>(i);
+			auto pmean_Ip_r = mean_Ip_r.ptr<Type>(i);
+			auto pmean_Ip_g = mean_Ip_g.ptr<Type>(i);
+			auto pmean_Ip_b = mean_Ip_b.ptr<Type>(i);
+
+			auto pI_r = Ichannels[0].ptr<Type>(i);
+			auto pI_g = Ichannels[1].ptr<Type>(i);
+			auto pI_b = Ichannels[2].ptr<Type>(i);
+
+			for (int j = 0; j < cols; j++)
+			{
+				auto vp = pp[j]; // read p once, reuse for all three channels
+				pmean_Ip_r[j] = pI_r[j] * vp;
+				pmean_Ip_g[j] = pI_g[j] * vp;
+				pmean_Ip_b[j] = pI_b[j] * vp;
+			}
+		}
+		mean_Ip_r = boxfilter(mean_Ip_r); // now box sums of I_c * p (still not divided by N)
+		mean_Ip_g = boxfilter(mean_Ip_g);
+		mean_Ip_b = boxfilter(mean_Ip_b);
+
+		// Linear coefficients a (one per guide channel) and offset b, computed per pixel in pass 2.
+		cv::Mat a_r(p.size(), p.depth());
+		cv::Mat a_g(p.size(), p.depth());
+		cv::Mat a_b(p.size(), p.depth());
+		cv::Mat b(p.size(), p.depth());
+
+		for (int i = 0; i < rows; i++) // pass 2: covariance of (I, p), then a and b
+		{
+			auto pa_r = a_r.ptr<Type>(i);
+			auto pa_g = a_g.ptr<Type>(i);
+			auto pa_b = a_b.ptr<Type>(i);
+
+			auto pN = N.ptr<Type>(i);
+			auto pmean_p = mean_p.ptr<Type>(i);
+			auto pmean_Ip_r = mean_Ip_r.ptr<Type>(i);
+			auto pmean_Ip_g = mean_Ip_g.ptr<Type>(i);
+			auto pmean_Ip_b = mean_Ip_b.ptr<Type>(i);
+
+			auto pmean_I_r = mean_I_r.ptr<Type>(i);
+			auto pmean_I_g = mean_I_g.ptr<Type>(i);
+			auto pmean_I_b = mean_I_b.ptr<Type>(i);
+
+			auto pinvrr = invrr.ptr<Type>(i);
+			auto pinvrg = invrg.ptr<Type>(i);
+			auto pinvrb = invrb.ptr<Type>(i);
+			auto pinvgg = invgg.ptr<Type>(i);
+			auto pinvgb = invgb.ptr<Type>(i);
+			auto pinvbb = invbb.ptr<Type>(i);
+
+			auto pb = b.ptr<Type>(i);
+			for (int j = 0; j < cols; j++)
+			{
+				auto n = pN[j]; // box sum of ones at this pixel (see how N is built)
+				auto mp = pmean_p[j] / n; // local mean of p
+				auto mIr = pmean_I_r[j]; // mean_I_* are already divided by N when they are computed
+				auto mIg = pmean_I_g[j];
+				auto mIb = pmean_I_b[j];
+
+				auto cov_Ip_r = pmean_Ip_r[j] / n - mIr*mp; // mean(I_c * p) - mean(I_c) * mean(p)
+				auto cov_Ip_g = pmean_Ip_g[j] / n - mIg*mp;
+				auto cov_Ip_b = pmean_Ip_b[j] / n - mIb*mp;
+
+				pa_r[j] = pinvrr[j] * cov_Ip_r + pinvrg[j] * cov_Ip_g + pinvrb[j] * cov_Ip_b; // a = (symmetric inv* matrix) * cov_Ip
+				pa_g[j] = pinvrg[j] * cov_Ip_r + pinvgg[j] * cov_Ip_g + pinvgb[j] * cov_Ip_b;
+				pa_b[j] = pinvrb[j] * cov_Ip_r + pinvgb[j] * cov_Ip_g + pinvbb[j] * cov_Ip_b;
+
+				pb[j] = mp - pa_r[j] * mIr - pa_g[j] * mIg - pa_b[j] * mIb; // same formula as b in filter_mat (Eqn. (15))
+			}
+		}
+		// Box sums of the coefficients; the division by N happens in pass 3.
+		a_r = boxfilter(a_r);
+		a_g = boxfilter(a_g);
+		a_b = boxfilter(a_b);
+		b = boxfilter(b);
+
+		for (int i = 0; i < rows; i++) // pass 3: output (sum_a . I + sum_b) / N, as q in filter_mat (Eqn. (16))
+		{
+			auto pa_r = a_r.ptr<Type>(i);
+			auto pa_g = a_g.ptr<Type>(i);
+			auto pa_b = a_b.ptr<Type>(i);
+			auto pb = b.ptr<Type>(i);
+			auto pN = N.ptr<Type>(i);
+
+			auto pI_r = Ichannels[0].ptr<Type>(i);
+			auto pI_g = Ichannels[1].ptr<Type>(i);
+			auto pI_b = Ichannels[2].ptr<Type>(i);
+
+			for (int j = 0; j < cols; j++)
+			{
+				pb[j] = (pb[j] + pa_r[j] * pI_r[j] + pa_g[j] * pI_g[j] + pa_b[j] * pI_b[j]) / pN[j]; // result overwrites b in place
+			}
+		}
+		return b; // b now holds the filtered output, not the offset term
+	}
+	cv::Mat filter(const cv::Mat& _p) const override
+	{
+		cv::Mat p;
+		if (_p.depth() != DEPTH) _p.convertTo(p, DEPTH);
+		else p = _p;
+
+		// This code is the largest bottleneck of the whole algorithm.
+		cv::Mat q = filter_raw(p);
+		//cv::Mat q = filter_mat(p);
 
 		cv::Mat _q;
 
@@ -185,6 +303,7 @@ class FastGuidedImageFilter : public GuidedImageFilter<Type>
 		auto filter = std::make_shared<FastGuidedImageFilter>();
 		filter->R = R;
 		filter->eps = eps;
+
 		filter->I = I(rect);
 		filter->realI = realI(rect);
 		filter->mean_I_r = mean_I_r(rect);
@@ -202,7 +321,7 @@ class FastGuidedImageFilter : public GuidedImageFilter<Type>
 		filter->invgb = invgb(rect);
 		filter->invbb = invbb(rect);
 
-		filter->N = boxfilter(cv::Mat_<Type>::ones(rect.size()), R);
+		filter->N = boxfilter(cv::Mat_<Type>::ones(rect.size()));
 		return filter;
 	}
 };

diff --git a/LocalExpansionStereo/StereoEnergy.h b/LocalExpansionStereo/StereoEnergy.h
@@ -22,7 +22,7 @@ struct Parameters
 	float filter_param1;
 	int windR;
 	int neighborNum;
-	std::string filterName; // "BF" or "GF" or ""
+	std::string filterName; // "BF" or "GF" or "GFfloat" or ""
 
 	Parameters(float lambda = 20, int windR = 20, std::string filterName = "BF", float filter_param1 = 10)
 		: alpha(0.9)
@@ -566,6 +566,11 @@ class NaiveStereoEnergy : public StereoEnergy
 			filter[0] = std::make_unique<FastGuidedImageFilter<double>>(imL, params.windR / 2, params.filter_param1, 1.0 / 255);
 			filter[1] = std::make_unique<FastGuidedImageFilter<double>>(imR, params.windR / 2, params.filter_param1, 1.0 / 255);
 		}
+		else if (params.filterName == "GFfloat")
+		{
+			filter[0] = std::make_unique<FastGuidedImageFilter<float>>(imL, params.windR / 2, params.filter_param1, 1.0 / 255);
+			filter[1] = std::make_unique<FastGuidedImageFilter<float>>(imR, params.windR / 2, params.filter_param1, 1.0 / 255);
+		}
 		else //if (params.filterName == "")
 		{
 			filter[0] = nullptr;

diff --git a/LocalExpansionStereo/main.cpp b/LocalExpansionStereo/main.cpp
@@ -71,6 +71,7 @@ struct Options
 
 const Parameters paramsBF = Parameters(20, 20, "BF", 10);
 const Parameters paramsGF = Parameters(1.0, 20, "GF", 0.0001);
+const Parameters paramsGFfloat = Parameters(1.0, 20, "GFfloat", 0.0001); // Slightly faster
 
 struct Calib
 {

diff --git a/data/MiddV3/Adirondack/README.txt b/data/MiddV3/Adirondack/README.txt
diff --git a/data/MiddV3/trainingH/README.txt b/data/MiddV3/trainingH/README.txt
@@ -0,0 +1,3 @@
+Download an example file set from
+http://www2.hci.iis.u-tokyo.ac.jp/datasets/data/LocalExpStereo/Adirondack.zip
+then extract it here as a directory named "Adirondack"
diff --git a/demo.bat b/demo.bat
@@ -7,4 +7,4 @@ set resultsroot=%~dp0results
 mkdir "%resultsroot%"
 "%bin%" -targetDir "%datasetroot%\MiddV2\cones" -outputDir "%resultsroot%\cones" -mode MiddV2 -smooth_weight 1 -doDual 1
 "%bin%" -targetDir "%datasetroot%\MiddV2\teddy" -outputDir "%resultsroot%\teddy" -mode MiddV2 -smooth_weight 1
-"%bin%" -targetDir "%datasetroot%\MiddV3\Adirondack" -outputDir "%resultsroot%\Adirondack" -mode MiddV3 -smooth_weight 0.5
+"%bin%" -targetDir "%datasetroot%\MiddV3\trainingH\Adirondack" -outputDir "%resultsroot%\Adirondack" -mode MiddV3 -smooth_weight 0.5