lavfi/nlmeans: make compute_safe_ssd_integral_image_c faster

before: ssd_integral_image_c: 49204.6; after: ssd_integral_image_c: 44272.8. Unrolling by 4 made the biggest difference on odroid-c2 (aarch64); unrolling by 2 or by 8 both came in at about 46k cycles, versus 44k for unrolling by 4. Additionally, this is a much better reference when writing SIMD (SIMD vectorization will just target 16 instead of 4).
author: Clément Bœsch <u@pkh.me> 2018-05-06 12:34:54 +0200
committer: Clément Bœsch <u@pkh.me> 2018-05-08 10:28:06 +0200
commit: 43d16aef6395340c248ee79d35f60f9965427a45 (patch)
tree: c844c123134ec0e83d695ce4e8e4e7998a8051c6 /libavfilter/vf_nlmeans.c
parent: f679711c1b516786a39f9e582622a200502fff74 (diff)
download: ffmpeg-43d16aef6395340c248ee79d35f60f9965427a45.tar.gz
1 files changed, 17 insertions, 10 deletions
diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c
index c30e44498f..f37f1183f7 100644
--- a/libavfilter/vf_nlmeans.c
+++ b/libavfilter/vf_nlmeans.c
@@ -146,10 +146,6 @@ static inline int get_integral_patch_value(const uint32_t *ii, int ii_lz_32, int
  * function, we do not need any clipping here.
  *
  * The line above dst and the column to its left are always readable.
- *
- * This C version computes the SSD integral image using a scalar accumulator,
- * while for SIMD implementation it is likely more interesting to use the
- * two-loops algorithm variant.
  */
 static void compute_safe_ssd_integral_image_c(uint32_t *dst, ptrdiff_t dst_linesize_32,
                                               const uint8_t *s1, ptrdiff_t linesize1,
@@ -157,21 +153,32 @@ static void compute_safe_ssd_integral_image_c(uint32_t *dst, ptrdiff_t dst_lines
                                               int w, int h)
 {
     int x, y;
+    const uint32_t *dst_top = dst - dst_linesize_32;
 
     /* SIMD-friendly assumptions allowed here */
     av_assert2(!(w & 0xf) && w >= 16 && h >= 1);
 
     for (y = 0; y < h; y++) {
-        uint32_t acc = dst[-1] - dst[-dst_linesize_32 - 1];
-
-        for (x = 0; x < w; x++) {
-            const int d  = s1[x] - s2[x];
-            acc += d * d;
-            dst[x] = dst[-dst_linesize_32 + x] + acc;
+        for (x = 0; x < w; x += 4) {
+            const int d0 = s1[x    ] - s2[x    ];
+            const int d1 = s1[x + 1] - s2[x + 1];
+            const int d2 = s1[x + 2] - s2[x + 2];
+            const int d3 = s1[x + 3] - s2[x + 3];
+
+            dst[x    ] = dst_top[x    ] - dst_top[x - 1] + d0*d0;
+            dst[x + 1] = dst_top[x + 1] - dst_top[x    ] + d1*d1;
+            dst[x + 2] = dst_top[x + 2] - dst_top[x + 1] + d2*d2;
+            dst[x + 3] = dst_top[x + 3] - dst_top[x + 2] + d3*d3;
+
+            dst[x    ] += dst[x - 1];
+            dst[x + 1] += dst[x    ];
+            dst[x + 2] += dst[x + 1];
+            dst[x + 3] += dst[x + 2];
         }
         s1  += linesize1;
         s2  += linesize2;
         dst += dst_linesize_32;
+        dst_top += dst_linesize_32;
     }
 }
author	Clément Bœsch <u@pkh.me>	2018-05-06 12:34:54 +0200
committer	Clément Bœsch <u@pkh.me>	2018-05-08 10:28:06 +0200
commit	43d16aef6395340c248ee79d35f60f9965427a45 (patch)
tree	c844c123134ec0e83d695ce4e8e4e7998a8051c6 /libavfilter/vf_nlmeans.c
parent	f679711c1b516786a39f9e582622a200502fff74 (diff)
download	ffmpeg-43d16aef6395340c248ee79d35f60f9965427a45.tar.gz