swscale: move yuv2yuvX_sse3 to yasm, unroll main loop

And other small optimizations for ~20% speedup.
author: Alan Kelly <alankelly-at-google.com@ffmpeg.org> 2021-01-14 15:47:03 +0100
committer: Paul B Mahol <onemda@gmail.com> 2021-02-17 21:21:03 +0100
commit: 554c2bc7086f49ef5a6a989ad6bc4bc11807eb6f (patch)
tree: 9c1fcb5685879932cf1b0637571e4d13ede38884 /tests/checkasm
parent: 1628409b18fb932b1ee0585151a82e67931cf43c (diff)
download: ffmpeg-554c2bc7086f49ef5a6a989ad6bc4bc11807eb6f.tar.gz
1 files changed, 103 insertions, 0 deletions
diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c
index 8741b3943c..6a95546ba6 100644
--- a/tests/checkasm/sw_scale.c
+++ b/tests/checkasm/sw_scale.c
@@ -36,6 +36,107 @@
             AV_WN32(buf + j, rnd());      \
     } while (0)
 
+// Reference for the SIMD yuv2yuvX: the same approximate algorithm, in which
+// only the high 16 bits of each product are summed into a 16-bit accumulator.
+// Reads src[j][i + offset] for j < filterSize and writes dest[i] for i < dstW.
+// Only dither[0] is read, so every output sample gets the same rounding bias.
+static void ref_function(const int16_t *filter, int filterSize,
+                         const int16_t **src, uint8_t *dest, int dstW,
+                         const uint8_t *dither, int offset)
+{
+    int i, j;
+    const int bias = ((filterSize - 1) * 8 + dither[0]) >> 4;
+
+    for (i = 0; i < dstW; i++) {
+        int16_t val = bias; // 16 bits on purpose: the sum may wrap
+        for (j = 0; j < filterSize; j++) {
+            int prod = (int)src[j][i + offset] * (int)filter[j];
+            // Take the high half by shifting: union punning via v[1] only
+            // yields it on little-endian hosts.
+            val += (int16_t)(prod >> 16);
+        }
+        dest[i] = av_clip_uint8(val >> 3);
+    }
+}
+
+// Checks ctx->yuv2planeX against ref_function() for every combination of
+// output width, start offset and filter size, and benchmarks the widest case.
+static void check_yuv2yuvX(void)
+{
+    struct SwsContext *ctx;
+    int fsi, osi, isi, i, j, dstW;
+#define LARGEST_FILTER 16
+#define FILTER_SIZES 4
+    static const int filter_sizes[FILTER_SIZES] = {1, 4, 8, 16};
+#define LARGEST_INPUT_SIZE 512
+#define INPUT_SIZES 4
+    static const int input_sizes[INPUT_SIZES] = {128, 144, 256, 512};
+
+    declare_func_emms(AV_CPU_FLAG_MMX, void, const int16_t *filter,
+                      int filterSize, const int16_t **src, uint8_t *dest,
+                      int dstW, const uint8_t *dither, int offset);
+
+    const int16_t **src;
+    LOCAL_ALIGNED_8(int16_t, src_pixels, [LARGEST_FILTER * LARGEST_INPUT_SIZE]);
+    LOCAL_ALIGNED_8(int16_t, filter_coeff, [LARGEST_FILTER]);
+    LOCAL_ALIGNED_8(uint8_t, dst0, [LARGEST_INPUT_SIZE]);
+    LOCAL_ALIGNED_8(uint8_t, dst1, [LARGEST_INPUT_SIZE]);
+    LOCAL_ALIGNED_8(uint8_t, dither, [LARGEST_INPUT_SIZE]);
+    // Per-tap record given to call_new(): row pointer, coefficient in coeff[4..7].
+    union VFilterData{
+        const int16_t *src;
+        uint16_t coeff[8];
+    } *vFilterData;
+    uint8_t d_val = rnd();
+    randomize_buffers(filter_coeff, LARGEST_FILTER);
+    randomize_buffers(src_pixels, LARGEST_FILTER * LARGEST_INPUT_SIZE);
+    ctx = sws_alloc_context();
+    if (sws_init_context(ctx, NULL, NULL) < 0)
+        fail();
+    ff_getSwsFunc(ctx);
+    // ref_function() reads only dither[0], so the table holds a single value.
+    memset(dither, d_val, LARGEST_INPUT_SIZE * sizeof(dither[0]));
+    for(isi = 0; isi < INPUT_SIZES; ++isi){
+        dstW = input_sizes[isi];
+        for(osi = 0; osi < 64; osi += 16){
+            for(fsi = 0; fsi < FILTER_SIZES; ++fsi){
+                src = av_malloc(sizeof(*src) * filter_sizes[fsi]);
+                vFilterData = av_mallocz((filter_sizes[fsi] + 2) * sizeof(*vFilterData));
+                if (!src || !vFilterData) {
+                    fail(); // out of memory: nothing can be tested for this size
+                } else if (check_func(ctx->yuv2planeX, "yuv2yuvX_%d_%d", filter_sizes[fsi], osi)){
+                    for(i = 0; i < filter_sizes[fsi]; ++i){
+                        src[i] = &src_pixels[i * LARGEST_INPUT_SIZE];
+                        vFilterData[i].src = src[i];
+                        for(j = 0; j < 4; ++j)
+                            vFilterData[i].coeff[j + 4] = filter_coeff[i];
+                    }
+                    memset(dst0, 0, LARGEST_INPUT_SIZE * sizeof(dst0[0]));
+                    memset(dst1, 0, LARGEST_INPUT_SIZE * sizeof(dst1[0]));
+                    // The reference is not the scalar function selected when mmx is
+                    // deactivated: the SIMD functions do not give the same result as the scalar
+                    // ones due to rounding. The SIMD functions are activated by SWS_ACCURATE_RND
+                    ref_function(&filter_coeff[0], filter_sizes[fsi], src, dst0, dstW - osi, dither, osi);
+                    // There's no point in calling new for the reference function
+                    if(ctx->use_mmx_vfilter){
+                        call_new((const int16_t*)vFilterData, filter_sizes[fsi], src, dst1, dstW - osi, dither, osi);
+                        if (memcmp(dst0, dst1, LARGEST_INPUT_SIZE * sizeof(dst0[0])))
+                            fail();
+                        if(dstW == LARGEST_INPUT_SIZE)
+                            bench_new((const int16_t*)vFilterData, filter_sizes[fsi], src, dst1, dstW - osi, dither, osi);
+                    }
+                }
+                // av_malloc() memory must be released with av_free(), not free().
+                av_free(src);
+                av_free(vFilterData);
+            }
+        }
+    }
+    sws_freeContext(ctx);
+#undef FILTER_SIZES
+}
+
+#undef SRC_PIXELS
 #define SRC_PIXELS 128
 
 static void check_hscale(void)
@@ -132,4 +233,6 @@ void checkasm_check_sw_scale(void)
 {
     check_hscale();
     report("hscale");
+    check_yuv2yuvX();
+    report("yuv2yuvX");
 }
author	Alan Kelly <alankelly-at-google.com@ffmpeg.org>	2021-01-14 15:47:03 +0100
committer	Paul B Mahol <onemda@gmail.com>	2021-02-17 21:21:03 +0100
commit	554c2bc7086f49ef5a6a989ad6bc4bc11807eb6f (patch)
tree	9c1fcb5685879932cf1b0637571e4d13ede38884 /tests/checkasm
parent	1628409b18fb932b1ee0585151a82e67931cf43c (diff)
download	ffmpeg-554c2bc7086f49ef5a6a989ad6bc4bc11807eb6f.tar.gz