diff options
author | Alan Kelly <alankelly-at-google.com@ffmpeg.org> | 2022-07-15 16:59:43 +0200 |
---|---|---|
committer | Anton Khirnov <anton@khirnov.net> | 2022-08-18 16:24:48 +0200 |
commit | a38293e4448c9389e604af9858984361a5677a20 (patch) | |
tree | 3ef0779d0e57789967b69edbd1d3dc187e529762 /libswscale | |
parent | a6724285fd45111436dd5242eab2c489182aa5c2 (diff) | |
download | ffmpeg-a38293e4448c9389e604af9858984361a5677a20.tar.gz |
libswscale: Enable hscale_avx2 for all input sizes.
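ff_shuffle_filter_coefficients now also shuffles the tail (the pixels left over after the 16-pixel main loop) as required, so the dstW % 16 == 0 restrictions are removed.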
Signed-off-by: Anton Khirnov <anton@khirnov.net>
Diffstat (limited to 'libswscale')
-rw-r--r-- | libswscale/utils.c | 19 | ||||
-rw-r--r-- | libswscale/x86/swscale.c | 6 |
2 files changed, 18 insertions, 7 deletions
diff --git a/libswscale/utils.c b/libswscale/utils.c index 34503e57f4..baa1791ebe 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -268,8 +268,7 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, #if ARCH_X86_64 int i, j, k; int cpu_flags = av_get_cpu_flags(); - // avx2 hscale filter processes 16 pixel blocks. - if (!filter || dstW % 16 != 0) + if (!filter) return 0; if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) { if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { @@ -281,9 +280,11 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, } // Do not swap filterPos for pixels which won't be processed by // the main loop. - for (i = 0; i + 8 <= dstW; i += 8) { + for (i = 0; i + 16 <= dstW; i += 16) { FFSWAP(int, filterPos[i + 2], filterPos[i + 4]); FFSWAP(int, filterPos[i + 3], filterPos[i + 5]); + FFSWAP(int, filterPos[i + 10], filterPos[i + 12]); + FFSWAP(int, filterPos[i + 11], filterPos[i + 13]); } if (filterSize > 4) { // 16 pixels are processed at a time. @@ -297,6 +298,18 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, } } } + // 4 pixels are processed at a time in the tail. + for (; i < dstW; i += 4) { + // 4 filter coeffs are processed at a time. + int rem = dstW - i >= 4 ? 
4 : dstW - i; + for (k = 0; k + 4 <= filterSize; k += 4) { + for (j = 0; j < rem; ++j) { + int from = (i + j) * filterSize + k; + int to = i * filterSize + j * 4 + k * 4; + memcpy(&filter[to], &filterCopy[from], 4 * sizeof(int16_t)); + } + } + } } av_free(filterCopy); } diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index 89ef9f5d2b..ec1ca0e01c 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -625,10 +625,8 @@ switch(c->dstBpc){ \ if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) { if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { - if (c->chrDstW % 16 == 0) - ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); - if (c->dstW % 16 == 0) - ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize); + ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); + ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize); } } |