author:    Andreas Rheinhardt <[email protected]>  2025-09-29 19:51:47 +0200
committer: Andreas Rheinhardt <[email protected]>  2025-10-04 07:06:33 +0200
commit:    cd077e88d1ba20418a81187159f14ee74f4a6f64
tree:      34ce5a35a4ae1c9569785629f160e48c8e3b662a
parent:    4880fa4dcaa8e38f6b99aaf4f771fd91b78de732
avcodec/x86/h264_qpel: Add ff_{avg,put}_h264_qpel16_h_lowpass_l2_sse2()
These functions are currently emulated via four calls to the versions
for 8x8 blocks. In fact, the size savings from the simplified calls
in h264_qpel.c (GCC 1344B, Clang 1280B) more than outweigh the size
of the added functions (512B) here.
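For illustration, the removed emulation amounts to tiling the 16x16 block
with four 8x8 calls. A hedged C sketch of that pattern follows; the function
name is hypothetical and the stride handling is an assumption based on the
prototypes, not the literal QPEL_H264_H16 macro body from h264_qpel.c:

#include <stdint.h>

void ff_put_h264_qpel8_h_lowpass_l2_sse2(uint8_t *dst, const uint8_t *src,
                                         const uint8_t *src2,
                                         int dstStride, int src2Stride);

/* Sketch: emulate the 16-wide lowpass+l2 average with four 8x8 calls --
 * two tiles side by side, then the same pair eight rows further down.
 * Assumes src advances by dstStride and src2 by src2Stride (assumption). */
static void put_h264_qpel16_h_lowpass_l2_emulated(uint8_t *dst, const uint8_t *src,
                                                  const uint8_t *src2,
                                                  int dstStride, int src2Stride)
{
    ff_put_h264_qpel8_h_lowpass_l2_sse2(dst,     src,     src2,     dstStride, src2Stride);
    ff_put_h264_qpel8_h_lowpass_l2_sse2(dst + 8, src + 8, src2 + 8, dstStride, src2Stride);
    dst  += 8 * dstStride;
    src  += 8 * dstStride;
    src2 += 8 * src2Stride;
    ff_put_h264_qpel8_h_lowpass_l2_sse2(dst,     src,     src2,     dstStride, src2Stride);
    ff_put_h264_qpel8_h_lowpass_l2_sse2(dst + 8, src + 8, src2 + 8, dstStride, src2Stride);
}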
The change is also beneficial performance-wise. Old benchmarks:
avg_h264_qpel_16_mc11_8_c: 1414.1 ( 1.00x)
avg_h264_qpel_16_mc11_8_sse2: 206.2 ( 6.86x)
avg_h264_qpel_16_mc11_8_ssse3: 177.7 ( 7.96x)
avg_h264_qpel_16_mc13_8_c: 1417.0 ( 1.00x)
avg_h264_qpel_16_mc13_8_sse2: 207.4 ( 6.83x)
avg_h264_qpel_16_mc13_8_ssse3: 178.2 ( 7.95x)
avg_h264_qpel_16_mc21_8_c: 1632.8 ( 1.00x)
avg_h264_qpel_16_mc21_8_sse2: 349.3 ( 4.67x)
avg_h264_qpel_16_mc21_8_ssse3: 291.3 ( 5.60x)
avg_h264_qpel_16_mc23_8_c: 1640.2 ( 1.00x)
avg_h264_qpel_16_mc23_8_sse2: 351.3 ( 4.67x)
avg_h264_qpel_16_mc23_8_ssse3: 290.8 ( 5.64x)
avg_h264_qpel_16_mc31_8_c: 1411.7 ( 1.00x)
avg_h264_qpel_16_mc31_8_sse2: 203.4 ( 6.94x)
avg_h264_qpel_16_mc31_8_ssse3: 178.9 ( 7.89x)
avg_h264_qpel_16_mc33_8_c: 1409.7 ( 1.00x)
avg_h264_qpel_16_mc33_8_sse2: 204.6 ( 6.89x)
avg_h264_qpel_16_mc33_8_ssse3: 178.1 ( 7.92x)
put_h264_qpel_16_mc11_8_c: 1391.0 ( 1.00x)
put_h264_qpel_16_mc11_8_sse2: 197.4 ( 7.05x)
put_h264_qpel_16_mc11_8_ssse3: 176.1 ( 7.90x)
put_h264_qpel_16_mc13_8_c: 1395.9 ( 1.00x)
put_h264_qpel_16_mc13_8_sse2: 196.7 ( 7.10x)
put_h264_qpel_16_mc13_8_ssse3: 177.7 ( 7.85x)
put_h264_qpel_16_mc21_8_c: 1609.5 ( 1.00x)
put_h264_qpel_16_mc21_8_sse2: 341.1 ( 4.72x)
put_h264_qpel_16_mc21_8_ssse3: 289.2 ( 5.57x)
put_h264_qpel_16_mc23_8_c: 1604.0 ( 1.00x)
put_h264_qpel_16_mc23_8_sse2: 340.9 ( 4.71x)
put_h264_qpel_16_mc23_8_ssse3: 289.6 ( 5.54x)
put_h264_qpel_16_mc31_8_c: 1390.2 ( 1.00x)
put_h264_qpel_16_mc31_8_sse2: 194.6 ( 7.14x)
put_h264_qpel_16_mc31_8_ssse3: 176.4 ( 7.88x)
put_h264_qpel_16_mc33_8_c: 1400.4 ( 1.00x)
put_h264_qpel_16_mc33_8_sse2: 198.5 ( 7.06x)
put_h264_qpel_16_mc33_8_ssse3: 176.2 ( 7.95x)
New benchmarks:
avg_h264_qpel_16_mc11_8_c: 1413.3 ( 1.00x)
avg_h264_qpel_16_mc11_8_sse2: 171.8 ( 8.23x)
avg_h264_qpel_16_mc11_8_ssse3: 173.0 ( 8.17x)
avg_h264_qpel_16_mc13_8_c: 1423.2 ( 1.00x)
avg_h264_qpel_16_mc13_8_sse2: 172.0 ( 8.27x)
avg_h264_qpel_16_mc13_8_ssse3: 173.4 ( 8.21x)
avg_h264_qpel_16_mc21_8_c: 1641.3 ( 1.00x)
avg_h264_qpel_16_mc21_8_sse2: 322.1 ( 5.10x)
avg_h264_qpel_16_mc21_8_ssse3: 291.3 ( 5.63x)
avg_h264_qpel_16_mc23_8_c: 1629.1 ( 1.00x)
avg_h264_qpel_16_mc23_8_sse2: 323.0 ( 5.04x)
avg_h264_qpel_16_mc23_8_ssse3: 293.3 ( 5.55x)
avg_h264_qpel_16_mc31_8_c: 1409.2 ( 1.00x)
avg_h264_qpel_16_mc31_8_sse2: 172.0 ( 8.19x)
avg_h264_qpel_16_mc31_8_ssse3: 173.7 ( 8.11x)
avg_h264_qpel_16_mc33_8_c: 1402.5 ( 1.00x)
avg_h264_qpel_16_mc33_8_sse2: 172.5 ( 8.13x)
avg_h264_qpel_16_mc33_8_ssse3: 173.6 ( 8.08x)
put_h264_qpel_16_mc11_8_c: 1393.7 ( 1.00x)
put_h264_qpel_16_mc11_8_sse2: 170.4 ( 8.18x)
put_h264_qpel_16_mc11_8_ssse3: 178.2 ( 7.82x)
put_h264_qpel_16_mc13_8_c: 1398.0 ( 1.00x)
put_h264_qpel_16_mc13_8_sse2: 170.2 ( 8.21x)
put_h264_qpel_16_mc13_8_ssse3: 178.6 ( 7.83x)
put_h264_qpel_16_mc21_8_c: 1619.6 ( 1.00x)
put_h264_qpel_16_mc21_8_sse2: 320.6 ( 5.05x)
put_h264_qpel_16_mc21_8_ssse3: 297.2 ( 5.45x)
put_h264_qpel_16_mc23_8_c: 1617.4 ( 1.00x)
put_h264_qpel_16_mc23_8_sse2: 320.0 ( 5.05x)
put_h264_qpel_16_mc23_8_ssse3: 297.4 ( 5.44x)
put_h264_qpel_16_mc31_8_c: 1389.7 ( 1.00x)
put_h264_qpel_16_mc31_8_sse2: 169.9 ( 8.18x)
put_h264_qpel_16_mc31_8_ssse3: 178.1 ( 7.80x)
put_h264_qpel_16_mc33_8_c: 1394.0 ( 1.00x)
put_h264_qpel_16_mc33_8_sse2: 170.9 ( 8.16x)
put_h264_qpel_16_mc33_8_ssse3: 176.9 ( 7.88x)
Notice that the SSSE3 versions of mc21 and mc23 benefit from
an optimized version of hv2_lowpass.
Also notice that there is no SSE2 version of the purely horizontal
motion compensation. This means that src2 is currently always aligned
when the SSE2 functions are called (and that src2Stride is always equal
to the block width); this has not been exploited yet, though.
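Exploiting that alignment would amount to using aligned instead of
unaligned loads for src2. A minimal sketch of the difference in SSE2
intrinsics (illustrative only; the helper is hypothetical and not part
of this commit):

#include <emmintrin.h>
#include <stdint.h>

/* The new asm loads src2 rows with movu (unaligned, _mm_loadu_si128).
 * If src2 is known to be 16-byte aligned, mova (_mm_load_si128) could
 * be used instead. Sketch only, not commit code. */
static __m128i load_src2_row(const uint8_t *src2, int known_aligned)
{
    if (known_aligned)
        return _mm_load_si128((const __m128i *)src2);  /* mova: needs 16-byte alignment */
    return _mm_loadu_si128((const __m128i *)src2);     /* movu: what the new code does */
}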
Reviewed-by: James Almer <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
-rw-r--r--  libavcodec/x86/h264_qpel.c          4
-rw-r--r--  libavcodec/x86/h264_qpel_8bit.asm  74
2 files changed, 75 insertions, 3 deletions
diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c
index c49a866c5d..75caac8805 100644
--- a/libavcodec/x86/h264_qpel.c
+++ b/libavcodec/x86/h264_qpel.c
@@ -59,6 +59,7 @@ void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, const uint8_t *
 void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_ssse3(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
 void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_l2_mmxext(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\
 void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_sse2(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\
+void ff_ ## OPNAME ## _h264_qpel16_h_lowpass_l2_sse2(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\
 void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\
 void ff_ ## OPNAME ## _h264_qpel4_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
 void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_sse2(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h);\
@@ -177,9 +178,6 @@ ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, i
 SSSE3_HV2_LOWPASS_WRAPPER(avg)
 SSSE3_HV2_LOWPASS_WRAPPER(put)
 
-QPEL_H264_H16(avg_, sse2)
-QPEL_H264_H16(put_, sse2)
-
 #define ff_put_h264_qpel8_v_lowpass_ssse3 ff_put_h264_qpel8_v_lowpass_sse2
 #define ff_avg_h264_qpel8_v_lowpass_ssse3 ff_avg_h264_qpel8_v_lowpass_sse2
 #define ff_put_h264_qpel16_v_lowpass_ssse3 ff_put_h264_qpel16_v_lowpass_sse2
diff --git a/libavcodec/x86/h264_qpel_8bit.asm b/libavcodec/x86/h264_qpel_8bit.asm
index dc55a8ad93..101ab21647 100644
--- a/libavcodec/x86/h264_qpel_8bit.asm
+++ b/libavcodec/x86/h264_qpel_8bit.asm
@@ -276,6 +276,80 @@
 QPEL8_H_LOWPASS_L2_OP put
 QPEL8_H_LOWPASS_L2_OP avg
 
+%macro QPEL16_H_LOWPASS_L2 1
+%if ARCH_X86_64
+cglobal %1_h264_qpel16_h_lowpass_l2, 5,6,9 ; dst, src, src2, dstStride, srcStride
+    mova         m8, [pw_16]
+%define PW_16 m8
+%else
+cglobal %1_h264_qpel16_h_lowpass_l2, 5,6,8 ; dst, src, src2, dstStride, srcStride
+%define PW_16 [pw_16]
+%endif
+    movsxdifnidn r3, r3d
+    movsxdifnidn r4, r4d
+    mov          r5d, 16
+    pxor         m7, m7
+    mova         m6, [pw_5]
+.loop:
+    movu         m0, [r1]
+    movu         m2, [r1+1]
+    mova         m1, m0
+    mova         m3, m2
+    punpcklbw    m0, m7
+    punpcklbw    m2, m7
+    punpckhbw    m1, m7
+    punpckhbw    m3, m7
+    paddw        m0, m2
+    paddw        m1, m3
+    psllw        m0, 2
+    psllw        m1, 2
+    movu         m2, [r1-1]
+    movu         m4, [r1+2]
+    mova         m3, m2
+    mova         m5, m4
+    punpcklbw    m2, m7
+    punpcklbw    m4, m7
+    punpckhbw    m3, m7
+    punpckhbw    m5, m7
+    paddw        m2, m4
+    paddw        m3, m5
+    psubw        m0, m2
+    psubw        m1, m3
+    pmullw       m0, m6
+    pmullw       m1, m6
+    movu         m2, [r1-2]
+    movu         m4, [r1+3]
+    mova         m3, m2
+    mova         m5, m4
+    punpcklbw    m2, m7
+    punpcklbw    m4, m7
+    punpckhbw    m3, m7
+    punpckhbw    m5, m7
+    paddw        m2, m4
+    paddw        m3, m5
+    paddw        m0, m2
+    paddw        m1, m3
+    paddw        m0, PW_16
+    paddw        m1, PW_16
+    psraw        m0, 5
+    psraw        m1, 5
+    packuswb     m0, m1
+    movu         m4, [r2]
+    pavgb        m0, m4
+    op_%1        m0, [r0], m4
+    add          r0, r3
+    add          r1, r3
+    add          r2, r4
+    dec          r5d
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+QPEL16_H_LOWPASS_L2 put
+QPEL16_H_LOWPASS_L2 avg
+
+
 %macro QPEL8_H_LOWPASS_L2_OP_XMM 1
 cglobal %1_h264_qpel8_h_lowpass_l2, 5,6,8 ; dst, src, src2, dstStride, src2Stride
     movsxdifnidn r3, r3d