diff options
author | Andreas Rheinhardt <[email protected]> | 2025-09-29 04:30:19 +0200 |
---|---|---|
committer | Andreas Rheinhardt <[email protected]> | 2025-10-04 07:06:33 +0200 |
commit | fa9ea5113b48904daef9df6a282bd9c04c32258d (patch) | |
tree | 981183e0fa96063211bd45899c9dfda409f20688 | |
parent | 400203c00c3b5c766d2c3f79c72d644d3d38ee55 (diff) |
avcodec/x86/h264_qpel_8bit: Optimize branch away
ff_{avg,put}_h264_qpel8or16_hv2_lowpass_ssse3()
is currently almost the disjoint union of the codepaths
for sizes 8 and 16, selected by a runtime branch on its
size argument. The size is a compile-time constant
at every callsite, so split the function into separate
8 and 16 versions and avoid the runtime branch.
Reviewed-by: James Almer <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
-rw-r--r-- | libavcodec/x86/h264_qpel.c | 15 | ||||
-rw-r--r-- | libavcodec/x86/h264_qpel_8bit.asm | 26 |
2 files changed, 30 insertions, 11 deletions
diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c index 53b3ffb653..010cb51991 100644 --- a/libavcodec/x86/h264_qpel.c +++ b/libavcodec/x86/h264_qpel.c @@ -66,7 +66,8 @@ void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_v_mmxext(const uint8_t *src, int16_ void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_h_mmxext(int16_t *tmp, uint8_t *dst, int dstStride);\ void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_sse2(const uint8_t *src, int16_t *tmp, int srcStride, int size);\ void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_mmxext(uint8_t *dst, int16_t *tmp, int dstStride, int h);\ -void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride, int size);\ +void ff_ ## OPNAME ## _h264_qpel8_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride);\ +void ff_ ## OPNAME ## _h264_qpel16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride);\ void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride);\ void ff_ ## OPNAME ## _pixels8_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride);\ void ff_ ## OPNAME ## _pixels16_l2_shift5_sse2(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride);\ @@ -172,6 +173,18 @@ static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uin ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, srcStride, 16);\ }\ +#define SSSE3_HV2_LOWPASS_WRAPPER(OPNAME) \ +static av_always_inline void \ +ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride, int size) \ +{\ + if (size == 8)\ + ff_ ## OPNAME ## _h264_qpel8_hv2_lowpass_ssse3(dst, tmp, dstStride);\ + else\ + ff_ ## OPNAME ## _h264_qpel16_hv2_lowpass_ssse3(dst, tmp, dstStride);\ +} +SSSE3_HV2_LOWPASS_WRAPPER(avg) +SSSE3_HV2_LOWPASS_WRAPPER(put) + #define ff_put_h264_qpel8_h_lowpass_l2_sse2 ff_put_h264_qpel8_h_lowpass_l2_mmxext #define 
ff_avg_h264_qpel8_h_lowpass_l2_sse2 ff_avg_h264_qpel8_h_lowpass_l2_mmxext #define ff_put_h264_qpel16_h_lowpass_l2_sse2 ff_put_h264_qpel16_h_lowpass_l2_mmxext diff --git a/libavcodec/x86/h264_qpel_8bit.asm b/libavcodec/x86/h264_qpel_8bit.asm index 9c4957f8e7..7b6b51be04 100644 --- a/libavcodec/x86/h264_qpel_8bit.asm +++ b/libavcodec/x86/h264_qpel_8bit.asm @@ -608,11 +608,14 @@ QPEL8OR16_HV2_LOWPASS_OP put QPEL8OR16_HV2_LOWPASS_OP avg %macro QPEL8OR16_HV2_LOWPASS_OP_XMM 1 -cglobal %1_h264_qpel8or16_hv2_lowpass, 4,4,8 ; dst, tmp, dstStride, size +%ifidn %1, avg +cglobal %1_h264_qpel8_hv2_lowpass, 3,4,7 ; dst, tmp, dstStride +%else +cglobal %1_h264_qpel8_hv2_lowpass, 3,4,6 ; dst, tmp, dstStride +%endif movsxdifnidn r2, r2d - cmp r3d, 16 - je .op16 -.loop8: + mov r3d, 8 +.loop: mova m1, [r1+16] mova m0, [r1] mova m2, m1 @@ -635,13 +638,17 @@ cglobal %1_h264_qpel8or16_hv2_lowpass, 4,4,8 ; dst, tmp, dstStride, size paddw m0, m2 psraw m0, 6 packuswb m0, m0 - op_%1h m0, [r0], m7 + op_%1h m0, [r0], m6 add r1, 48 add r0, r2 dec r3d - jne .loop8 - jmp .done -.op16: + jne .loop + RET + +cglobal %1_h264_qpel16_hv2_lowpass, 3,4,8 ; dst, tmp, dstStride + movsxdifnidn r2, r2d + mov r3d, 16 +.loop: mova m4, [r1+32] mova m5, [r1+16] mova m7, [r1] @@ -688,8 +695,7 @@ cglobal %1_h264_qpel8or16_hv2_lowpass, 4,4,8 ; dst, tmp, dstStride, size add r1, 48 add r0, r2 dec r3d - jne .op16 -.done: + jne .loop RET %endmacro |