avcodec/x86/h264_qpel_8bit: Optimize branch away

ff_{avg,put}_h264_qpel8or16_hv2_lowpass_ssse3() is currently almost the disjoint union of the code paths for sizes 8 and 16. The size is a compile-time constant at every call site, so split the function into separate qpel8 and qpel16 variants and avoid the runtime branch.

Reviewed-by: James Almer <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
author: Andreas Rheinhardt <[email protected]> 2025-09-29 04:30:19 +0200
committer: Andreas Rheinhardt <[email protected]> 2025-10-04 07:06:33 +0200
commit: fa9ea5113b48904daef9df6a282bd9c04c32258d (patch)
tree: 981183e0fa96063211bd45899c9dfda409f20688
parent: 400203c00c3b5c766d2c3f79c72d644d3d38ee55 (diff)
2 files changed, 30 insertions, 11 deletions
diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c
index 53b3ffb653..010cb51991 100644
--- a/libavcodec/x86/h264_qpel.c
+++ b/libavcodec/x86/h264_qpel.c
@@ -66,7 +66,8 @@ void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_v_mmxext(const uint8_t *src, int16_
 void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_h_mmxext(int16_t *tmp, uint8_t *dst, int dstStride);\
 void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_sse2(const uint8_t *src, int16_t *tmp, int srcStride, int size);\
 void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_mmxext(uint8_t *dst, int16_t *tmp, int dstStride, int h);\
-void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride, int size);\
+void ff_ ## OPNAME ## _h264_qpel8_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride);\
+void ff_ ## OPNAME ## _h264_qpel16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride);\
 void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride);\
 void ff_ ## OPNAME ## _pixels8_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride);\
 void ff_ ## OPNAME ## _pixels16_l2_shift5_sse2(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride);\
@@ -172,6 +173,18 @@ static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uin
     ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, srcStride, 16);\
 }\
 
+#define SSSE3_HV2_LOWPASS_WRAPPER(OPNAME) \
+static av_always_inline void \
+ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride, int size) \
+{\
+    if (size == 8)\
+        ff_ ## OPNAME ## _h264_qpel8_hv2_lowpass_ssse3(dst, tmp, dstStride);\
+    else\
+        ff_ ## OPNAME ## _h264_qpel16_hv2_lowpass_ssse3(dst, tmp, dstStride);\
+}
+SSSE3_HV2_LOWPASS_WRAPPER(avg)
+SSSE3_HV2_LOWPASS_WRAPPER(put)
+
 #define ff_put_h264_qpel8_h_lowpass_l2_sse2  ff_put_h264_qpel8_h_lowpass_l2_mmxext
 #define ff_avg_h264_qpel8_h_lowpass_l2_sse2  ff_avg_h264_qpel8_h_lowpass_l2_mmxext
 #define ff_put_h264_qpel16_h_lowpass_l2_sse2 ff_put_h264_qpel16_h_lowpass_l2_mmxext
diff --git a/libavcodec/x86/h264_qpel_8bit.asm b/libavcodec/x86/h264_qpel_8bit.asm
index 9c4957f8e7..7b6b51be04 100644
--- a/libavcodec/x86/h264_qpel_8bit.asm
+++ b/libavcodec/x86/h264_qpel_8bit.asm
@@ -608,11 +608,14 @@ QPEL8OR16_HV2_LOWPASS_OP put
 QPEL8OR16_HV2_LOWPASS_OP avg
 
 %macro QPEL8OR16_HV2_LOWPASS_OP_XMM 1
-cglobal %1_h264_qpel8or16_hv2_lowpass, 4,4,8 ; dst, tmp, dstStride, size
+%ifidn %1, avg
+cglobal %1_h264_qpel8_hv2_lowpass, 3,4,7 ; dst, tmp, dstStride
+%else
+cglobal %1_h264_qpel8_hv2_lowpass, 3,4,6 ; dst, tmp, dstStride
+%endif
     movsxdifnidn  r2, r2d
-    cmp          r3d, 16
-    je         .op16
-.loop8:
+    mov          r3d, 8
+.loop:
     mova          m1, [r1+16]
     mova          m0, [r1]
     mova          m2, m1
@@ -635,13 +638,17 @@ cglobal %1_h264_qpel8or16_hv2_lowpass, 4,4,8 ; dst, tmp, dstStride, size
     paddw         m0, m2
     psraw         m0, 6
     packuswb      m0, m0
-    op_%1h        m0, [r0], m7
+    op_%1h        m0, [r0], m6
     add           r1, 48
     add           r0, r2
     dec          r3d
-    jne       .loop8
-    jmp        .done
-.op16:
+    jne        .loop
+    RET
+
+cglobal %1_h264_qpel16_hv2_lowpass, 3,4,8 ; dst, tmp, dstStride
+    movsxdifnidn  r2, r2d
+    mov          r3d, 16
+.loop:
     mova          m4, [r1+32]
     mova          m5, [r1+16]
     mova          m7, [r1]
@@ -688,8 +695,7 @@ cglobal %1_h264_qpel8or16_hv2_lowpass, 4,4,8 ; dst, tmp, dstStride, size
     add           r1, 48
     add           r0, r2
     dec          r3d
-    jne        .op16
-.done:
+    jne        .loop
     RET
 %endmacro
author	Andreas Rheinhardt <[email protected]>	2025-09-29 04:30:19 +0200
committer	Andreas Rheinhardt <[email protected]>	2025-10-04 07:06:33 +0200
commit	fa9ea5113b48904daef9df6a282bd9c04c32258d (patch)
tree	981183e0fa96063211bd45899c9dfda409f20688
parent	400203c00c3b5c766d2c3f79c72d644d3d38ee55 (diff)