avcodec/fpel: Avoid loop in ff_avg_pixels4_mmxext()

ff_avg_pixels4_mmxext() is only used by h264_qpel.c and only with a height of four. The function body is unrolled for four rows and uses a loop only in order to handle multiples of four as height. Remove the loop and the height parameter and move the function to h264_qpel_8bit.asm. This leads to a bit of code duplication, but this is simpler than all the %if checks necessary to achieve the same outcome in fpel.asm.

Reviewed-by: James Almer <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
author: Andreas Rheinhardt <[email protected]> 2025-09-29 00:11:47 +0200
committer: Andreas Rheinhardt <[email protected]> 2025-10-04 07:06:33 +0200
commit: 279b6f3cf56dbfcfb3980a1d955d66628e80aa24 (patch)
tree: e1cf1887e460a34d058f7f9539e62226a0469ac1
parent: e340f31b898e51d12ec1ced52fcbf03fb4635533 (diff)
4 files changed, 28 insertions, 16 deletions
diff --git a/libavcodec/x86/fpel.asm b/libavcodec/x86/fpel.asm
index 477caa8b44..8ca684efa9 100644
--- a/libavcodec/x86/fpel.asm
+++ b/libavcodec/x86/fpel.asm
@@ -63,7 +63,6 @@ INIT_MMX mmx
 OP_PIXELS put, 8
 
 INIT_MMX mmxext
-OP_PIXELS avg, 4
 OP_PIXELS avg, 8
 
 INIT_XMM sse2
diff --git a/libavcodec/x86/fpel.h b/libavcodec/x86/fpel.h
index 851a70b99f..dc69e1cd83 100644
--- a/libavcodec/x86/fpel.h
+++ b/libavcodec/x86/fpel.h
@@ -22,8 +22,6 @@
 #include <stddef.h>
 #include <stdint.h>
 
-void ff_avg_pixels4_mmxext(uint8_t *block, const uint8_t *pixels,
-                           ptrdiff_t line_size, int h);
 void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h);
 void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c
index 43e68d2d97..649bfabda8 100644
--- a/libavcodec/x86/h264_qpel.c
+++ b/libavcodec/x86/h264_qpel.c
@@ -30,6 +30,7 @@
 #include "fpel.h"
 
 #if HAVE_X86ASM
+void ff_avg_pixels4_mmxext(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
 void ff_put_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
                               ptrdiff_t stride);
 void ff_avg_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
@@ -52,7 +53,6 @@ void ff_avg_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t
 #define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext
 #define ff_avg_pixels16_l2_sse2(dst, src1, src2, dststride, src1stride, h) \
     ff_avg_pixels16_l2_mmxext((dst), (src1), (src2), (dststride), (src1stride))
-#define ff_put_pixels4_mmxext(...)
 
 #define DEF_QPEL(OPNAME)\
 void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
@@ -191,8 +191,7 @@ static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uin
 #define ff_put_h264_qpel8or16_hv2_lowpass_sse2 ff_put_h264_qpel8or16_hv2_lowpass_mmxext
 #define ff_avg_h264_qpel8or16_hv2_lowpass_sse2 ff_avg_h264_qpel8or16_hv2_lowpass_mmxext
 
-#define H264_MC_C_V_H_HV(OPNAME, SIZE, MMX, ALIGN) \
-H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
+#define H264_MC_V_H_HV(OPNAME, SIZE, MMX, ALIGN) \
 H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
 H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
 H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\
@@ -208,11 +207,11 @@ static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, const uint8_t *src,
     ff_avg_pixels16_sse2(dst, src, stride, 16);
 }
 
-#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
-av_unused static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
-{\
-    ff_ ## OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
-}\
+static void avg_h264_qpel8_mc00_mmxext(uint8_t *dst, const uint8_t *src,
+                                       ptrdiff_t stride)
+{
+    ff_avg_pixels8_mmxext(dst, src, stride, 8);
+}
 
 #define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
 static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
@@ -346,8 +345,7 @@ QPEL_H264_H_XMM(avg_,AVG_MMXEXT_OP, ssse3)
 QPEL_H264_HV_XMM(put_,       PUT_OP, ssse3)
 QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, ssse3)
 
-H264_MC(H264_MC_C_V_H_HV, 4, mmxext, 8)
-H264_MC_C(avg_, 8, mmxext, 8)
+H264_MC(H264_MC_V_H_HV, 4, mmxext, 8)
 H264_MC_816(H264_MC_V, sse2)
 H264_MC_816(H264_MC_HV, sse2)
 H264_MC_816(H264_MC_H, ssse3)
@@ -461,7 +459,8 @@ av_cold void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth)
         if (!high_bit_depth) {
             SET_QPEL_FUNCS_1PP(put_h264_qpel, 2,  4, mmxext, );
             c->avg_h264_qpel_pixels_tab[1][0] = avg_h264_qpel8_mc00_mmxext;
-            SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, mmxext, );
+            SET_QPEL_FUNCS_1PP(avg_h264_qpel, 2,  4, mmxext, );
+            c->avg_h264_qpel_pixels_tab[2][0] = ff_avg_pixels4_mmxext;
         } else if (bit_depth == 10) {
             SET_QPEL_FUNCS(put_h264_qpel, 2, 4,  10_mmxext, ff_);
             SET_QPEL_FUNCS(avg_h264_qpel, 2, 4,  10_mmxext, ff_);
diff --git a/libavcodec/x86/h264_qpel_8bit.asm b/libavcodec/x86/h264_qpel_8bit.asm
index 6a134ee5b4..07056f1215 100644
--- a/libavcodec/x86/h264_qpel_8bit.asm
+++ b/libavcodec/x86/h264_qpel_8bit.asm
@@ -25,14 +25,30 @@
 
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA 32
-
 cextern pw_16
 cextern pw_5
 cextern pb_0
 
 SECTION .text
 
+; void ff_avg_pixels4_mmxext(uint8_t *block, const uint8_t *pixels,
+;                            ptrdiff_t line_size)
+INIT_MMX mmxext
+cglobal avg_pixels4, 3,4
+    lea          r3, [r2*3]
+    movh         m0, [r1]
+    movh         m1, [r1+r2]
+    movh         m2, [r1+r2*2]
+    movh         m3, [r1+r3]
+    pavgb        m0, [r0]
+    pavgb        m1, [r0+r2]
+    pavgb        m2, [r0+r2*2]
+    pavgb        m3, [r0+r3]
+    movh       [r0], m0
+    movh    [r0+r2], m1
+    movh  [r0+r2*2], m2
+    movh    [r0+r3], m3
+    RET
 
 %macro op_avgh 3
     movh   %3, %2
author	Andreas Rheinhardt <[email protected]>	2025-09-29 00:11:47 +0200
committer	Andreas Rheinhardt <[email protected]>	2025-10-04 07:06:33 +0200
commit	279b6f3cf56dbfcfb3980a1d955d66628e80aa24 (patch)
tree	e1cf1887e460a34d058f7f9539e62226a0469ac1
parent	e340f31b898e51d12ec1ced52fcbf03fb4635533 (diff)