diff options
author | Andreas Rheinhardt <[email protected]> | 2025-09-29 02:31:42 +0200 |
---|---|---|
committer | Andreas Rheinhardt <[email protected]> | 2025-10-04 07:06:33 +0200 |
commit | 810bd3e62a4935683f2299dc4ebce2fb05c0d764 (patch) | |
tree | 797779a3a4fd2cc2deae9a50077c9934752fcc79 | |
parent | 279b6f3cf56dbfcfb3980a1d955d66628e80aa24 (diff) |
avcodec/x86/h264_qpel: Add ff_{avg,put}_pixels16_l2_shift5_sse2
Up until now this function was emulated via two calls
to ff_{avg,put}_pixels8_l2_shift5_mmxext(). Adding a dedicated
function proved beneficial both size-wise and performance-wise:
The new functions take 192B, yet the simplified calls save
256B with GCC and 320B with Clang here.
This change will also allow further optimizations.
Old benchmarks:
avg_h264_qpel_16_mc12_8_c: 1735.8 ( 1.00x)
avg_h264_qpel_16_mc12_8_sse2: 300.8 ( 5.77x)
avg_h264_qpel_16_mc12_8_ssse3: 233.3 ( 7.44x)
avg_h264_qpel_16_mc32_8_c: 1777.9 ( 1.00x)
avg_h264_qpel_16_mc32_8_sse2: 275.6 ( 6.45x)
avg_h264_qpel_16_mc32_8_ssse3: 235.7 ( 7.54x)
put_h264_qpel_16_mc12_8_c: 1808.2 ( 1.00x)
put_h264_qpel_16_mc12_8_sse2: 267.2 ( 6.77x)
put_h264_qpel_16_mc12_8_ssse3: 231.9 ( 7.80x)
put_h264_qpel_16_mc32_8_c: 1766.9 ( 1.00x)
put_h264_qpel_16_mc32_8_sse2: 272.9 ( 6.47x)
put_h264_qpel_16_mc32_8_ssse3: 229.5 ( 7.70x)
New benchmarks:
avg_h264_qpel_16_mc12_8_c: 1742.3 ( 1.00x)
avg_h264_qpel_16_mc12_8_sse2: 240.3 ( 7.25x)
avg_h264_qpel_16_mc12_8_ssse3: 214.8 ( 8.11x)
avg_h264_qpel_16_mc32_8_c: 1748.0 ( 1.00x)
avg_h264_qpel_16_mc32_8_sse2: 238.0 ( 7.35x)
avg_h264_qpel_16_mc32_8_ssse3: 209.2 ( 8.35x)
put_h264_qpel_16_mc12_8_c: 2014.4 ( 1.00x)
put_h264_qpel_16_mc12_8_sse2: 243.7 ( 8.27x)
put_h264_qpel_16_mc12_8_ssse3: 211.5 ( 9.52x)
put_h264_qpel_16_mc32_8_c: 1800.0 ( 1.00x)
put_h264_qpel_16_mc32_8_sse2: 238.8 ( 7.54x)
put_h264_qpel_16_mc32_8_ssse3: 206.7 ( 8.71x)
Reviewed-by: James Almer <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
-rw-r--r-- | libavcodec/x86/h264_qpel.c | 12 | ||||
-rw-r--r-- | libavcodec/x86/h264_qpel_8bit.asm | 27 |
2 files changed, 22 insertions, 17 deletions
diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c index 649bfabda8..b782d32bea 100644 --- a/libavcodec/x86/h264_qpel.c +++ b/libavcodec/x86/h264_qpel.c @@ -68,7 +68,8 @@ void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_sse2(const uint8_t *src, in void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_mmxext(uint8_t *dst, int16_t *tmp, int dstStride, int unused, int h);\ void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size);\ void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h);\ -void ff_ ## OPNAME ## _pixels8_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h); +void ff_ ## OPNAME ## _pixels8_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h);\ +void ff_ ## OPNAME ## _pixels16_l2_shift5_sse2(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h);\ DEF_QPEL(avg) DEF_QPEL(put) @@ -104,12 +105,6 @@ static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(u ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ }\ -\ -static av_always_inline void ff_ ## OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h)\ -{\ - ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\ - ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\ -}\ #if ARCH_X86_64 @@ -191,6 +186,9 @@ static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uin #define ff_put_h264_qpel8or16_hv2_lowpass_sse2 ff_put_h264_qpel8or16_hv2_lowpass_mmxext 
#define ff_avg_h264_qpel8or16_hv2_lowpass_sse2 ff_avg_h264_qpel8or16_hv2_lowpass_mmxext +#define ff_put_pixels16_l2_shift5_mmxext ff_put_pixels16_l2_shift5_sse2 +#define ff_avg_pixels16_l2_shift5_mmxext ff_avg_pixels16_l2_shift5_sse2 + #define H264_MC_V_H_HV(OPNAME, SIZE, MMX, ALIGN) \ H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\ H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\ diff --git a/libavcodec/x86/h264_qpel_8bit.asm b/libavcodec/x86/h264_qpel_8bit.asm index 07056f1215..fefa3aff01 100644 --- a/libavcodec/x86/h264_qpel_8bit.asm +++ b/libavcodec/x86/h264_qpel_8bit.asm @@ -734,15 +734,19 @@ PIXELS4_L2_SHIFT5 put PIXELS4_L2_SHIFT5 avg -%macro PIXELS8_L2_SHIFT5 1 -cglobal %1_pixels8_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h +%macro PIXELS_L2_SHIFT5 2 +%if cpuflag(sse2) +cglobal %1_pixels%2_l2_shift5, 6, 6, 4 ; dst, src16, src8, dstStride, src8Stride, h +%else +cglobal %1_pixels%2_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h +%endif movsxdifnidn r3, r3d movsxdifnidn r4, r4d .loop: - mova m0, [r1] - mova m1, [r1+8] - mova m2, [r1+48] - mova m3, [r1+48+8] + movu m0, [r1] + movu m1, [r1+%2] + movu m2, [r1+48] + movu m3, [r1+48+%2] psraw m0, 5 psraw m1, 5 psraw m2, 5 @@ -751,8 +755,8 @@ cglobal %1_pixels8_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h packuswb m2, m3 pavgb m0, [r2] pavgb m2, [r2+r4] - op_%1 m0, [r0], m4 - op_%1 m2, [r0+r3], m5 + op_%1 m0, [r0], m1 + op_%1 m2, [r0+r3], m1 lea r2, [r2+2*r4] add r1, 48*2 lea r0, [r0+2*r3] @@ -762,9 +766,12 @@ cglobal %1_pixels8_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h %endmacro INIT_MMX mmxext -PIXELS8_L2_SHIFT5 put -PIXELS8_L2_SHIFT5 avg +PIXELS_L2_SHIFT5 put, 8 +PIXELS_L2_SHIFT5 avg, 8 +INIT_XMM sse2 +PIXELS_L2_SHIFT5 put, 16 +PIXELS_L2_SHIFT5 avg, 16 %if ARCH_X86_64 %macro QPEL16_H_LOWPASS_L2_OP 1 |