aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/x86
diff options
context:
space:
mode:
authorLoren Merritt <lorenm@u.washington.edu>2009-10-18 20:10:10 +0000
committerLoren Merritt <lorenm@u.washington.edu>2009-10-18 20:10:10 +0000
commit2f77923d72c35f4a10b9bb1d1086d0edd7f43dde (patch)
tree2c9e7c89285fb2e5117f3d0b4aad5da055aa0fc1 /libavcodec/x86
parent1303d62d8416fa315a0cc7bbbe35cfdab787ea92 (diff)
downloadffmpeg-2f77923d72c35f4a10b9bb1d1086d0edd7f43dde.tar.gz
simd add_hfyu_left_prediction
2.2x faster than C on conroe, 3.6x on penryn. 4-6% faster huffyuv decoding if using left or plane mode and yuv Originally committed as revision 20287 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/x86')
-rw-r--r--libavcodec/x86/dsputil_mmx.c7
-rw-r--r--libavcodec/x86/dsputil_yasm.asm74
-rw-r--r--libavcodec/x86/x86inc.asm2
3 files changed, 83 insertions, 0 deletions
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 2e00aa2a24..3f6f1dae6b 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2385,6 +2385,8 @@ void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *top, uint8_t *diff, int w, int *left, int *left_top);
+int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, uint8_t *src, int w, int left);
+int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, uint8_t *src, int w, int left);
void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
void ff_x264_deblock_v8_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta);
@@ -2951,6 +2953,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_ssse3;
c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_ssse3;
c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3;
+#if HAVE_YASM
+ c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
+ if (mm_flags & FF_MM_SSE4) // not really sse4, just slow on Conroe
+ c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
+#endif
}
#endif
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
index 9d5e62e57d..d071186f5c 100644
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -21,6 +21,13 @@
%include "x86inc.asm"
+SECTION_RODATA
+pb_f: times 16 db 15
+pb_zzzzzzzz77777777: times 8 db -1
+pb_7: times 8 db 7
+pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
+pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
+
section .text align=16
%macro PSWAPD_SSE 2
@@ -150,3 +157,70 @@ cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_to
movzx r2d, byte [topq-1]
mov [left_topq], r2d
RET
+
+
+%macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned
+ add srcq, wq
+ add dstq, wq
+ neg wq
+%%.loop:
+ mova m1, [srcq+wq]
+ mova m2, m1
+ psllw m1, 8
+ paddb m1, m2
+ mova m2, m1
+ pshufb m1, m3
+ paddb m1, m2
+ pshufb m0, m5
+ mova m2, m1
+ pshufb m1, m4
+ paddb m1, m2
+%if mmsize == 16
+ mova m2, m1
+ pshufb m1, m6
+ paddb m1, m2
+%endif
+ paddb m0, m1
+%if %1
+ mova [dstq+wq], m0
+%else
+ movq [dstq+wq], m0
+ movhps [dstq+wq+8], m0
+%endif
+ add wq, mmsize
+ jl %%.loop
+ mov eax, mmsize-1
+ sub eax, wd
+ movd m1, eax
+ pshufb m0, m1
+ movd eax, m0
+ RET
+%endmacro
+
+; int ff_add_hfyu_left_prediction(uint8_t *dst, uint8_t *src, int w, int left)
+INIT_MMX
+cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
+.skip_prologue:
+ mova m5, [pb_7 GLOBAL]
+ mova m4, [pb_zzzz3333zzzzbbbb GLOBAL]
+ mova m3, [pb_zz11zz55zz99zzdd GLOBAL]
+ movd m0, leftm
+ psllq m0, 56
+ ADD_HFYU_LEFT_LOOP 1
+
+INIT_XMM
+cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
+ mova m5, [pb_f GLOBAL]
+ mova m6, [pb_zzzzzzzz77777777 GLOBAL]
+ mova m4, [pb_zzzz3333zzzzbbbb GLOBAL]
+ mova m3, [pb_zz11zz55zz99zzdd GLOBAL]
+ movd m0, leftm
+ pslldq m0, 15
+ test srcq, 15
+ jnz ff_add_hfyu_left_prediction_ssse3 %+ .skip_prologue
+ test dstq, 15
+ jnz .unaligned
+ ADD_HFYU_LEFT_LOOP 1
+.unaligned:
+ ADD_HFYU_LEFT_LOOP 0
+
diff --git a/libavcodec/x86/x86inc.asm b/libavcodec/x86/x86inc.asm
index 52624c3aca..e49c34f1b1 100644
--- a/libavcodec/x86/x86inc.asm
+++ b/libavcodec/x86/x86inc.asm
@@ -221,6 +221,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7
CAT_UNDEF arg_name %+ %%i, d
CAT_UNDEF arg_name %+ %%i, w
CAT_UNDEF arg_name %+ %%i, b
+ CAT_UNDEF arg_name %+ %%i, m
CAT_UNDEF arg_name, %%i
%assign %%i %%i+1
%endrep
@@ -232,6 +233,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7
%xdefine %1d r %+ %%i %+ d
%xdefine %1w r %+ %%i %+ w
%xdefine %1b r %+ %%i %+ b
+ %xdefine %1m r %+ %%i %+ m
CAT_XDEFINE arg_name, %%i, %1
%assign %%i %%i+1
%rotate 1