diff options
author | Ronald S. Bultje <rsbultje@gmail.com> | 2017-04-04 12:17:08 -0400 |
---|---|---|
committer | Ronald S. Bultje <rsbultje@gmail.com> | 2017-04-06 10:03:28 -0400 |
commit | 2f0591cfa3b773d7a2fec72b30ec25d4ffb0cb32 (patch) | |
tree | 83b57302f1308f0162c22927bd47fe82179d58fb /libavcodec/x86/cavsidct.asm | |
parent | c9d98c5649ac11617200bf19b1e027505251d3cf (diff) | |
download | ffmpeg-2f0591cfa3b773d7a2fec72b30ec25d4ffb0cb32.tar.gz |
cavs: add a sse2 idct implementation.
This makes using the function pointer ff_add_pixels_clamped() unnecessary,
since we always know what the best implementation is at compile-time.
Diffstat (limited to 'libavcodec/x86/cavsidct.asm')
-rw-r--r-- | libavcodec/x86/cavsidct.asm | 48 |
1 files changed, 47 insertions, 1 deletions
```diff
diff --git a/libavcodec/x86/cavsidct.asm b/libavcodec/x86/cavsidct.asm
index 5421196e1b..6c768c2646 100644
--- a/libavcodec/x86/cavsidct.asm
+++ b/libavcodec/x86/cavsidct.asm
@@ -29,11 +29,16 @@ cextern pw_64

 SECTION .text

-%macro CAVS_IDCT8_1D 2 ; source, round
+%macro CAVS_IDCT8_1D 2-3 1 ; source, round, init_load
+%if %3 == 1
     mova            m4, [%1+7*16]   ; m4 = src7
     mova            m5, [%1+1*16]   ; m5 = src1
     mova            m2, [%1+5*16]   ; m2 = src5
     mova            m7, [%1+3*16]   ; m7 = src3
+%else
+    SWAP             1, 7
+    SWAP             4, 6
+%endif
     mova            m0, m4
     mova            m3, m5
     mova            m6, m2
@@ -163,3 +168,44 @@ cglobal cavs_idct8, 2, 4, 8, 8 * 16, out, in, cnt, tmp
     jg .loop_2

     RET
+
+INIT_XMM sse2
+cglobal cavs_idct8, 2, 2, 8 + ARCH_X86_64, 0 - 8 * 16, out, in
+    CAVS_IDCT8_1D  inq, [pw_4]
+    psraw           m7, 3
+    psraw           m6, 3
+    psraw           m5, 3
+    psraw           m4, 3
+    psraw           m3, 3
+    psraw           m2, 3
+    psraw           m1, 3
+    psraw           m0, 3
+%if ARCH_X86_64
+    TRANSPOSE8x8W    7, 5, 3, 1, 0, 2, 4, 6, 8
+    mova    [rsp+4*16], m0
+%else
+    mova    [rsp+0*16], m4
+    TRANSPOSE8x8W    7, 5, 3, 1, 0, 2, 4, 6, [rsp+0*16], [rsp+4*16], 1
+%endif
+    mova    [rsp+0*16], m7
+    mova    [rsp+2*16], m3
+    mova    [rsp+6*16], m4
+    CAVS_IDCT8_1D  rsp, [pw_64], 0
+    psraw           m7, 7
+    psraw           m6, 7
+    psraw           m5, 7
+    psraw           m4, 7
+    psraw           m3, 7
+    psraw           m2, 7
+    psraw           m1, 7
+    psraw           m0, 7
+
+    mova   [outq+0*16], m7
+    mova   [outq+1*16], m5
+    mova   [outq+2*16], m3
+    mova   [outq+3*16], m1
+    mova   [outq+4*16], m0
+    mova   [outq+5*16], m2
+    mova   [outq+6*16], m4
+    mova   [outq+7*16], m6
+    RET
```