diff options
author | Michael Niedermayer <michaelni@gmx.at> | 2012-01-31 01:14:58 +0100 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2012-01-31 02:46:26 +0100 |
commit | 151ecc2aecd81718e2520936dd3c537d7e6fe2fc (patch) | |
tree | d30c9048773a1138a4567fead3fd2c440db745bc /libavcodec/x86/dsputil_yasm.asm | |
parent | b8c16558828e73e933ae73b5888345d50e897dfc (diff) | |
parent | d7edd359ec28142120eb7fde77b775309b6038d8 (diff) | |
download | ffmpeg-151ecc2aecd81718e2520936dd3c537d7e6fe2fc.tar.gz |
Merge remote-tracking branch 'qatar/master'
* qatar/master: (26 commits)
avconv: deprecate the -deinterlace option
doc: Fix the name of the new function
aacenc: make sure to encode enough frames to cover all input samples.
aacenc: only use the number of input samples provided by the user.
wmadec: Verify bitstream size makes sense before calling init_get_bits.
kmvc: Log into a context at a log level constant.
mpeg12: Pad framerate tab to 16 entries.
kgv1dec: Increase offsets array size so it is large enough.
kmvc: Check palsize.
nsvdec: Propagate errors
nsvdec: Be more careful with av_malloc().
nsvdec: Fix use of uninitialized streams.
movenc: cosmetics: Get rid of camelCase identifiers
swscale: more generic check for planar destination formats with alpha
doc: Document mov/mp4 fragmentation options
build: Use order-only prerequisites for creating FATE reference file dirs.
x86 dsputil: provide SSE2/SSSE3 versions of bswap_buf
rtsp: Remove some unused variables from ff_rtsp_connect().
avutil: make intfloat api public
avformat_write_header(): detail error message
...
Conflicts:
doc/APIchanges
doc/ffmpeg.texi
doc/muxers.texi
ffmpeg.c
libavcodec/kmvc.c
libavcodec/x86/Makefile
libavcodec/x86/dsputil_yasm.asm
libavcodec/x86/pngdsp-init.c
libavformat/movenc.c
libavformat/movenc.h
libavformat/mpegtsenc.c
libavformat/nsvdec.c
libavformat/utils.c
libavutil/avutil.h
libswscale/swscale.c
Merged-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec/x86/dsputil_yasm.asm')
-rw-r--r-- | libavcodec/x86/dsputil_yasm.asm | 124 |
1 files changed, 124 insertions, 0 deletions
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm index 6c52ac1ffb..ad9ec2c339 100644 --- a/libavcodec/x86/dsputil_yasm.asm +++ b/libavcodec/x86/dsputil_yasm.asm @@ -30,6 +30,7 @@ pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11 pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13 pb_revwords: db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 pd_16384: times 4 dd 16384 +pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 SECTION_TEXT @@ -1180,3 +1181,126 @@ BUTTERFLIES_FLOAT_INTERLEAVE INIT_YMM avx BUTTERFLIES_FLOAT_INTERLEAVE %endif + +INIT_XMM sse2 +; %1 = aligned/unaligned +%macro BSWAP_LOOPS_SSE2 1 + mov r3, r2 + sar r2, 3 + jz .left4_%1 +.loop8_%1: + mov%1 m0, [r1 + 0] + mov%1 m1, [r1 + 16] + pshuflw m0, m0, 10110001b + pshuflw m1, m1, 10110001b + pshufhw m0, m0, 10110001b + pshufhw m1, m1, 10110001b + mova m2, m0 + mova m3, m1 + psllw m0, 8 + psllw m1, 8 + psrlw m2, 8 + psrlw m3, 8 + por m2, m0 + por m3, m1 + mova [r0 + 0], m2 + mova [r0 + 16], m3 + add r1, 32 + add r0, 32 + dec r2 + jnz .loop8_%1 +.left4_%1: + mov r2, r3 + and r3, 4 + jz .left + mov%1 m0, [r1] + pshuflw m0, m0, 10110001b + pshufhw m0, m0, 10110001b + mova m2, m0 + psllw m0, 8 + psrlw m2, 8 + por m2, m0 + mova [r0], m2 + add r1, 16 + add r0, 16 +%endmacro + +; void bswap_buf(uint32_t *dst, const uint32_t *src, int w); +cglobal bswap32_buf, 3,4,5 + mov r3, r1 + and r3, 15 + jz .start_align + BSWAP_LOOPS_SSE2 u + jmp .left +.start_align: + BSWAP_LOOPS_SSE2 a +.left: + and r2, 3 + jz .end +.loop2: + mov r3d, [r1] + bswap r3d + mov [r0], r3d + add r1, 4 + add r0, 4 + dec r2 + jnz .loop2 +.end + RET + +; %1 = aligned/unaligned +%macro BSWAP_LOOPS_SSSE3 1 + mov r3, r2 + sar r2, 3 + jz .left4_%1 +.loop8_%1: + mov%1 m0, [r1 + 0] + mov%1 m1, [r1 + 16] + pshufb m0, m2 + pshufb m1, m2 + mova [r0 + 0], m0 + mova [r0 + 16], m1 + add r0, 32 + add r1, 32 + dec r2 + jnz .loop8_%1 +.left4_%1: + mov r2, r3 + and r3, 4 + jz .left2 + mov%1 m0, [r1] + pshufb m0, m2 + mova [r0], m0 + add r1, 16 + add r0, 16 +%endmacro + +INIT_XMM ssse3 +; void bswap_buf(uint32_t *dst, const uint32_t *src, int w); +cglobal bswap32_buf, 3,4,3 + mov r3, r1 + mova m2, [pb_bswap32] + and r3, 15 + jz .start_align + BSWAP_LOOPS_SSSE3 u + jmp .left2 +.start_align: + BSWAP_LOOPS_SSSE3 a +.left2: + mov r3, r2 + and r2, 2 + jz .left1 + movq m0, [r1] + pshufb m0, m2 + movq [r0], m0 + add r1, 8 + add r0, 8 +.left1: + and r3, 1 + jz .end + mov r2d, [r1] + bswap r2d + mov [r0], r2d +.end: + RET + |