aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/x86/dsputil_yasm.asm
diff options
context:
space:
mode:
authorMichael Niedermayer <michaelni@gmx.at>2012-01-31 01:14:58 +0100
committerMichael Niedermayer <michaelni@gmx.at>2012-01-31 02:46:26 +0100
commit151ecc2aecd81718e2520936dd3c537d7e6fe2fc (patch)
treed30c9048773a1138a4567fead3fd2c440db745bc /libavcodec/x86/dsputil_yasm.asm
parentb8c16558828e73e933ae73b5888345d50e897dfc (diff)
parentd7edd359ec28142120eb7fde77b775309b6038d8 (diff)
downloadffmpeg-151ecc2aecd81718e2520936dd3c537d7e6fe2fc.tar.gz
Merge remote-tracking branch 'qatar/master'
* qatar/master: (26 commits) avconv: deprecate the -deinterlace option doc: Fix the name of the new function aacenc: make sure to encode enough frames to cover all input samples. aacenc: only use the number of input samples provided by the user. wmadec: Verify bitstream size makes sense before calling init_get_bits. kmvc: Log into a context at a log level constant. mpeg12: Pad framerate tab to 16 entries. kgv1dec: Increase offsets array size so it is large enough. kmvc: Check palsize. nsvdec: Propagate errors nsvdec: Be more careful with av_malloc(). nsvdec: Fix use of uninitialized streams. movenc: cosmetics: Get rid of camelCase identifiers swscale: more generic check for planar destination formats with alpha doc: Document mov/mp4 fragmentation options build: Use order-only prerequisites for creating FATE reference file dirs. x86 dsputil: provide SSE2/SSSE3 versions of bswap_buf rtsp: Remove some unused variables from ff_rtsp_connect(). avutil: make intfloat api public avformat_write_header(): detail error message ... Conflicts: doc/APIchanges doc/ffmpeg.texi doc/muxers.texi ffmpeg.c libavcodec/kmvc.c libavcodec/x86/Makefile libavcodec/x86/dsputil_yasm.asm libavcodec/x86/pngdsp-init.c libavformat/movenc.c libavformat/movenc.h libavformat/mpegtsenc.c libavformat/nsvdec.c libavformat/utils.c libavutil/avutil.h libswscale/swscale.c Merged-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec/x86/dsputil_yasm.asm')
-rw-r--r--libavcodec/x86/dsputil_yasm.asm124
1 files changed, 124 insertions, 0 deletions
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
index 6c52ac1ffb..ad9ec2c339 100644
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -30,6 +30,7 @@ pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
pb_revwords: db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
pd_16384: times 4 dd 16384
+pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
SECTION_TEXT
@@ -1180,3 +1181,126 @@ BUTTERFLIES_FLOAT_INTERLEAVE
INIT_YMM avx
BUTTERFLIES_FLOAT_INTERLEAVE
%endif
+
+INIT_XMM sse2
+; %1 = aligned/unaligned
+%macro BSWAP_LOOPS_SSE2 1
+ mov r3, r2
+ sar r2, 3
+ jz .left4_%1
+.loop8_%1:
+ mov%1 m0, [r1 + 0]
+ mov%1 m1, [r1 + 16]
+ pshuflw m0, m0, 10110001b
+ pshuflw m1, m1, 10110001b
+ pshufhw m0, m0, 10110001b
+ pshufhw m1, m1, 10110001b
+ mova m2, m0
+ mova m3, m1
+ psllw m0, 8
+ psllw m1, 8
+ psrlw m2, 8
+ psrlw m3, 8
+ por m2, m0
+ por m3, m1
+ mova [r0 + 0], m2
+ mova [r0 + 16], m3
+ add r1, 32
+ add r0, 32
+ dec r2
+ jnz .loop8_%1
+.left4_%1:
+ mov r2, r3
+ and r3, 4
+ jz .left
+ mov%1 m0, [r1]
+ pshuflw m0, m0, 10110001b
+ pshufhw m0, m0, 10110001b
+ mova m2, m0
+ psllw m0, 8
+ psrlw m2, 8
+ por m2, m0
+ mova [r0], m2
+ add r1, 16
+ add r0, 16
+%endmacro
+
+; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
+cglobal bswap32_buf, 3,4,5
+ mov r3, r1
+ and r3, 15
+ jz .start_align
+ BSWAP_LOOPS_SSE2 u
+ jmp .left
+.start_align:
+ BSWAP_LOOPS_SSE2 a
+.left:
+ and r2, 3
+ jz .end
+.loop2:
+ mov r3d, [r1]
+ bswap r3d
+ mov [r0], r3d
+ add r1, 4
+ add r0, 4
+ dec r2
+ jnz .loop2
+.end
+ RET
+
+; %1 = aligned/unaligned
+%macro BSWAP_LOOPS_SSSE3 1
+ mov r3, r2
+ sar r2, 3
+ jz .left4_%1
+.loop8_%1:
+ mov%1 m0, [r1 + 0]
+ mov%1 m1, [r1 + 16]
+ pshufb m0, m2
+ pshufb m1, m2
+ mova [r0 + 0], m0
+ mova [r0 + 16], m1
+ add r0, 32
+ add r1, 32
+ dec r2
+ jnz .loop8_%1
+.left4_%1:
+ mov r2, r3
+ and r3, 4
+ jz .left2
+ mov%1 m0, [r1]
+ pshufb m0, m2
+ mova [r0], m0
+ add r1, 16
+ add r0, 16
+%endmacro
+
+INIT_XMM ssse3
+; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
+cglobal bswap32_buf, 3,4,3
+ mov r3, r1
+ mova m2, [pb_bswap32]
+ and r3, 15
+ jz .start_align
+ BSWAP_LOOPS_SSSE3 u
+ jmp .left2
+.start_align:
+ BSWAP_LOOPS_SSSE3 a
+.left2:
+ mov r3, r2
+ and r2, 2
+ jz .left1
+ movq m0, [r1]
+ pshufb m0, m2
+ movq [r0], m0
+ add r1, 8
+ add r0, 8
+.left1:
+ and r3, 1
+ jz .end
+ mov r2d, [r1]
+ bswap r2d
+ mov [r0], r2d
+.end:
+ RET
+