author    | Henrik Gramner <henrik@gramner.com> | 2015-07-16 00:10:27 +0200
committer | Luca Barbato <lu_zero@gentoo.org>   | 2015-07-17 20:02:28 +0200
commit    | a344e5d094ebcf9a23acf3a27c56cbbbc829db42 (patch)
tree      | c63923f571dd3ce2c776dbe4ad1a2b6184d783b1
parent    | f5ee23004d1177ca6dd99b92cb4ff4b94b2eae09 (diff)
download  | ffmpeg-a344e5d094ebcf9a23acf3a27c56cbbbc829db42.tar.gz
x86: bswapdsp: Don't treat 32-bit integers as 64-bit
The upper halves of 64-bit registers holding 32-bit values are not guaranteed to be zero on x86-64.
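A minimal sketch of the hazard (NASM syntax with x86inc-style register names; the scenario is hypothetical, not taken from the patch): a 32-bit count handed over in a 64-bit GPR may carry garbage in bits 63:32, and 64-bit arithmetic drags that garbage into the live bits, whereas the 32-bit form ignores it, and writing a 32-bit register zero-extends into the upper half:

```asm
; Hypothetical example: a 32-bit int count lives in the low half of r2,
; bits 63:32 are unspecified.
sar      r2, 3      ; BAD: junk from the upper half shifts down into the count
sar      r2d, 3     ; OK: reads only the low 32 bits; the 32-bit write also
                    ; clears bits 63:32 of r2
```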
Also use `test` instead of `and` when the result isn't used for anything other
than as a branch condition; this allows some register moves to be eliminated.
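For instance, the pointer-alignment check in bswap32_buf no longer needs a scratch copy, since `test` computes the AND only to set the flags and writes no register (excerpted from the hunks below):

```asm
; before: `and` clobbers its destination, so r1 has to be copied first
mov      r3, r1
and      r3, 15
jz       .start_align

; after: `test` leaves r1 intact and the copy is gone
test     r1, 15
jz       .start_align
```

The same pattern removes the copies guarding the `test r2d, 2` and `test r2d, 1` tail checks in the SSSE3 path.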
Signed-off-by: Luca Barbato <lu_zero@gentoo.org>
-rw-r--r-- | libavcodec/x86/bswapdsp.asm | 23
1 file changed, 10 insertions(+), 13 deletions(-)
diff --git a/libavcodec/x86/bswapdsp.asm b/libavcodec/x86/bswapdsp.asm
index 17a6cb1be3..42580a392c 100644
--- a/libavcodec/x86/bswapdsp.asm
+++ b/libavcodec/x86/bswapdsp.asm
@@ -28,8 +28,8 @@ SECTION_TEXT
 ; %1 = aligned/unaligned
 %macro BSWAP_LOOPS 1
-    mov      r3, r2
-    sar      r2, 3
+    mov      r3d, r2d
+    sar      r2d, 3
     jz       .left4_%1
 .loop8_%1:
     mov%1    m0, [r1 + 0]
@@ -57,11 +57,11 @@ SECTION_TEXT
 %endif
     add      r0, 32
     add      r1, 32
-    dec      r2
+    dec      r2d
     jnz      .loop8_%1
 .left4_%1:
-    mov      r2, r3
-    and      r3, 4
+    mov      r2d, r3d
+    test     r3d, 4
     jz       .left
     mov%1    m0, [r1]
 %if cpuflag(ssse3)
@@ -84,13 +84,11 @@ SECTION_TEXT
 %macro BSWAP32_BUF 0
 %if cpuflag(ssse3)
 cglobal bswap32_buf, 3,4,3
-    mov      r3, r1
     mova     m2, [pb_bswap32]
 %else
 cglobal bswap32_buf, 3,4,5
-    mov      r3, r1
 %endif
-    and      r3, 15
+    test     r1, 15
     jz       .start_align
     BSWAP_LOOPS  u
     jmp      .left
@@ -98,8 +96,7 @@ cglobal bswap32_buf, 3,4,5
     BSWAP_LOOPS  a
 .left:
 %if cpuflag(ssse3)
-    mov      r3, r2
-    and      r2, 2
+    test     r2d, 2
     jz       .left1
     movq     m0, [r1]
     pshufb   m0, m2
@@ -107,13 +104,13 @@ cglobal bswap32_buf, 3,4,5
     add      r1, 8
     add      r0, 8
 .left1:
-    and      r3, 1
+    test     r2d, 1
     jz       .end
     mov      r2d, [r1]
     bswap    r2d
     mov      [r0], r2d
 %else
-    and      r2, 3
+    and      r2d, 3
     jz       .end
 .loop2:
     mov      r3d, [r1]
@@ -121,7 +118,7 @@ cglobal bswap32_buf, 3,4,5
     mov      [r0], r3d
     add      r1, 4
     add      r0, 4
-    dec      r2
+    dec      r2d
     jnz      .loop2
 %endif
 .end: