diff options
author | Diego Biurrun <diego@biurrun.de> | 2014-02-13 17:57:05 +0100 |
---|---|---|
committer | Diego Biurrun <diego@biurrun.de> | 2014-06-22 18:22:31 -0700 |
commit | c67b449bebbe0b35c73b203683e77a0a649bc765 (patch) | |
tree | fef2691cbb548198024dbc1461419dfdd9d3fea2 /libavcodec/x86/bswapdsp.asm | |
parent | 7b9ef8d701c319c26f7d0664fe977e176764c74e (diff) | |
download | ffmpeg-c67b449bebbe0b35c73b203683e77a0a649bc765.tar.gz |
dsputil: Split bswap*_buf() off into a separate context
Diffstat (limited to 'libavcodec/x86/bswapdsp.asm')
-rw-r--r-- | libavcodec/x86/bswapdsp.asm | 135 |
1 files changed, 135 insertions, 0 deletions
diff --git a/libavcodec/x86/bswapdsp.asm b/libavcodec/x86/bswapdsp.asm new file mode 100644 index 0000000000..17a6cb1be3 --- /dev/null +++ b/libavcodec/x86/bswapdsp.asm @@ -0,0 +1,135 @@ +;****************************************************************************** +;* optimized bswap buffer functions +;* Copyright (c) 2008 Loren Merritt +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA +pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 + +SECTION_TEXT + +; %1 = aligned/unaligned +%macro BSWAP_LOOPS 1 + mov r3, r2 + sar r2, 3 + jz .left4_%1 +.loop8_%1: + mov%1 m0, [r1 + 0] + mov%1 m1, [r1 + 16] +%if cpuflag(ssse3) + pshufb m0, m2 + pshufb m1, m2 + mov%1 [r0 + 0], m0 + mov%1 [r0 + 16], m1 +%else + pshuflw m0, m0, 10110001b + pshuflw m1, m1, 10110001b + pshufhw m0, m0, 10110001b + pshufhw m1, m1, 10110001b + mova m2, m0 + mova m3, m1 + psllw m0, 8 + psllw m1, 8 + psrlw m2, 8 + psrlw m3, 8 + por m2, m0 + por m3, m1 + mov%1 [r0 + 0], m2 + mov%1 [r0 + 16], m3 +%endif + add r0, 32 + add r1, 32 + dec r2 + jnz .loop8_%1 +.left4_%1: + mov r2, r3 + and r3, 4 + jz .left + mov%1 m0, [r1] +%if cpuflag(ssse3) + pshufb m0, m2 + mov%1 [r0], m0 +%else + pshuflw m0, m0, 10110001b + pshufhw m0, m0, 10110001b + mova m2, m0 + psllw m0, 8 + psrlw m2, 8 + por m2, m0 + mov%1 [r0], m2 +%endif + add r1, 16 + add r0, 16 +%endmacro + +; void ff_bswap_buf(uint32_t *dst, const uint32_t *src, int w); +%macro BSWAP32_BUF 0 +%if cpuflag(ssse3) +cglobal bswap32_buf, 3,4,3 + mov r3, r1 + mova m2, [pb_bswap32] +%else +cglobal bswap32_buf, 3,4,5 + mov r3, r1 +%endif + and r3, 15 + jz .start_align + BSWAP_LOOPS u + jmp .left +.start_align: + BSWAP_LOOPS a +.left: +%if cpuflag(ssse3) + mov r3, r2 + and r2, 2 + jz .left1 + movq m0, [r1] + pshufb m0, m2 + movq [r0], m0 + add r1, 8 + add r0, 8 +.left1: + and r3, 1 + jz .end + mov r2d, [r1] + bswap r2d + mov [r0], r2d +%else + and r2, 3 + jz .end +.loop2: + mov r3d, [r1] + bswap r3d + mov [r0], r3d + add r1, 4 + add r0, 4 + dec r2 + jnz .loop2 +%endif +.end: + RET +%endmacro + +INIT_XMM sse2 +BSWAP32_BUF + +INIT_XMM ssse3 +BSWAP32_BUF |