diff options
author | Ronald S. Bultje <rsbultje@gmail.com> | 2012-01-27 23:21:55 +0800 |
---|---|---|
committer | Ronald S. Bultje <rsbultje@gmail.com> | 2012-01-29 18:47:50 -0800 |
commit | 59f474b49dce5032d2d446d687eef3228e941192 (patch) | |
tree | 68bd205a1a629e452692dc679996ba8cd40dac4e /libavcodec/x86/pngdsp.asm | |
parent | 3715d841a619f1cbc4776d9b00575dae6fb6534a (diff) | |
download | ffmpeg-59f474b49dce5032d2d446d687eef3228e941192.tar.gz |
png: convert DSP functions to yasm.
Diffstat (limited to 'libavcodec/x86/pngdsp.asm')
-rw-r--r-- | libavcodec/x86/pngdsp.asm | 142 |
1 files changed, 142 insertions, 0 deletions
diff --git a/libavcodec/x86/pngdsp.asm b/libavcodec/x86/pngdsp.asm new file mode 100644 index 0000000000..92425ad099 --- /dev/null +++ b/libavcodec/x86/pngdsp.asm @@ -0,0 +1,142 @@ +;****************************************************************************** +;* x86 optimizations for PNG decoding +;* +;* Copyright (c) 2008 Loren Merritt <lorenm@u.washington.edu> +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86inc.asm" +%include "x86util.asm" + +SECTION_RODATA + +cextern pw_255 + +section .text align=16 + +; %1 = nr. 
; of xmm registers used
;
;------------------------------------------------------------------------------
; ADD_BYTES_FN <nr. of xmm registers used>
;
; Emits add_bytes_l2(dst, src1, src2, w) for the instruction set selected by
; the preceding INIT_* line:
;
;     dst[i] = src1[i] + src2[i]      (8-bit wrapping add), 0 <= i < w
;
; w arrives as a 32-bit value and is sign-extended on x86-64, so it is treated
; as a signed count; w <= 0 falls straight through both loops.
;
; Register roles:
;     iq   byte index, counts up from 0
;     wq   full byte count w
;     waq  w rounded down to a multiple of 2*mmsize (vector loop bound);
;          once the vector loop is done its low byte (wab) is reused as the
;          scratch register of the scalar loop
;
; NOTE(review): the vector loop uses mova for loads and stores; presumably the
; buffers are aligned as required by the selected register width - confirm
; with callers before instantiating this for wider registers.
;------------------------------------------------------------------------------
%macro ADD_BYTES_FN 1
cglobal add_bytes_l2, 4, 6, %1, dst, src1, src2, wa, w, i
%if ARCH_X86_64
    movsxd             waq, wad         ; widen the 32-bit count to 64 bits
%endif
    xor                 iq, iq          ; i = 0

    ; vector loop: two registers (2*mmsize bytes) per iteration
    mov                 wq, waq         ; keep the exact count for the tail
    and                waq, ~(mmsize*2-1) ; bound = w & ~(2*mmsize - 1)
    jmp .end_v                          ; test first: may run zero times
.loop_v:
    mova                m0, [src1q+iq]
    mova                m1, [src1q+iq+mmsize]
    paddb               m0, [src2q+iq]
    paddb               m1, [src2q+iq+mmsize]
    mova  [dstq+iq       ], m0
    mova  [dstq+iq+mmsize], m1
    add                 iq, mmsize*2
.end_v:
    cmp                 iq, waq
    jl .loop_v                          ; signed: index vs. signed count

    ; scalar loop for leftover: the remaining w - i bytes, one at a time
    jmp .end_s                          ; test first: may run zero times
.loop_s:
    mov                wab, [src1q+iq]  ; wab is free now, see header
    add                wab, [src2q+iq]
    mov          [dstq+iq], wab
    inc                 iq
.end_s:
    cmp                 iq, wq
    jl .loop_s                          ; signed: index vs. signed count
    REP_RET
%endmacro

INIT_MMX mmx
ADD_BYTES_FN 0

;------------------------------------------------------------------------------
; ADD_PAETH_PRED_FN <nr. of xmm registers used>
;
; Emits add_png_paeth_prediction(dst, src, top, w, bpp): adds the PNG Paeth
; predictor to a row of bytes. With
;
;     a = dst[x - bpp]   (left, already reconstructed)
;     b = top[x]         (above)
;     c = top[x - bpp]   (above-left)
;     pa = |b - c|,  pb = |a - c|,  pc = |a + b - 2c|
;
; the predictor is a if pa <= min(pb, pc), else b if pb <= pc, else c, and
;
;     dst[x] = (src[x] + predictor) & 255
;
; Each loop iteration loads and stores mmsize/2 bytes (widened to 16-bit
; lanes for the arithmetic) and then advances dst by bpp bytes; the result of
; one iteration is the "a" input of the next. The loop body always runs at
; least once and repeats while dst <= end, end = dst + w - (mmsize/2 - 1).
;
; NOTE(review): because a full mmsize/2-byte group is read and written at
; every step, callers presumably choose w/bpp so that the last group stays
; inside the row - confirm against the call sites.
;
; Register roles inside the loop:
;     m0  a (left), later src + predictor
;     m1  b (above)
;     m2  c (above-left) = previous iteration's b
;     m3  pa, then "a not selected" mask, then the selected a lanes
;     m4  pb, then "c selected" mask
;     m5  pc
;     m6  scratch / "b selected" mask
;     m7  all-zero (unpack source; clobbered and re-zeroed on the non-SSSE3 path)
;
; topq and srcq are turned into offsets relative to dstq so that one pointer
; increment advances all three rows.
;
; NOTE(review): the seventh register name (cntr) is declared but never
; referenced in this body.
;------------------------------------------------------------------------------
%macro ADD_PAETH_PRED_FN 1
cglobal add_png_paeth_prediction, 5, 7, %1, dst, src, top, w, bpp, end, cntr
%if ARCH_X86_64
    movsxd            bppq, bppd        ; widen the 32-bit arguments
    movsxd              wq, wd
%endif
    lea               endq, [dstq+wq-(mmsize/2-1)] ; last address the loop may start at
    sub               topq, dstq        ; topq = top - dst
    sub               srcq, dstq        ; srcq = src - dst
    sub               dstq, bppq        ; step back to the left neighbour
    pxor                m7, m7          ; m7 = 0
    movh                m0, [dstq]      ; a = dst[-bpp]
    movh                m1, [topq+dstq] ; top[-bpp]; becomes c in the first iteration
    punpcklbw           m0, m7          ; bytes -> words
    punpcklbw           m1, m7
    add               dstq, bppq        ; back to the first output position
.loop:
    mova                m2, m1          ; c = previous b
    movh                m1, [topq+dstq] ; b = top[x]
    mova                m3, m2
    punpcklbw           m1, m7
    mova                m4, m2
    psubw               m3, m1          ; m3 = c - b
    psubw               m4, m0          ; m4 = c - a
    mova                m5, m3
    paddw               m5, m4          ; m5 = 2c - a - b
%if cpuflag(ssse3)
    pabsw               m3, m3          ; pa
    pabsw               m4, m4          ; pb
    pabsw               m5, m5          ; pc
%else ; !cpuflag(ssse3)
    ; |v| = max(v, 0 - v); m7 is zero on entry and is restored to zero below
    psubw               m7, m5
    pmaxsw              m5, m7          ; pc
    pxor                m6, m6
    pxor                m7, m7
    psubw               m6, m3
    psubw               m7, m4
    pmaxsw              m3, m6          ; pa
    pmaxsw              m4, m7          ; pb
    pxor                m7, m7          ; m7 = 0 again
%endif ; cpuflag(ssse3)
    mova                m6, m4
    pminsw              m6, m5          ; min(pb, pc)
    pcmpgtw             m3, m6          ; m3 = pa > min(pb, pc)  -> a not selected
    pcmpgtw             m4, m5          ; m4 = pb > pc           -> c preferred over b
    mova                m6, m4
    pand                m4, m3          ; m4 = select c
    pandn               m6, m3          ; m6 = select b
    pandn               m3, m0          ; m3 = a where a is selected, else 0
    movh                m0, [srcq+dstq] ; src[x]
    pand                m6, m1          ; b where b is selected
    pand                m2, m4          ; c where c is selected
    punpcklbw           m0, m7
    paddw               m0, m6
    paddw               m3, m2
    paddw               m0, m3          ; src + selected predictor
    pand                m0, [pw_255]    ; keep the low byte of every lane
    mova                m3, m0          ; m0 stays unpacked: it is the next a
    packuswb            m3, m3
    movh            [dstq], m3
    add               dstq, bppq
    cmp               dstq, endq
    ; dstq and endq are addresses, so the comparison must be unsigned: a
    ; signed branch misorders two pointers that lie on opposite sides of the
    ; sign bit (possible for user-space buffers on x86-32) and would leave
    ; the loop after a single iteration.
    jbe .loop
    REP_RET
%endmacro

INIT_MMX mmx2
ADD_PAETH_PRED_FN 0

INIT_MMX ssse3
ADD_PAETH_PRED_FN 0