diff options
author | James Almer <jamrial@gmail.com> | 2014-09-24 19:53:07 -0300 |
---|---|---|
committer | James Almer <jamrial@gmail.com> | 2014-09-24 21:52:13 -0300 |
commit | 4f4f08e6f00f9949c27e2f76bb7f732e6d16871e (patch) | |
tree | 1c277162e923ff1ea8f001890c8ce33b65f821de | |
parent | c99a8828148949430608c3866e671a599a5805ba (diff) | |
download | ffmpeg-4f4f08e6f00f9949c27e2f76bb7f732e6d16871e.tar.gz |
x86/idctdsp: port {put,add}_pixels_clamped to yasm

Also add sse2 versions for both.
put_pixels_clamped port and sse2 version originally written by Timothy Gu.

Reviewed-by: Michael Niedermayer <michaelni@gmx.at>
Signed-off-by: James Almer <jamrial@gmail.com>
-rw-r--r-- | libavcodec/x86/Makefile | 3 |
-rw-r--r-- | libavcodec/x86/idctdsp.asm | 103 |
-rw-r--r-- | libavcodec/x86/idctdsp.h | 4 |
-rw-r--r-- | libavcodec/x86/idctdsp_init.c | 7 |
-rw-r--r-- | libavcodec/x86/idctdsp_mmx.c | 134 |
5 files changed, 112 insertions, 139 deletions
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 7bf0e82ac0..9f34abddde 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -66,8 +66,7 @@ OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp_init.o # subsystems MMX-OBJS-$(CONFIG_DIRAC_DECODER) += x86/dirac_dwt.o MMX-OBJS-$(CONFIG_FDCTDSP) += x86/fdct.o -MMX-OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp_mmx.o \ - x86/simple_idct.o +MMX-OBJS-$(CONFIG_IDCTDSP) += x86/simple_idct.o # decoders/encoders MMX-OBJS-$(CONFIG_MPEG4_DECODER) += x86/xvididct_mmx.o \ diff --git a/libavcodec/x86/idctdsp.asm b/libavcodec/x86/idctdsp.asm index c3f37c4910..0aa73459e2 100644 --- a/libavcodec/x86/idctdsp.asm +++ b/libavcodec/x86/idctdsp.asm @@ -78,3 +78,106 @@ INIT_MMX mmx PUT_SIGNED_PIXELS_CLAMPED 0 INIT_XMM sse2 PUT_SIGNED_PIXELS_CLAMPED 3 + +;-------------------------------------------------------------------------- +; void ff_put_pixels_clamped(const int16_t *block, uint8_t *pixels, +; ptrdiff_t line_size); +;-------------------------------------------------------------------------- +; %1 = block offset +%macro PUT_PIXELS_CLAMPED_HALF 1 + mova m0, [blockq+mmsize*0+%1] + mova m1, [blockq+mmsize*2+%1] +%if mmsize == 8 + mova m2, [blockq+mmsize*4+%1] + mova m3, [blockq+mmsize*6+%1] +%endif + packuswb m0, [blockq+mmsize*1+%1] + packuswb m1, [blockq+mmsize*3+%1] +%if mmsize == 8 + packuswb m2, [blockq+mmsize*5+%1] + packuswb m3, [blockq+mmsize*7+%1] + movq [pixelsq], m0 + movq [lsizeq+pixelsq], m1 + movq [2*lsizeq+pixelsq], m2 + movq [lsize3q+pixelsq], m3 +%else + movq [pixelsq], m0 + movhps [lsizeq+pixelsq], m0 + movq [2*lsizeq+pixelsq], m1 + movhps [lsize3q+pixelsq], m1 +%endif +%endmacro + +%macro PUT_PIXELS_CLAMPED 0 +cglobal put_pixels_clamped, 3, 4, 2, block, pixels, lsize, lsize3 + lea lsize3q, [lsizeq*3] + PUT_PIXELS_CLAMPED_HALF 0 + lea pixelsq, [pixelsq+lsizeq*4] + PUT_PIXELS_CLAMPED_HALF 64 + RET +%endmacro + +INIT_MMX mmx +PUT_PIXELS_CLAMPED +INIT_XMM sse2 +PUT_PIXELS_CLAMPED + 
+;-------------------------------------------------------------------------- +; void ff_add_pixels_clamped(const int16_t *block, uint8_t *pixels, +; ptrdiff_t line_size); +;-------------------------------------------------------------------------- +; %1 = block offset +%macro ADD_PIXELS_CLAMPED 1 + mova m0, [blockq+mmsize*0+%1] + mova m1, [blockq+mmsize*1+%1] +%if mmsize == 8 + mova m5, [blockq+mmsize*2+%1] + mova m6, [blockq+mmsize*3+%1] +%endif + movq m2, [pixelsq] + movq m3, [pixelsq+lsizeq] +%if mmsize == 8 + mova m7, m2 + punpcklbw m2, m4 + punpckhbw m7, m4 + paddsw m0, m2 + paddsw m1, m7 + mova m7, m3 + punpcklbw m3, m4 + punpckhbw m7, m4 + paddsw m5, m3 + paddsw m6, m7 +%else + punpcklbw m2, m4 + punpcklbw m3, m4 + paddsw m0, m2 + paddsw m1, m3 +%endif + packuswb m0, m1 +%if mmsize == 8 + packuswb m5, m6 + movq [pixelsq], m0 + movq [pixelsq+lsizeq], m5 +%else + movq [pixelsq], m0 + movhps [pixelsq+lsizeq], m0 +%endif +%endmacro + +%macro ADD_PIXELS_CLAMPED 0 +cglobal add_pixels_clamped, 3, 3, 5, block, pixels, lsize + pxor m4, m4 + ADD_PIXELS_CLAMPED 0 + lea pixelsq, [pixelsq+lsizeq*2] + ADD_PIXELS_CLAMPED 32 + lea pixelsq, [pixelsq+lsizeq*2] + ADD_PIXELS_CLAMPED 64 + lea pixelsq, [pixelsq+lsizeq*2] + ADD_PIXELS_CLAMPED 96 + RET +%endmacro + +INIT_MMX mmx +ADD_PIXELS_CLAMPED +INIT_XMM sse2 +ADD_PIXELS_CLAMPED diff --git a/libavcodec/x86/idctdsp.h b/libavcodec/x86/idctdsp.h index 252e704338..6408df1e32 100644 --- a/libavcodec/x86/idctdsp.h +++ b/libavcodec/x86/idctdsp.h @@ -23,8 +23,12 @@ void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, ptrdiff_t line_size); +void ff_add_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels, + ptrdiff_t line_size); void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, ptrdiff_t line_size); +void ff_put_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels, + ptrdiff_t line_size); void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, ptrdiff_t line_size); void 
ff_put_signed_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels, diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c index 3f438f44db..2c26a98850 100644 --- a/libavcodec/x86/idctdsp_init.c +++ b/libavcodec/x86/idctdsp_init.c @@ -64,9 +64,6 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx, int cpu_flags = av_get_cpu_flags(); if (INLINE_MMX(cpu_flags)) { - c->put_pixels_clamped = ff_put_pixels_clamped_mmx; - c->add_pixels_clamped = ff_add_pixels_clamped_mmx; - if (!high_bit_depth && avctx->lowres == 0 && (avctx->idct_algo == FF_IDCT_AUTO || @@ -80,8 +77,12 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx, } if (EXTERNAL_MMX(cpu_flags)) { c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx; + c->put_pixels_clamped = ff_put_pixels_clamped_mmx; + c->add_pixels_clamped = ff_add_pixels_clamped_mmx; } if (EXTERNAL_SSE2(cpu_flags)) { c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_sse2; + c->put_pixels_clamped = ff_put_pixels_clamped_sse2; + c->add_pixels_clamped = ff_add_pixels_clamped_sse2; } } diff --git a/libavcodec/x86/idctdsp_mmx.c b/libavcodec/x86/idctdsp_mmx.c deleted file mode 100644 index 0ce9a92390..0000000000 --- a/libavcodec/x86/idctdsp_mmx.c +++ /dev/null @@ -1,134 +0,0 @@ -/* - * SIMD-optimized IDCT-related routines - * Copyright (c) 2000, 2001 Fabrice Bellard - * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> - * - * MMX optimization by Nick Kurshev <nickols_k@mail.ru> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" -#include "libavutil/cpu.h" -#include "libavutil/x86/asm.h" -#include "libavcodec/idctdsp.h" -#include "idctdsp.h" -#include "inline_asm.h" - -#if HAVE_INLINE_ASM - -void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, - ptrdiff_t line_size) -{ - const int16_t *p; - uint8_t *pix; - - /* read the pixels */ - p = block; - pix = pixels; - /* unrolled loop */ - __asm__ volatile ( - "movq (%3), %%mm0 \n\t" - "movq 8(%3), %%mm1 \n\t" - "movq 16(%3), %%mm2 \n\t" - "movq 24(%3), %%mm3 \n\t" - "movq 32(%3), %%mm4 \n\t" - "movq 40(%3), %%mm5 \n\t" - "movq 48(%3), %%mm6 \n\t" - "movq 56(%3), %%mm7 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "packuswb %%mm3, %%mm2 \n\t" - "packuswb %%mm5, %%mm4 \n\t" - "packuswb %%mm7, %%mm6 \n\t" - "movq %%mm0, (%0) \n\t" - "movq %%mm2, (%0, %1) \n\t" - "movq %%mm4, (%0, %1, 2) \n\t" - "movq %%mm6, (%0, %2) \n\t" - :: "r" (pix), "r" ((x86_reg) line_size), "r" ((x86_reg) line_size * 3), - "r" (p) - : "memory"); - pix += line_size * 4; - p += 32; - - // if here would be an exact copy of the code above - // compiler would generate some very strange code - // thus using "r" - __asm__ volatile ( - "movq (%3), %%mm0 \n\t" - "movq 8(%3), %%mm1 \n\t" - "movq 16(%3), %%mm2 \n\t" - "movq 24(%3), %%mm3 \n\t" - "movq 32(%3), %%mm4 \n\t" - "movq 40(%3), %%mm5 \n\t" - "movq 48(%3), %%mm6 \n\t" - "movq 56(%3), %%mm7 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "packuswb %%mm3, %%mm2 \n\t" - "packuswb %%mm5, %%mm4 \n\t" - "packuswb %%mm7, %%mm6 \n\t" - "movq 
%%mm0, (%0) \n\t" - "movq %%mm2, (%0, %1) \n\t" - "movq %%mm4, (%0, %1, 2) \n\t" - "movq %%mm6, (%0, %2) \n\t" - :: "r" (pix), "r" ((x86_reg) line_size), "r" ((x86_reg) line_size * 3), - "r" (p) - : "memory"); -} - -void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, - ptrdiff_t line_size) -{ - const int16_t *p; - uint8_t *pix; - int i; - - /* read the pixels */ - p = block; - pix = pixels; - MOVQ_ZERO(mm7); - i = 4; - do { - __asm__ volatile ( - "movq (%2), %%mm0 \n\t" - "movq 8(%2), %%mm1 \n\t" - "movq 16(%2), %%mm2 \n\t" - "movq 24(%2), %%mm3 \n\t" - "movq %0, %%mm4 \n\t" - "movq %1, %%mm6 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddsw %%mm4, %%mm0 \n\t" - "paddsw %%mm5, %%mm1 \n\t" - "movq %%mm6, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm6 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddsw %%mm6, %%mm2 \n\t" - "paddsw %%mm5, %%mm3 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "packuswb %%mm3, %%mm2 \n\t" - "movq %%mm0, %0 \n\t" - "movq %%mm2, %1 \n\t" - : "+m" (*pix), "+m" (*(pix + line_size)) - : "r" (p) - : "memory"); - pix += line_size * 2; - p += 16; - } while (--i); -} - -#endif /* HAVE_INLINE_ASM */ |