| author | plepere <pierre-edouard.lepere@insa-rennes.fr> | 2014-06-13 13:29:17 +0200 |
| --- | --- | --- |
| committer | Michael Niedermayer <michaelni@gmx.at> | 2014-06-17 13:23:36 +0200 |
| commit | 92cccb7bcd79845020ed8abebf35170c182443b2 (patch) | |
| tree | ae617971130a38912f7f8608f9406c39a84723db /libavcodec/x86/hevc_idct.asm | |
| parent | fa0d0fb42ecda5d9676c744195fd9ef0454c259d (diff) | |
| download | ffmpeg-92cccb7bcd79845020ed8abebf35170c182443b2.tar.gz | |
avcodec/hevc: new idct + asm
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec/x86/hevc_idct.asm')
| -rw-r--r-- | libavcodec/x86/hevc_idct.asm | 180 |

1 file changed, 180 insertions(+), 0 deletions(-)
diff --git a/libavcodec/x86/hevc_idct.asm b/libavcodec/x86/hevc_idct.asm
new file mode 100644
index 0000000000..5d2f5b563c
--- /dev/null
+++ b/libavcodec/x86/hevc_idct.asm
@@ -0,0 +1,180 @@
+; /*
+; * Provide SSE & MMX idct functions for HEVC decoding
+; * Copyright (c) 2014 Pierre-Edouard LEPERE
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; */
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+max_pixels_10: times 8 dw ((1 << 10)-1)
+dc_add_10:     times 4 dd ((1 << 14-10) + 1)
+
+
+SECTION .text
+
+; the idct_dc_add macros and functions were largely inspired by the x264 project's code in h264_idct.asm
+
+%macro DC_ADD_INIT 2
+    add              %1w, ((1 << 14-8) + 1)
+    sar              %1w, (15-8)
+    movd              m0, %1
+    lea               %1, [%2*3]
+    SPLATW            m0, m0, 0
+    pxor              m1, m1
+    psubw             m1, m0
+    packuswb          m0, m0
+    packuswb          m1, m1
+%endmacro
+
+%macro DC_ADD_OP 4
+    %1                m2, [%2     ]
+    %1                m3, [%2+%3  ]
+    %1                m4, [%2+%3*2]
+    %1                m5, [%2+%4  ]
+    paddusb           m2, m0
+    paddusb           m3, m0
+    paddusb           m4, m0
+    paddusb           m5, m0
+    psubusb           m2, m1
+    psubusb           m3, m1
+    psubusb           m4, m1
+    psubusb           m5, m1
+    %1    [%2     ], m2
+    %1    [%2+%3  ], m3
+    %1    [%2+%3*2], m4
+    %1    [%2+%4  ], m5
+%endmacro
+
+INIT_MMX mmxext
+; void ff_hevc_idct4_dc_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+%if ARCH_X86_64
+cglobal hevc_idct4_dc_add_8, 3, 4, 0
+    movsx             r3, word [r1]
+    DC_ADD_INIT       r3, r2
+    DC_ADD_OP       movh, r0, r2, r3
+    RET
+
+; void ff_hevc_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_idct8_dc_add_8, 3, 4, 0
+    movsx             r3, word [r1]
+    DC_ADD_INIT       r3, r2
+    DC_ADD_OP       mova, r0, r2, r3
+    lea               r0, [r0+r2*4]
+    DC_ADD_OP       mova, r0, r2, r3
+    RET
+%else
+; void ff_hevc_idct4_dc_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_idct4_dc_add_8, 2, 3, 0
+    movsx             r2, word [r1]
+    mov               r1, r2m
+    DC_ADD_INIT       r2, r1
+    DC_ADD_OP       movh, r0, r1, r2
+    RET
+
+; void ff_hevc_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_idct8_dc_add_8, 2, 3, 0
+    movsx             r2, word [r1]
+    mov               r1, r2m
+    DC_ADD_INIT       r2, r1
+    DC_ADD_OP       mova, r0, r1, r2
+    lea               r0, [r0+r1*4]
+    DC_ADD_OP       mova, r0, r1, r2
+    RET
+%endif
+
+
+INIT_XMM sse2
+; void ff_hevc_idct16_dc_add_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_idct16_dc_add_8, 3, 4, 0
+    movsx             r3, word [r1]
+    DC_ADD_INIT       r3, r2
+    DC_ADD_OP       mova, r0, r2, r3
+    lea               r0, [r0+r2*4]
+    DC_ADD_OP       mova, r0, r2, r3
+    lea               r0, [r0+r2*4]
+    DC_ADD_OP       mova, r0, r2, r3
+    lea               r0, [r0+r2*4]
+    DC_ADD_OP       mova, r0, r2, r3
+    RET
+
+;-----------------------------------------------------------------------------
+; void ff_hevc_idct4_dc_add_10(pixel *dst, int16_t *block, int stride)
+;-----------------------------------------------------------------------------
+%macro IDCT_DC_ADD_OP_10 3
+    pxor              m5, m5
+%if avx_enabled
+    paddw             m1, m0, [%1+0   ]
+    paddw             m2, m0, [%1+%2  ]
+    paddw             m3, m0, [%1+%2*2]
+    paddw             m4, m0, [%1+%3  ]
+%else
+    mova              m1, [%1+0   ]
+    mova              m2, [%1+%2  ]
+    mova              m3, [%1+%2*2]
+    mova              m4, [%1+%3  ]
+    paddw             m1, m0
+    paddw             m2, m0
+    paddw             m3, m0
+    paddw             m4, m0
+%endif
+    CLIPW             m1, m5, m6
+    CLIPW             m2, m5, m6
+    CLIPW             m3, m5, m6
+    CLIPW             m4, m5, m6
+    mova  [%1+0   ], m1
+    mova  [%1+%2  ], m2
+    mova  [%1+%2*2], m3
+    mova  [%1+%3  ], m4
+%endmacro
+
+INIT_MMX mmxext
+cglobal hevc_idct4_dc_add_10,3,3
+    mov              r1w, [r1]
+    add              r1w, ((1 << 4) + 1)
+    sar              r1w, 5
+    movd              m0, r1d
+    lea               r1, [r2*3]
+    SPLATW            m0, m0, 0
+    mova              m6, [max_pixels_10]
+    IDCT_DC_ADD_OP_10 r0, r2, r1
+    RET
+
+;-----------------------------------------------------------------------------
+; void ff_hevc_idct8_dc_add_10(pixel *dst, int16_t *block, int stride)
+;-----------------------------------------------------------------------------
+%macro IDCT8_DC_ADD 0
+cglobal hevc_idct8_dc_add_10,3,4,7
+    mov              r1w, [r1]
+    add              r1w, ((1 << 4) + 1)
+    sar              r1w, 5
+    movd              m0, r1d
+    lea               r1, [r2*3]
+    SPLATW            m0, m0, 0
+    mova              m6, [max_pixels_10]
+    IDCT_DC_ADD_OP_10 r0, r2, r1
+    lea               r0, [r0+r2*4]
+    IDCT_DC_ADD_OP_10 r0, r2, r1
+    RET
+%endmacro
+
+INIT_XMM sse2
+IDCT8_DC_ADD
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+IDCT8_DC_ADD
+%endif
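For readers following the asm, here is a rough scalar equivalent in C of the DC-only "add" path added above: the 8-bit entry points compute dc = (coeffs[0] + 65) >> 7 (DC_ADD_INIT) and add it to every destination pixel with clamping (DC_ADD_OP), while the 10-bit ones compute dc = (coeffs[0] + 17) >> 5 and clip against max_pixels_10 (IDCT_DC_ADD_OP_10). This sketch is not part of the commit; the function names, the clip helpers and the generic `size` parameter are illustrative assumptions, and the 10-bit stride is taken in 16-bit elements rather than bytes for simplicity.

```c
#include <stddef.h>
#include <stdint.h>

/* Local stand-ins for clip helpers (FFmpeg itself uses av_clip_*). */
static inline uint8_t  clip_u8(int v)  { return v < 0 ? 0 : v > 255  ? 255  : v; }
static inline uint16_t clip_u10(int v) { return v < 0 ? 0 : v > 1023 ? 1023 : v; }

/* 8-bit DC-only add: dc = (coeffs[0] + 65) >> 7, added to every pixel of an
 * NxN block and clamped to [0,255], mirroring what DC_ADD_INIT sets up and
 * DC_ADD_OP applies with paddusb/psubusb. size is 4, 8 or 16. */
void idct_dc_add_8_c(uint8_t *dst, const int16_t *coeffs,
                     ptrdiff_t stride, int size)
{
    int dc = (coeffs[0] + ((1 << (14 - 8)) + 1)) >> (15 - 8);
    for (int y = 0; y < size; y++, dst += stride)
        for (int x = 0; x < size; x++)
            dst[x] = clip_u8(dst[x] + dc);
}

/* 10-bit DC-only add: dc = (coeffs[0] + 17) >> 5, result clipped to
 * [0, 1023] (paddw + CLIPW against max_pixels_10 in IDCT_DC_ADD_OP_10).
 * stride is in 16-bit elements here, not bytes. */
void idct_dc_add_10_c(uint16_t *dst, const int16_t *coeffs,
                      ptrdiff_t stride, int size)
{
    int dc = (coeffs[0] + ((1 << 4) + 1)) >> 5;
    for (int y = 0; y < size; y++, dst += stride)
        for (int x = 0; x < size; x++)
            dst[x] = clip_u10(dst[x] + dc);
}
```

The 8-bit asm reaches the same result without widening pixels to words: DC_ADD_INIT splats dc into m0 and -dc into m1, both packed to bytes, so DC_ADD_OP can apply a signed offset using only unsigned-saturating paddusb/psubusb, the trick the file's comment credits to x264's h264_idct.asm.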