diff options
author | Martin Vignali <martin.vignali@gmail.com> | 2017-12-02 19:46:42 +0100 |
---|---|---|
committer | Martin Vignali <martin.vignali@gmail.com> | 2017-12-09 15:19:03 +0100 |
commit | 630967ef63d0f2a5cc12b06815af0ec6cb5c9d2a (patch) | |
tree | 6c85ea7d96b89876c994e4ded30a42da7149813b /libavcodec/x86 | |
parent | 4353c3506742c9fecce4cf9f68cc6a7ab7ea05b1 (diff) | |
download | ffmpeg-630967ef63d0f2a5cc12b06815af0ec6cb5c9d2a.tar.gz |
avcodec/utvideodec : add SIMD (SSSE3 and AVX2) for gradient_pred
Diffstat (limited to 'libavcodec/x86')
-rw-r--r-- | libavcodec/x86/lossless_videodsp.asm | 80 | ||||
-rw-r--r-- | libavcodec/x86/lossless_videodsp_init.c | 5 |
2 files changed, 85 insertions, 0 deletions
diff --git a/libavcodec/x86/lossless_videodsp.asm b/libavcodec/x86/lossless_videodsp.asm
index cfa0620fd1..9a169fe314 100644
--- a/libavcodec/x86/lossless_videodsp.asm
+++ b/libavcodec/x86/lossless_videodsp.asm
@@ -2,6 +2,7 @@
 ;* SIMD lossless video DSP utils
 ;* Copyright (c) 2008 Loren Merritt
 ;* Copyright (c) 2014 Michael Niedermayer
+;* Copyright (c) 2017 Jokyo Images
 ;*
 ;* This file is part of FFmpeg.
 ;*
@@ -325,3 +326,82 @@ cglobal add_left_pred_int16, 4,4,8, dst, src, mask, w, left
     ADD_HFYU_LEFT_LOOP_INT16 u, a
 .src_unaligned:
     ADD_HFYU_LEFT_LOOP_INT16 u, u
+
+
+;---------------------------------------------------------------------------------------------
+; void add_gradient_pred(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width)
+;---------------------------------------------------------------------------------------------
+%macro ADD_GRADIENT_PRED 0
+cglobal add_gradient_pred, 3,4,5, src, stride, width, tmp
+    mova xm0, [pb_15]
+
+;load src - 1 in xm1
+    movd xm1, [srcq-1]
+%if cpuflag(avx2)
+    vpbroadcastb xm1, xm1
+%else
+    pxor xm2, xm2
+    pshufb xm1, xm2
+%endif
+
+    add srcq, widthq
+    neg widthq
+    neg strideq
+
+.loop:
+    lea tmpq, [srcq + strideq]
+    mova m2, [tmpq + widthq] ; A = src[x-stride]
+    movu m3, [tmpq + widthq - 1] ; B = src[x - (stride + 1)]
+    mova m4, [srcq + widthq] ; current val (src[x])
+
+    psubb m2, m3; A - B
+
+; prefix sum A-B
+    pslldq m3, m2, 1
+    paddb m2, m3
+    pslldq m3, m2, 2
+    paddb m2, m3
+    pslldq m3, m2, 4
+    paddb m2, m3
+    pslldq m3, m2, 8
+    paddb m2, m3
+
+; prefix sum current val
+    pslldq m3, m4, 1
+    paddb m4, m3
+    pslldq m3, m4, 2
+    paddb m4, m3
+    pslldq m3, m4, 4
+    paddb m4, m3
+    pslldq m3, m4, 8
+    paddb m4, m3
+
+; last sum
+    paddb m2, m4 ; current + (A - B)
+
+    paddb xm1, xm2 ; += C
+    mova [srcq + widthq], xm1 ; store
+
+    pshufb xm1, xm0 ; put last val in all val of xm1
+
+%if mmsize == 32
+    vextracti128 xm2, m2, 1 ; get second lane of the ymm
+    paddb xm1, xm2; += C
+
+    mova [srcq + widthq + 16], xm1 ; store
+    pshufb xm1, xm0 ; put last val in all val of m1
+%endif
+
+    add widthq, mmsize
+    jl .loop
+    RET
+
+%endmacro
+
+INIT_XMM ssse3
+ADD_GRADIENT_PRED
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+ADD_GRADIENT_PRED
+%endif
diff --git a/libavcodec/x86/lossless_videodsp_init.c b/libavcodec/x86/lossless_videodsp_init.c
index beae317cc2..e3063de462 100644
--- a/libavcodec/x86/lossless_videodsp_init.c
+++ b/libavcodec/x86/lossless_videodsp_init.c
@@ -44,6 +44,9 @@ int ff_add_left_pred_unaligned_avx2(uint8_t *dst, const uint8_t *src,
 int ff_add_left_pred_int16_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
 int ff_add_left_pred_int16_sse4(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
 
+void ff_add_gradient_pred_ssse3(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width);
+void ff_add_gradient_pred_avx2(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width);
+
 #if HAVE_INLINE_ASM && HAVE_7REGS && ARCH_X86_32
 static void add_median_pred_cmov(uint8_t *dst, const uint8_t *top,
                                  const uint8_t *diff, ptrdiff_t w,
@@ -109,6 +112,7 @@ void ff_llviddsp_init_x86(LLVidDSPContext *c)
     if (EXTERNAL_SSSE3(cpu_flags)) {
         c->add_left_pred = ff_add_left_pred_ssse3;
         c->add_left_pred_int16 = ff_add_left_pred_int16_ssse3;
+        c->add_gradient_pred = ff_add_gradient_pred_ssse3;
     }
 
     if (EXTERNAL_SSSE3_FAST(cpu_flags)) {
@@ -121,5 +125,6 @@ void ff_llviddsp_init_x86(LLVidDSPContext *c)
     if (EXTERNAL_AVX2_FAST(cpu_flags)) {
         c->add_bytes = ff_add_bytes_avx2;
         c->add_left_pred = ff_add_left_pred_unaligned_avx2;
+        c->add_gradient_pred = ff_add_gradient_pred_avx2;
     }
 }