aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/x86
diff options
context:
space:
mode:
authorMartin Vignali <martin.vignali@gmail.com>2017-12-02 19:46:42 +0100
committerMartin Vignali <martin.vignali@gmail.com>2017-12-09 15:19:03 +0100
commit630967ef63d0f2a5cc12b06815af0ec6cb5c9d2a (patch)
tree6c85ea7d96b89876c994e4ded30a42da7149813b /libavcodec/x86
parent4353c3506742c9fecce4cf9f68cc6a7ab7ea05b1 (diff)
downloadffmpeg-630967ef63d0f2a5cc12b06815af0ec6cb5c9d2a.tar.gz
avcodec/utvideodec : add SIMD (SSSE3 and AVX2) for gradient_pred
Diffstat (limited to 'libavcodec/x86')
-rw-r--r--libavcodec/x86/lossless_videodsp.asm80
-rw-r--r--libavcodec/x86/lossless_videodsp_init.c5
2 files changed, 85 insertions, 0 deletions
diff --git a/libavcodec/x86/lossless_videodsp.asm b/libavcodec/x86/lossless_videodsp.asm
index cfa0620fd1..9a169fe314 100644
--- a/libavcodec/x86/lossless_videodsp.asm
+++ b/libavcodec/x86/lossless_videodsp.asm
@@ -2,6 +2,7 @@
;* SIMD lossless video DSP utils
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2014 Michael Niedermayer
+;* Copyright (c) 2017 Jokyo Images
;*
;* This file is part of FFmpeg.
;*
@@ -325,3 +326,82 @@ cglobal add_left_pred_int16, 4,4,8, dst, src, mask, w, left
ADD_HFYU_LEFT_LOOP_INT16 u, a
.src_unaligned:
ADD_HFYU_LEFT_LOOP_INT16 u, u
+
+; In-place on one row: src[x] += src[x-stride] - src[x-stride-1] + src[x-1] (byte arithmetic, wraps mod 256)
+;---------------------------------------------------------------------------------------------
+; void add_gradient_pred(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width)
+;---------------------------------------------------------------------------------------------
+%macro ADD_GRADIENT_PRED 0
+cglobal add_gradient_pred, 3,4,5, src, stride, width, tmp
+ mova xm0, [pb_15] ; pshufb mask used below to broadcast byte 15 (pb_15 is defined outside this hunk)
+
+; C = src[-1], broadcast to every byte of xm1 (movd reads 4 bytes from src-1, only byte 0 is kept)
+ movd xm1, [srcq-1]
+%if cpuflag(avx2)
+ vpbroadcastb xm1, xm1
+%else
+ pxor xm2, xm2
+ pshufb xm1, xm2 ; all-zero mask -> replicate byte 0
+%endif
+; index the row from its end with a negative counter; negated stride makes srcq+strideq address src - stride
+ add srcq, widthq
+ neg widthq
+ neg strideq
+
+.loop:
+ lea tmpq, [srcq + strideq] ; tmp = end of the row located at (src - original stride)
+ mova m2, [tmpq + widthq] ; A = src[x-stride] (mova: aligned load)
+ movu m3, [tmpq + widthq - 1] ; B = src[x - (stride + 1)] (unaligned load)
+ mova m4, [srcq + widthq] ; current val (src[x]) (mova: aligned load)
+
+ psubb m2, m3; A - B (bytewise, wraps mod 256)
+
+; prefix sum A-B: byte shifts of 1,2,4,8 with adds; pslldq shifts inside each 128-bit lane, so every lane gets its own 16-byte sum
+ pslldq m3, m2, 1
+ paddb m2, m3
+ pslldq m3, m2, 2
+ paddb m2, m3
+ pslldq m3, m2, 4
+ paddb m2, m3
+ pslldq m3, m2, 8
+ paddb m2, m3
+
+; prefix sum current val (same shift-and-add scheme)
+ pslldq m3, m4, 1
+ paddb m4, m3
+ pslldq m3, m4, 2
+ paddb m4, m3
+ pslldq m3, m4, 4
+ paddb m4, m3
+ pslldq m3, m4, 8
+ paddb m4, m3
+
+; last sum: prefix(A - B) + prefix(current) = prefix(current + A - B)
+ paddb m2, m4 ; current + (A - B)
+
+ paddb xm1, xm2 ; += C (running left value) -> final bytes 0..15 of this chunk
+ mova [srcq + widthq], xm1 ; store
+
+ pshufb xm1, xm0 ; put last val in all val of xm1 (becomes C for the next 16 bytes)
+
+%if mmsize == 32
+ vextracti128 xm2, m2, 1 ; get second lane of the ymm
+ paddb xm1, xm2; += C
+
+ mova [srcq + widthq + 16], xm1 ; store
+ pshufb xm1, xm0 ; put last val in all val of m1
+%endif
+; counter advances by a whole vector and is tested after the body, so full mmsize chunks are always read and written
+ add widthq, mmsize
+ jl .loop
+ RET
+
+%endmacro
+
+INIT_XMM ssse3
+ADD_GRADIENT_PRED
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+ADD_GRADIENT_PRED
+%endif
diff --git a/libavcodec/x86/lossless_videodsp_init.c b/libavcodec/x86/lossless_videodsp_init.c
index beae317cc2..e3063de462 100644
--- a/libavcodec/x86/lossless_videodsp_init.c
+++ b/libavcodec/x86/lossless_videodsp_init.c
@@ -44,6 +44,9 @@ int ff_add_left_pred_unaligned_avx2(uint8_t *dst, const uint8_t *src,
int ff_add_left_pred_int16_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
int ff_add_left_pred_int16_sse4(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
+void ff_add_gradient_pred_ssse3(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width); /* installed as c->add_gradient_pred when EXTERNAL_SSSE3(cpu_flags) */
+void ff_add_gradient_pred_avx2(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width); /* installed as c->add_gradient_pred when EXTERNAL_AVX2_FAST(cpu_flags) */
+
#if HAVE_INLINE_ASM && HAVE_7REGS && ARCH_X86_32
static void add_median_pred_cmov(uint8_t *dst, const uint8_t *top,
const uint8_t *diff, ptrdiff_t w,
@@ -109,6 +112,7 @@ void ff_llviddsp_init_x86(LLVidDSPContext *c)
if (EXTERNAL_SSSE3(cpu_flags)) {
c->add_left_pred = ff_add_left_pred_ssse3;
c->add_left_pred_int16 = ff_add_left_pred_int16_ssse3;
+ c->add_gradient_pred = ff_add_gradient_pred_ssse3;
}
if (EXTERNAL_SSSE3_FAST(cpu_flags)) {
@@ -121,5 +125,6 @@ void ff_llviddsp_init_x86(LLVidDSPContext *c)
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
c->add_bytes = ff_add_bytes_avx2;
c->add_left_pred = ff_add_left_pred_unaligned_avx2;
+ c->add_gradient_pred = ff_add_gradient_pred_avx2;
}
}