diff options
author | Martin Vignali <martin.vignali@gmail.com> | 2017-12-02 19:09:58 +0100 |
---|---|---|
committer | Martin Vignali <martin.vignali@gmail.com> | 2017-12-09 15:16:03 +0100 |
commit | 4353c3506742c9fecce4cf9f68cc6a7ab7ea05b1 (patch) | |
tree | 5107ae8a627ffa2b6d7f496f5115c481c289f369 /libavcodec | |
parent | cfbcea1cca7f4d5b92a17778f78427794057eb29 (diff) | |
download | ffmpeg-4353c3506742c9fecce4cf9f68cc6a7ab7ea05b1.tar.gz |
avcodec/x86/lossless_videodsp : add avx2 version for add_left_pred
Diffstat (limited to 'libavcodec')
-rw-r--r-- | libavcodec/x86/lossless_videodsp.asm | 63 | ||||
-rw-r--r-- | libavcodec/x86/lossless_videodsp_init.c | 3 |
2 files changed, 44 insertions, 22 deletions
diff --git a/libavcodec/x86/lossless_videodsp.asm b/libavcodec/x86/lossless_videodsp.asm
index 663bf6153e..cfa0620fd1 100644
--- a/libavcodec/x86/lossless_videodsp.asm
+++ b/libavcodec/x86/lossless_videodsp.asm
@@ -114,40 +114,54 @@ MEDIAN_PRED
     add     dstq, wq
     neg     wq
 %%.loop:
+    pshufb  xm0, xm5
 %if %2
     mova    m1, [srcq+wq]
 %else
     movu    m1, [srcq+wq]
 %endif
-    mova    m2, m1
-    psllw   m1, 8
+    psllw   m2, m1, 8
     paddb   m1, m2
-    mova    m2, m1
-    pshufb  m1, m3
+    pshufb  m2, m1, m3
     paddb   m1, m2
-    pshufb  m0, m5
-    mova    m2, m1
-    pshufb  m1, m4
+    pshufb  m2, m1, m4
     paddb   m1, m2
-%if mmsize == 16
-    mova    m2, m1
-    pshufb  m1, m6
+%if mmsize >= 16
+    pshufb  m2, m1, m6
     paddb   m1, m2
 %endif
-    paddb   m0, m1
+    paddb   xm0, xm1
 %if %1
-    mova    [dstq+wq], m0
+    mova    [dstq+wq], xm0
 %else
-    movq    [dstq+wq], m0
-    movhps  [dstq+wq+8], m0
+    movq    [dstq+wq], xm0
+    movhps  [dstq+wq+8], xm0
+%endif
+
+%if mmsize == 32
+    vextracti128    xm2, m1, 1 ; get second lane of the ymm
+    pshufb  xm0, xm5 ; set alls val to last val of the first lane
+    paddb   xm0, xm2
+;store val
+%if %1
+    mova    [dstq+wq+16], xm0
+%else;
+    movq    [dstq+wq+16], xm0
+    movhps  [dstq+wq+16+8], xm0
+%endif
 %endif
     add     wq, mmsize
     jl %%.loop
+%if mmsize == 32
+    mov     eax, [dstq -1]
+    and     eax, 0xff
+%else;
     mov     eax, mmsize-1
     sub     eax, wd
     movd    m1, eax
     pshufb  m0, m1
     movd    eax, m0
+%endif
     RET
 %endmacro
 
@@ -166,15 +180,15 @@ cglobal add_left_pred, 3,3,7, dst, src, w, left
 
 %macro ADD_LEFT_PRED_UNALIGNED 0
 cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left
-    mova    m5, [pb_15]
-    mova    m6, [pb_zzzzzzzz77777777]
-    mova    m4, [pb_zzzz3333zzzzbbbb]
-    mova    m3, [pb_zz11zz55zz99zzdd]
-    movd    m0, leftm
-    pslldq  m0, 15
-    test    srcq, 15
+    mova    xm5, [pb_15]
+    VBROADCASTI128    m6, [pb_zzzzzzzz77777777]
+    VBROADCASTI128    m4, [pb_zzzz3333zzzzbbbb]
+    VBROADCASTI128    m3, [pb_zz11zz55zz99zzdd]
+    movd    xm0, leftm
+    pslldq  xm0, 15
+    test    srcq, mmsize - 1
     jnz .src_unaligned
-    test    dstq, 15
+    test    dstq, mmsize - 1
     jnz .dst_unaligned
     ADD_LEFT_LOOP 1, 1
 .dst_unaligned:
@@ -186,6 +200,11 @@ cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left
 INIT_XMM ssse3
 ADD_LEFT_PRED_UNALIGNED
 
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+ADD_LEFT_PRED_UNALIGNED
+%endif
+
 ;------------------------------------------------------------------------------
 ; void ff_add_bytes(uint8_t *dst, uint8_t *src, ptrdiff_t w);
 ;------------------------------------------------------------------------------
diff --git a/libavcodec/x86/lossless_videodsp_init.c b/libavcodec/x86/lossless_videodsp_init.c
index 4f20c1ce92..beae317cc2 100644
--- a/libavcodec/x86/lossless_videodsp_init.c
+++ b/libavcodec/x86/lossless_videodsp_init.c
@@ -38,6 +38,8 @@ int ff_add_left_pred_ssse3(uint8_t *dst, const uint8_t *src,
                            ptrdiff_t w, int left);
 int ff_add_left_pred_unaligned_ssse3(uint8_t *dst, const uint8_t *src,
                                      ptrdiff_t w, int left);
+int ff_add_left_pred_unaligned_avx2(uint8_t *dst, const uint8_t *src,
+                                    ptrdiff_t w, int left);
 
 int ff_add_left_pred_int16_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
 int ff_add_left_pred_int16_sse4(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
@@ -118,5 +120,6 @@ void ff_llviddsp_init_x86(LLVidDSPContext *c)
     }
     if (EXTERNAL_AVX2_FAST(cpu_flags)) {
         c->add_bytes = ff_add_bytes_avx2;
+        c->add_left_pred = ff_add_left_pred_unaligned_avx2;
     }
 }