diff options
author | Paul B Mahol <onemda@gmail.com> | 2019-10-17 21:05:50 +0200 |
---|---|---|
committer | Paul B Mahol <onemda@gmail.com> | 2019-10-17 21:05:50 +0200 |
commit | 0ae6fb276bd9deba88077bd6f9cbe5d4b84e7f6f (patch) | |
tree | b488a1dc78b98bad84c204a35e10869fe684bd01 /libavfilter/x86/vf_atadenoise.asm | |
parent | 71e33c6e01f60f7ea9fe0ae10f244dbde64ca3b9 (diff) | |
download | ffmpeg-0ae6fb276bd9deba88077bd6f9cbe5d4b84e7f6f.tar.gz |
avfilter/x86/vf_atadenoise: add SIMD for serial too
Diffstat (limited to 'libavfilter/x86/vf_atadenoise.asm')
-rw-r--r-- | libavfilter/x86/vf_atadenoise.asm | 125 |
1 files changed, 125 insertions, 0 deletions
diff --git a/libavfilter/x86/vf_atadenoise.asm b/libavfilter/x86/vf_atadenoise.asm
index 44b9c4f160..5466d1f2d4 100644
--- a/libavfilter/x86/vf_atadenoise.asm
+++ b/libavfilter/x86/vf_atadenoise.asm
@@ -151,4 +151,129 @@ cglobal atadenoise_filter_row8, 8,10,13, src, dst, srcf, w, mid, size, i, j, src
     jl .loop
     RET

+INIT_XMM sse4                           ; 128-bit XMM registers (mmsize = 16); function is built for SSE4
+cglobal atadenoise_filter_row8_serial, 8,10,13, src, dst, srcf, w, mid, size, i, j, srcfx, x ; 8 args, 10 GPRs, 13 vector regs; args 7/8 are read via r6m/r7m before i/j are reused as indices
+    movsxdifnidn    wq, wd              ; widen the 32-bit w/mid/size arguments to 64 bits where the ABI requires it
+    movsxdifnidn    midq, midd
+    movsxdifnidn    sizeq, sized
+    add             srcq, wq            ; src/dst now point at the row end; pixels are addressed as [ptr + x] with x < 0
+    add             dstq, wq
+    mov             xq, wq
+    dec             sizeq               ; size - 1: bound tested by the forward pass (.loop1)
+    neg             xq                  ; x = -w, counts up towards 0
+    movd            m4, r6m             ; 7th argument: limit for a single absolute difference
+    SPLATW          m4, m4              ; SPLATW is defined outside this view; presumably broadcasts the low word to every word lane - confirm
+    movd            m5, r7m             ; 8th argument: limit for the running sum of absolute differences
+    SPLATW          m5, m5
+    pxor            m2, m2              ; m2 = 0, the zero source for all byte->word / word->dword unpacks
+    mova            m10, [pw_ones]      ; XOR mask used to invert compare results; pw_ones is defined outside this view (presumably all-ones words)
+                                        ; ---- per iteration: 8 pixels (mmsize/2) of the row ----
+    .loop:
+    mov             iq, midq            ; forward index starts at mid
+    mov             jq, midq            ; backward index starts at mid
+    pxor            m3, m3              ; m3  = running sum of absolute differences, forward pass
+    pxor            m11, m11            ; m11 = running sum of absolute differences, backward pass
+    movu            m0, [srcq + xq]     ; loads 16 bytes, only the low 8 are used - NOTE(review): presumably the caller guarantees readable padding; confirm
+    punpcklbw       m0, m2              ; m0 = 8 source pixels widened to words
+    mova            m7, m0              ; m7 = per-lane sum, seeded with the source pixel
+    mova            m8, [pw_one]        ; m8 = per-lane sample count; pw_one is defined outside this view (presumably words of 1)
+    mova            m12, [pw_ones]      ; m12 = per-lane "still accumulating" mask for the backward pass
+                                        ; ---- backward pass: frames mid-1 down to 0 ----
+    .loop0:
+    dec             jq                  ; step to the previous entry of srcf
+
+    mov             srcfxq, [srcfq + jq * 8] ; srcfx = srcf[j] (table of 8-byte pointers)
+    add             srcfxq, wq          ; same row-end bias as src so [srcfx + x] addresses the matching pixel
+
+    movu            m1, [srcfxq + xq]
+    punpcklbw       m1, m2              ; 8 pixels of frame j as words
+    mova            m9, m1              ; keep the unmodified pixels for the sum
+    psubw           m1, m0
+    pabsw           m1, m1              ; m1 = abs(frame_j - src)
+    paddw           m11, m1             ; accumulate into the backward running sum
+    pcmpgtw         m1, m4              ; lanes whose single difference exceeds m4 (signed word compare)
+    mova            m6, m11
+    pcmpgtw         m6, m5              ; lanes whose running sum exceeds m5
+    por             m6, m1              ; lane failed either test
+    pxor            m6, m10             ; invert: lanes that passed both tests
+    pand            m12, m6             ; a lane that fails once stays cleared for the rest of this pass
+    pand            m9, m12             ; zero the pixels of stopped lanes
+    paddw           m7, m9              ; sum += pixel in active lanes
+    mova            m6, m12
+    psrlw           m6, 15              ; 0xFFFF -> 1, 0 -> 0
+    paddw           m8, m6              ; count += 1 in active lanes
+
+    ptest           m12, m12
+    jz .end_loop0                       ; every lane has stopped: skip the remaining frames
+
+    cmp             jq, 0
+    jg .loop0                           ; continue while j > 0, so the last frame visited is index 0
+
+    .end_loop0:
+    mova            m12, [pw_ones]      ; re-arm the lane mask for the forward pass - NOTE(review): m10 was loaded from the same constant
+                                        ; ---- forward pass: frames mid+1 up to size-1 ----
+    .loop1:
+    inc             iq                  ; step to the next entry of srcf
+
+    mov             srcfxq, [srcfq + iq * 8] ; srcfx = srcf[i]
+    add             srcfxq, wq
+
+    movu            m1, [srcfxq + xq]
+    punpcklbw       m1, m2              ; 8 pixels of frame i as words
+    mova            m9, m1
+    psubw           m1, m0
+    pabsw           m1, m1              ; m1 = abs(frame_i - src)
+    paddw           m3, m1              ; accumulate into the forward running sum
+    pcmpgtw         m1, m4              ; single difference over its limit
+    mova            m6, m3
+    pcmpgtw         m6, m5              ; running sum over its limit
+    por             m6, m1
+    pxor            m6, m10             ; lanes that passed both tests
+    pand            m12, m6             ; sticky per-lane stop, as in the backward pass
+    pand            m9, m12
+    paddw           m7, m9              ; sum += pixel in active lanes
+    mova            m6, m12
+    psrlw           m6, 15
+    paddw           m8, m6              ; count += 1 in active lanes
+
+    ptest           m12, m12
+    jz .finish                          ; every lane has stopped
+
+    cmp             iq, sizeq
+    jl .loop1                           ; sizeq holds size-1, so the last frame visited is index size-1
+                                        ; ---- dst = (sum + (count >> 1)) / count, per lane ----
+    .finish:
+    mova            m9, m8
+    psrlw           m9, 1
+    paddw           m7, m9              ; add half the count so the truncating divide rounds to nearest
+
+    mova            m1, m7              ; copies of sum/count for the upper four lanes
+    mova            m6, m8
+
+    punpcklwd       m7, m2              ; lower four lanes zero-extended to dwords
+    punpcklwd       m8, m2
+    cvtdq2ps        m7, m7              ; divide in single-precision float
+    cvtdq2ps        m8, m8
+    divps           m7, m8
+    cvttps2dq       m7, m7              ; truncate back to integers
+    packssdw        m7, m7              ; narrow dwords -> words -> bytes with saturation
+    packuswb        m7, m7
+
+    movd            [dstq + xq], m7     ; store output pixels 0..3
+
+    punpckhwd       m1, m2              ; same computation for the upper four lanes
+    punpckhwd       m6, m2
+    cvtdq2ps        m1, m1
+    cvtdq2ps        m6, m6
+    divps           m1, m6
+    cvttps2dq       m1, m1
+    packssdw        m1, m1
+    packuswb        m1, m1
+
+    movd            [dstq + xq + 4], m1 ; store output pixels 4..7
+
+    add             xq, mmsize/2        ; advance 8 pixels; a full 8 are always written - NOTE(review): presumably w is a multiple of 8, confirm against caller
+    jl .loop                            ; flags come from the add: loop while x is still negative
+    RET
+
 %endif |