diff options
author | Pascal Massimino <pascal.massimino@gmail.com> | 2014-09-09 14:38:58 +0200 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2014-09-09 16:47:22 +0200 |
commit | e3fd6a3a4e3d28d8a50bb6ec3e19449bc4e0d3db (patch) | |
tree | 4706c3e78a5046f13e80142b1f12cbbc0756e2b6 /libavfilter/x86/vf_idet.asm | |
parent | 881f96c4c2ef0c0162f63a370cbfff3c1e1feb2a (diff) | |
download | ffmpeg-e3fd6a3a4e3d28d8a50bb6ec3e19449bc4e0d3db.tar.gz |
av_filter/x86/idet: MMX/SSE2 implementation of 16bits filter_line()
Tested on http://ps-auxw.de/10bit-h264-sample/10bit-eldorado.mkv:
MMX: ~30% faster decoding overall
SSE2: ~40% faster decoding overall
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavfilter/x86/vf_idet.asm')
-rw-r--r-- | libavfilter/x86/vf_idet.asm | 70 |
1 files changed, 68 insertions, 2 deletions
diff --git a/libavfilter/x86/vf_idet.asm b/libavfilter/x86/vf_idet.asm
index 14b16c5779..4649cae030 100644
--- a/libavfilter/x86/vf_idet.asm
+++ b/libavfilter/x86/vf_idet.asm
@@ -25,8 +25,6 @@

 SECTION_TEXT

-%if ARCH_X86_32
-
 ; Implementation that does 8-bytes at a time using single-word operations.
 %macro IDET_FILTER_LINE 1
 INIT_MMX %1
@@ -78,11 +76,79 @@ cglobal idet_filter_line, 4, 5, 0, a, b, c, width, index
     RET
 %endmacro

+%if ARCH_X86_32
 IDET_FILTER_LINE mmxext
 IDET_FILTER_LINE mmx
 %endif

+;******************************************************************************
+; 16bit implementation that does 4/8-pixels at a time
+
+%macro PABS_DIFF_WD 3  ; a, b, junk , output=a
+    psubusw   %3, %2, %1
+    psubusw   %1, %2
+    por       %1, %3
+
+    mova      %2, %1
+    punpcklwd %1, m_zero
+    punpckhwd %2, m_zero
+    paddd     %1, %2
+%endmacro
+
+%macro IDET_FILTER_LINE_16BIT 1   ; %1=increment (4 or 8 words)
+cglobal idet_filter_line_16bit, 4, 5, 8, a, b, c, width, index
+    xor       indexq, indexq
+%define   m_zero m1
+%define   m_sum  m0
+    pxor      m_sum, m_sum
+    pxor      m_zero, m_zero
+
+.loop_16bit:
+    movu      m2, [bq + indexq * 2]  ; B
+    movu      m3, [aq + indexq * 2]  ; A
+    mova      m6, m2
+    psubusw   m5, m2, m3             ; ba
+
+    movu      m4, [cq + indexq * 2]  ; C
+    add       indexq, %1
+    psubusw   m3, m2                 ; ab
+    CMP       indexd, widthd
+
+    psubusw   m6, m4                 ; bc
+    psubusw   m4, m2                 ; cb
+
+    PABS_DIFF_WD   m3, m6, m7        ; |ab - bc|
+    PABS_DIFF_WD   m5, m4, m7        ; |ba - cb|
+    paddd     m_sum, m3
+    paddd     m_sum, m5
+    jl        .loop_16bit
+
+    mova      m2, m_sum
+%if mmsize == 16
+    psrldq    m2, 4
+    paddd     m_sum, m2
+    psrldq    m2, 4
+    paddd     m_sum, m2
+    psrldq    m2, 4
+    paddd     m_sum, m2
+%else
+    psrlq     m2, 32
+    paddd     m_sum, m2
+%endif
+    movd      eax, m_sum
+    RET
+%endmacro
+
+INIT_XMM sse2
+IDET_FILTER_LINE_16BIT 8
+%if ARCH_X86_32
+INIT_MMX mmx
+IDET_FILTER_LINE_16BIT 4
+%endif
+
+;******************************************************************************
 ; SSE2 8-bit implementation that does 16-bytes at a time:
+
 INIT_XMM sse2
 cglobal idet_filter_line, 4, 6, 7, a, b, c, width, index, total
     xor       indexq, indexq