aboutsummaryrefslogtreecommitdiffstats
path: root/libavfilter/x86/vf_idet.asm
diff options
context:
space:
mode:
authorPascal Massimino <pascal.massimino@gmail.com>2014-09-09 14:38:58 +0200
committerMichael Niedermayer <michaelni@gmx.at>2014-09-09 16:47:22 +0200
commite3fd6a3a4e3d28d8a50bb6ec3e19449bc4e0d3db (patch)
tree4706c3e78a5046f13e80142b1f12cbbc0756e2b6 /libavfilter/x86/vf_idet.asm
parent881f96c4c2ef0c0162f63a370cbfff3c1e1feb2a (diff)
downloadffmpeg-e3fd6a3a4e3d28d8a50bb6ec3e19449bc4e0d3db.tar.gz
av_filter/x86/idet: MMX/SSE2 implementation of 16bits filter_line()
tested on http://ps-auxw.de/10bit-h264-sample/10bit-eldorado.mkv MMX: ~30% faster decoding overall SSE2:~40% faster Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavfilter/x86/vf_idet.asm')
-rw-r--r--libavfilter/x86/vf_idet.asm70
1 files changed, 68 insertions, 2 deletions
diff --git a/libavfilter/x86/vf_idet.asm b/libavfilter/x86/vf_idet.asm
index 14b16c5779..4649cae030 100644
--- a/libavfilter/x86/vf_idet.asm
+++ b/libavfilter/x86/vf_idet.asm
@@ -25,8 +25,6 @@
SECTION_TEXT
-%if ARCH_X86_32
-
; Implementation that does 8-bytes at a time using single-word operations.
%macro IDET_FILTER_LINE 1
INIT_MMX %1
@@ -78,11 +76,79 @@ cglobal idet_filter_line, 4, 5, 0, a, b, c, width, index
RET
%endmacro
+%if ARCH_X86_32
IDET_FILTER_LINE mmxext ; 8-bit MMX-register variant, emitted for 32-bit x86 builds only
IDET_FILTER_LINE mmx ; same macro instantiated for plain mmx
%endif ; ARCH_X86_32
+;******************************************************************************
+; 16bit implementation that does 4/8-pixels at a time
+
+%macro PABS_DIFF_WD 3 ; %1=a (in/out), %2=b (clobbered), %3=scratch (clobbered); requires m_zero to name an all-zero register. Result: %1 = |a - b| per unsigned word, zero-extended to dwords, low-half and high-half dwords added together
+ psubusw %3, %2, %1 ; %3 = b - a with unsigned saturation (0 where a >= b)
+ psubusw %1, %2 ; %1 = a - b with unsigned saturation (0 where b >= a)
+ por %1, %3 ; at most one of the two differences is non-zero per word, so OR yields |a - b|
+
+ mova %2, %1 ; second copy so the low and high halves can be widened separately
+ punpcklwd %1, m_zero ; low-half words interleaved with zero -> zero-extended dwords
+ punpckhwd %2, m_zero ; high-half words interleaved with zero -> zero-extended dwords
+ paddd %1, %2 ; %1 = dword sums (low-half lane i + high-half lane i); dword width avoids 16-bit overflow
+%endmacro
+
+%macro IDET_FILTER_LINE_16BIT 1 ; %1=increment (4 or 8 words) = number of 16-bit samples consumed per loop iteration; must match the vector register size selected before instantiation
+cglobal idet_filter_line_16bit, 4, 5, 8, a, b, c, width, index ; args: a, b, c = pointers to 16-bit samples, width = sample count; index = loop counter in samples; result returned in eax
+ xor indexq, indexq ; index = 0
+%define m_zero m1
+%define m_sum m0
+ pxor m_sum, m_sum ; m_sum (m0): per-lane dword accumulators, start at 0
+ pxor m_zero, m_zero ; m_zero (m1): constant zero used by PABS_DIFF_WD for word->dword widening
+
+.loop_16bit: ; each iteration handles %1 samples starting at index (byte offset = index * 2)
+ movu m2, [bq + indexq * 2] ; B
+ movu m3, [aq + indexq * 2] ; A
+ mova m6, m2 ; m6 = second copy of B (m2 must stay intact for the later subtractions)
+ psubusw m5, m2, m3 ; ba = B - A, unsigned saturating
+
+ movu m4, [cq + indexq * 2] ; C
+ add indexq, %1 ; advance index now; all three loads for this iteration are already issued
+ psubusw m3, m2 ; ab = A - B, unsigned saturating
+ CMP indexd, widthd ; flags are consumed by the jl at the bottom; only SIMD instructions (which do not write EFLAGS) sit in between
+
+ psubusw m6, m4 ; bc = B - C, unsigned saturating
+ psubusw m4, m2 ; cb = C - B, unsigned saturating
+
+ PABS_DIFF_WD m3, m6, m7 ; |ab - bc| as dwords in m3 (m6, m7 clobbered)
+ PABS_DIFF_WD m5, m4, m7 ; |ba - cb| as dwords in m5 (m4, m7 clobbered); the two terms together equal |A + C - 2*B| per sample
+ paddd m_sum, m3 ; accumulate first term
+ paddd m_sum, m5 ; accumulate second term
+ jl .loop_16bit ; repeat while index < width (signed compare); the body always runs at least once. NOTE(review): no tail handling here - a width that is not a multiple of %1 makes the last iteration read and sum past width; presumably the caller accounts for this - confirm
+
+ mova m2, m_sum ; horizontal reduction: fold all dword lanes into lane 0 of m_sum
+%if mmsize == 16
+ psrldq m2, 4 ; bring lane 1 down to lane 0
+ paddd m_sum, m2 ; lane 0 = s0 + s1
+ psrldq m2, 4 ; bring original lane 2 down to lane 0
+ paddd m_sum, m2 ; lane 0 = s0 + s1 + s2
+ psrldq m2, 4 ; bring original lane 3 down to lane 0
+ paddd m_sum, m2 ; lane 0 = s0 + s1 + s2 + s3
+%else
+ psrlq m2, 32 ; 8-byte registers: bring the high dword down to lane 0
+ paddd m_sum, m2 ; lane 0 = s0 + s1
+%endif
+ movd eax, m_sum ; return value = low dword of m_sum (the total)
+ RET
+%endmacro
+
+INIT_XMM sse2
+IDET_FILTER_LINE_16BIT 8 ; SSE2 variant: 8 words per iteration
+%if ARCH_X86_32
+INIT_MMX mmx
+IDET_FILTER_LINE_16BIT 4 ; MMX variant: 4 words per iteration, emitted for 32-bit x86 builds only
+%endif ; ARCH_X86_32
+
+;******************************************************************************
; SSE2 8-bit implementation that does 16-bytes at a time:
+
INIT_XMM sse2
cglobal idet_filter_line, 4, 6, 7, a, b, c, width, index, total
xor indexq, indexq