av_filter/x86/idet: MMX/SSE2 implementation of 16bits filter_line()

Tested on http://ps-auxw.de/10bit-h264-sample/10bit-eldorado.mkv
MMX: ~30% faster decoding overall
SSE2: ~40% faster

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
author: Pascal Massimino <pascal.massimino@gmail.com> 2014-09-09 14:38:58 +0200
committer: Michael Niedermayer <michaelni@gmx.at> 2014-09-09 16:47:22 +0200
commit: e3fd6a3a4e3d28d8a50bb6ec3e19449bc4e0d3db (patch)
tree: 4706c3e78a5046f13e80142b1f12cbbc0756e2b6 /libavfilter/x86/vf_idet.asm
parent: 881f96c4c2ef0c0162f63a370cbfff3c1e1feb2a (diff)
download: ffmpeg-e3fd6a3a4e3d28d8a50bb6ec3e19449bc4e0d3db.tar.gz
1 files changed, 68 insertions, 2 deletions
diff --git a/libavfilter/x86/vf_idet.asm b/libavfilter/x86/vf_idet.asm
index 14b16c5779..4649cae030 100644
--- a/libavfilter/x86/vf_idet.asm
+++ b/libavfilter/x86/vf_idet.asm
@@ -25,8 +25,6 @@
 
 SECTION_TEXT
 
-%if ARCH_X86_32
-
 ; Implementation that does 8-bytes at a time using single-word operations.
 %macro IDET_FILTER_LINE 1
 INIT_MMX %1
@@ -78,11 +76,79 @@ cglobal idet_filter_line, 4, 5, 0, a, b, c, width, index
     RET
 %endmacro
 
+%if ARCH_X86_32
 IDET_FILTER_LINE mmxext
 IDET_FILTER_LINE mmx
 %endif
 
+;******************************************************************************
+; 16-bit implementation that processes 4 (MMX) or 8 (SSE2) pixels at a time
+
+%macro PABS_DIFF_WD 3    ; %1=a, %2=b, %3=junk; output: %1 = |a - b| per word, widened to dwords with low/high halves added; clobbers %2 and %3; expects the caller to define m_zero as an all-zero register
+  psubusw   %3, %2, %1   ; %3 = b - a per word, unsigned-saturated at 0
+  psubusw   %1, %2       ; %1 = a - b per word, unsigned-saturated at 0
+  por       %1, %3       ; at most one of the two differences is non-zero, so OR yields |a - b|
+
+  mova      %2, %1       ; second copy so that both halves can be widened
+  punpcklwd %1, m_zero   ; low-half words interleaved with m_zero -> zero-extended dwords
+  punpckhwd %2, m_zero   ; high-half words interleaved with m_zero -> zero-extended dwords
+  paddd     %1, %2       ; %1 = dword-wise sum of the two widened halves
+%endmacro
+
+%macro IDET_FILTER_LINE_16BIT 1   ; %1=increment (4 or 8 words) added to index per loop pass; the emitted function returns in eax the sum over all processed words of | |A-B| - |B-C| |
+cglobal idet_filter_line_16bit, 4, 5, 8, a, b, c, width, index  ; a, b, c, width are the named arguments; index is the loop counter
+    xor       indexq, indexq         ; index = 0, counted in 16-bit elements
+%define m_zero m1
+%define m_sum  m0
+    pxor      m_sum, m_sum           ; dword accumulators start at 0
+    pxor      m_zero, m_zero         ; constant zero used by PABS_DIFF_WD for word->dword widening
+
+.loop_16bit:                         ; NOTE(review): every pass loads a full register from a, b and c; presumably the caller handles widths that are not a multiple of %1 - confirm
+    movu      m2, [bq + indexq * 2]  ; B (index * 2 = byte offset of 16-bit elements)
+    movu      m3, [aq + indexq * 2]  ; A
+    mova      m6, m2                 ; second copy of B, consumed by the bc difference
+    psubusw   m5, m2, m3             ; ba = B - A, unsigned-saturated
+
+    movu      m4, [cq + indexq * 2]  ; C
+    add       indexq, %1             ; advance past the words loaded in this pass
+    psubusw   m3, m2                 ; ab = A - B, unsigned-saturated
+    CMP       indexd, widthd         ; flags are consumed by the jl below; the vector ops in between do not modify them
+
+    psubusw   m6, m4                 ; bc = B - C, unsigned-saturated
+    psubusw   m4, m2                 ; cb = C - B, unsigned-saturated
+
+    PABS_DIFF_WD   m3, m6, m7        ; m3 = |ab - bc| as dword partial sums (m6, m7 clobbered)
+    PABS_DIFF_WD   m5, m4, m7        ; m5 = |ba - cb| as dword partial sums (m4, m7 clobbered)
+    paddd          m_sum, m3         ; accumulate both partial results
+    paddd          m_sum, m5
+    jl        .loop_16bit            ; repeat while index < width (signed compare); the body always runs at least once
+
+    mova      m2, m_sum              ; copy of the accumulators for the horizontal add
+%if mmsize == 16
+    psrldq    m2, 4                  ; shift right one dword: lane 1 -> lane 0
+    paddd     m_sum, m2              ; lane 0 = s0 + s1
+    psrldq    m2, 4                  ; original lane 2 -> lane 0
+    paddd     m_sum, m2              ; lane 0 = s0 + s1 + s2
+    psrldq    m2, 4                  ; original lane 3 -> lane 0
+    paddd     m_sum, m2              ; lane 0 = s0 + s1 + s2 + s3
+%else
+    psrlq     m2, 32                 ; high dword -> low dword
+    paddd     m_sum, m2              ; low dword = s0 + s1
+%endif
+    movd      eax, m_sum             ; return value = low dword, i.e. the total
+    RET
+%endmacro
+
+INIT_XMM sse2
+IDET_FILTER_LINE_16BIT 8            ; SSE2 version: index advances by 8 words per pass
+%if ARCH_X86_32
+INIT_MMX mmx
+IDET_FILTER_LINE_16BIT 4            ; MMX version, assembled only when ARCH_X86_32 is set: 4 words per pass
+%endif
+
+;******************************************************************************
 ; SSE2 8-bit implementation that does 16-bytes at a time:
+
 INIT_XMM sse2
 cglobal idet_filter_line, 4, 6, 7, a, b, c, width, index, total
     xor       indexq, indexq
author	Pascal Massimino <pascal.massimino@gmail.com>	2014-09-09 14:38:58 +0200
committer	Michael Niedermayer <michaelni@gmx.at>	2014-09-09 16:47:22 +0200
commit	e3fd6a3a4e3d28d8a50bb6ec3e19449bc4e0d3db (patch)
tree	4706c3e78a5046f13e80142b1f12cbbc0756e2b6 /libavfilter/x86/vf_idet.asm
parent	881f96c4c2ef0c0162f63a370cbfff3c1e1feb2a (diff)
download	ffmpeg-e3fd6a3a4e3d28d8a50bb6ec3e19449bc4e0d3db.tar.gz