author     Ronald S. Bultje <rsbultje@gmail.com>   2013-12-24 16:17:03 -0500
committer  Anton Khirnov <anton@khirnov.net>       2016-08-03 10:57:55 +0200
commit     3a09494939ddb2f2fd0f8d015162d5174ec07d4c (patch)
tree       3b570e55aef4e557c65050334e2acb3b0e7a9e83
parent     89466de4aeaf5e359489b81b8a9920a2bc7936d6 (diff)
vp9mc/x86: add 16px functions (64bit only).
Signed-off-by: Anton Khirnov <anton@khirnov.net>
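
For reference, the new 16px functions compute the standard VP9 8-tap 1D
convolution: each output pixel is a weighted sum of 8 source pixels at
offsets -3..+4 around the current position, with 7-bit filter taps and
(sum + 64) >> 7 rounding; the avg variants additionally average the result
with the pixels already in dst. A minimal scalar sketch of the put/horizontal
case (illustrative names, not FFmpeg's actual C code):

    #include <stdint.h>

    static uint8_t clip_pixel(int v)
    {
        return v < 0 ? 0 : v > 255 ? 255 : (uint8_t) v;
    }

    /* One row of the 8-tap horizontal filter: 8 taps at offsets -3..+4
     * around src[x], 7-bit coefficients, rounded with (sum + 64) >> 7. */
    static void put_8tap_1d_h_c(uint8_t *dst, const uint8_t *src,
                                const int8_t filter[8], int w)
    {
        for (int x = 0; x < w; x++) {
            int sum = 0;
            for (int k = 0; k < 8; k++)
                sum += filter[k] * src[x + k - 3];
            dst[x] = clip_pixel((sum + 64) >> 7);
        }
    }
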
-rw-r--r--  libavcodec/x86/vp9dsp_init.c |   5
-rw-r--r--  libavcodec/x86/vp9mc.asm     | 122
2 files changed, 127 insertions(+), 0 deletions(-)
diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index 833d983ab1..dc08e60662 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -64,6 +64,9 @@ ff_ ## avg ## _8tap_1d_ ## dir ## _ ## sz ## _ ## opt(uint8_t *dst, \
 mc_funcs(4);
 mc_funcs(8);
+#if ARCH_X86_64
+mc_funcs(16);
+#endif
 #undef mc_funcs
 #undef mc_func
@@ -95,7 +98,9 @@ ff_ ## avg ## _8tap_1d_ ## dir ## _ ## sz ## _ ## opt(uint8_t *dst, \
 mc_rep_func(put, sz, hsz, v, ssse3); \
 mc_rep_func(avg, sz, hsz, v, ssse3)
+#if ARCH_X86_32
 mc_rep_funcs(16, 8);
+#endif
 mc_rep_funcs(32, 16);
 mc_rep_funcs(64, 32);
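
The mc_rep_funcs change is the dispatch side of this commit: on 32-bit, where
the new native 16px asm is not built, mc_rep_funcs(16, 8) keeps synthesizing
the 16px case from two 8px calls; 32px and 64px remain wrappers over 16px on
both architectures. Conceptually the wrapper splits a block into two adjacent
half-width columns, roughly like this (an illustrative C sketch; the full
ff_*_8tap_1d_* prototype is not shown in this diff, so the parameter list
here is an assumption):

    #include <stddef.h>
    #include <stdint.h>

    /* Hypothetical stand-in for the native 16px asm function. */
    void put_8tap_1d_h_16(uint8_t *dst, const uint8_t *src,
                          ptrdiff_t dstride, ptrdiff_t sstride,
                          int h, const int8_t *filter);

    /* Illustrative equivalent of mc_rep_funcs(32, 16) for put/h:
     * a 32px-wide block is two adjacent 16px columns. */
    static void put_8tap_1d_h_32(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t dstride, ptrdiff_t sstride,
                                 int h, const int8_t *filter)
    {
        put_8tap_1d_h_16(dst,      src,      dstride, sstride, h, filter);
        put_8tap_1d_h_16(dst + 16, src + 16, dstride, sstride, h, filter);
    }
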
diff --git a/libavcodec/x86/vp9mc.asm b/libavcodec/x86/vp9mc.asm
index 59e56687f2..152715c9b9 100644
--- a/libavcodec/x86/vp9mc.asm
+++ b/libavcodec/x86/vp9mc.asm
@@ -144,6 +144,62 @@ INIT_XMM ssse3
 filter_h_fn put
 filter_h_fn avg
+%if ARCH_X86_64
+%macro filter_hx2_fn 1
+%assign %%px mmsize
+cglobal %1_8tap_1d_h_ %+ %%px, 6, 6, 14, dst, src, dstride, sstride, h, filtery
+    mova          m13, [pw_256]
+    mova           m8, [filteryq+ 0]
+    mova           m9, [filteryq+16]
+    mova          m10, [filteryq+32]
+    mova          m11, [filteryq+48]
+.loop:
+    movu           m0, [srcq-3]
+    movu           m1, [srcq-2]
+    movu           m2, [srcq-1]
+    movu           m3, [srcq+0]
+    movu           m4, [srcq+1]
+    movu           m5, [srcq+2]
+    movu           m6, [srcq+3]
+    movu           m7, [srcq+4]
+    add          srcq, sstrideq
+    SBUTTERFLY     bw, 0, 1, 12
+    SBUTTERFLY     bw, 2, 3, 12
+    SBUTTERFLY     bw, 4, 5, 12
+    SBUTTERFLY     bw, 6, 7, 12
+    pmaddubsw      m0, m8
+    pmaddubsw      m1, m8
+    pmaddubsw      m2, m9
+    pmaddubsw      m3, m9
+    pmaddubsw      m4, m10
+    pmaddubsw      m5, m10
+    pmaddubsw      m6, m11
+    pmaddubsw      m7, m11
+    paddw          m0, m2
+    paddw          m1, m3
+    paddw          m4, m6
+    paddw          m5, m7
+    paddsw         m0, m4
+    paddsw         m1, m5
+    pmulhrsw       m0, m13
+    pmulhrsw       m1, m13
+    packuswb       m0, m1
+%ifidn %1, avg
+    pavgb          m0, [dstq]
+%endif
+    mova       [dstq], m0
+    add          dstq, dstrideq
+    dec            hd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM ssse3
+filter_hx2_fn put
+filter_hx2_fn avg
+
+%endif ; ARCH_X86_64
+
 %macro filter_v_fn 1
 %assign %%px mmsize/2
 %if ARCH_X86_64
@@ -218,6 +274,72 @@ INIT_XMM ssse3
 filter_v_fn put
 filter_v_fn avg
+%if ARCH_X86_64
+
+%macro filter_vx2_fn 1
+%assign %%px mmsize
+cglobal %1_8tap_1d_v_ %+ %%px, 6, 8, 14, dst, src, dstride, sstride, h, filtery, src4, sstride3
+    sub          srcq, sstrideq
+    lea     sstride3q, [sstrideq*3]
+    sub          srcq, sstrideq
+    mova          m13, [pw_256]
+    sub          srcq, sstrideq
+    mova           m8, [filteryq+ 0]
+    lea         src4q, [srcq+sstrideq*4]
+    mova           m9, [filteryq+16]
+    mova          m10, [filteryq+32]
+    mova          m11, [filteryq+48]
+.loop:
+    ; FIXME maybe reuse loads from previous rows, or just
+    ; more generally unroll this to prevent multiple loads of
+    ; the same data?
+    movu           m0, [srcq]
+    movu           m1, [srcq+sstrideq]
+    movu           m2, [srcq+sstrideq*2]
+    movu           m3, [srcq+sstride3q]
+    movu           m4, [src4q]
+    movu           m5, [src4q+sstrideq]
+    movu           m6, [src4q+sstrideq*2]
+    movu           m7, [src4q+sstride3q]
+    add          srcq, sstrideq
+    add         src4q, sstrideq
+    SBUTTERFLY     bw, 0, 1, 12
+    SBUTTERFLY     bw, 2, 3, 12
+    SBUTTERFLY     bw, 4, 5, 12
+    SBUTTERFLY     bw, 6, 7, 12
+    pmaddubsw      m0, m8
+    pmaddubsw      m1, m8
+    pmaddubsw      m2, m9
+    pmaddubsw      m3, m9
+    pmaddubsw      m4, m10
+    pmaddubsw      m5, m10
+    pmaddubsw      m6, m11
+    pmaddubsw      m7, m11
+    paddw          m0, m2
+    paddw          m1, m3
+    paddw          m4, m6
+    paddw          m5, m7
+    paddsw         m0, m4
+    paddsw         m1, m5
+    pmulhrsw       m0, m13
+    pmulhrsw       m1, m13
+    packuswb       m0, m1
+%ifidn %1, avg
+    pavgb          m0, [dstq]
+%endif
+    mova       [dstq], m0
+    add          dstq, dstrideq
+    dec            hd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM ssse3
+filter_vx2_fn put
+filter_vx2_fn avg
+
+%endif ; ARCH_X86_64
+
 %macro fpel_fn 6
 %if %2 == 4
 %define %%srcfn movh
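
Both new loops rely on the same SSSE3 idiom: SBUTTERFLY bw interleaves pairs
of byte vectors so that pmaddubsw can multiply the unsigned pixels by the
signed taps and sum each adjacent tap pair in one step; the pair sums are
combined with paddw, the final combinations use paddsw so overflow saturates
rather than wraps, and the closing pmulhrsw against pw_256 performs VP9's
(sum + 64) >> 7 rounding, because pmulhrsw computes (2*a*b + 0x8000) >> 16,
which for b = 256 reduces to (a + 64) >> 7. A small scalar check of that
identity (illustrative test code, assuming arithmetic right shift of negative
ints, as on the targeted x86 compilers):

    #include <assert.h>
    #include <stdint.h>

    /* Scalar model of one 16-bit lane of SSSE3 pmulhrsw:
     * dst = (a * b * 2 + 0x8000) >> 16 (rounded high half). */
    static int16_t pmulhrsw_1(int16_t a, int16_t b)
    {
        return (int16_t)((a * b * 2 + 0x8000) >> 16);
    }

    int main(void)
    {
        /* With b = 256 (pw_256) this is exactly VP9's (a + 64) >> 7
         * rounding step, for every possible 16-bit input. */
        for (int a = -32768; a <= 32767; a++)
            assert(pmulhrsw_1((int16_t)a, 256) == ((a + 64) >> 7));
        return 0;
    }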