aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/x86
diff options
context:
space:
mode:
author: Rostislav Pehlivanov <rpehlivanov@ob-encoder.com> 2016-06-23 18:06:55 +0100
committer: Rostislav Pehlivanov <atomnuker@gmail.com> 2016-07-11 23:33:24 +0100
commit: bd61f3c6bfb83d7691e124a02394ae76737c26f4 (patch)
tree: ce64a9726dc8247b20ec32ff323bb88fecf6e79a /libavcodec/x86
parent: 80721cc1ff1f1c8c460c136184ed6416a73b4bfd (diff)
download: ffmpeg-bd61f3c6bfb83d7691e124a02394ae76737c26f4.tar.gz
diracdsp: add SIMD for the 10 bit version of put_signed_rect_clamped
Signed-off-by: Rostislav Pehlivanov <rpehlivanov@obe.tv>
Diffstat (limited to 'libavcodec/x86')
-rw-r--r--libavcodec/x86/diracdsp.asm42
-rw-r--r--libavcodec/x86/diracdsp_init.c4
2 files changed, 46 insertions, 0 deletions
diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm
index 8e9f0fbf02..d86b5438c5 100644
--- a/libavcodec/x86/diracdsp.asm
+++ b/libavcodec/x86/diracdsp.asm
@@ -22,6 +22,8 @@
SECTION_RODATA
pw_7: times 8 dw 7
+convert_to_unsigned_10bit: times 4 dd 0x200 ; four dwords of 0x200; put_signed_rect_clamped_10 adds this to every 32-bit sample (paddd)
+clip_10bit: times 8 dw 0x3ff ; eight words of 0x3ff; put_signed_rect_clamped_10 loads it into m3 as the last CLIPW operand
cextern pw_3
cextern pw_16
@@ -300,3 +302,43 @@ cglobal dequant_subband_32, 7, 7, 4, src, dst, stride, qf, qs, tot_v, tot_h
jg .loop_v
RET
+
+%if ARCH_X86_64 == 1
+; void put_signed_rect_clamped_10(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height)
+;
+; Reads rows of signed 32-bit samples from src, adds 0x200 to each one, packs
+; them to 16-bit words with unsigned saturation, limits them with
+; CLIPW against m2 (zero) and m3 (0x3ff), and stores the words to dst.
+;
+; Register roles:
+;   r6, r7   start of the current source / destination row
+;   r8       saved copy of the width, reloaded for every row
+;   m2       all zero            m3 = clip_10bit
+;   m4       convert_to_unsigned_10bit
+;
+; Notes:
+;   - Both strides are applied as byte offsets to the row pointers.
+;   - Each inner iteration loads 2*mmsize bytes and stores mmsize bytes while
+;     the width counter drops by 8, so a width that is not a multiple of 8
+;     still gets a full final vector read and written.
+;   - Both loops test their counter after the body, so one row and one vector
+;     are always processed.
+;   - Loads and stores are unaligned (movu).
+cglobal put_signed_rect_clamped_10, 6, 9, 6, dst, dst_stride, src, src_stride, w, h
+    ; The strides arrive as C ints, so the upper halves of their registers are
+    ; not guaranteed to hold the sign extension. Widen them before they are
+    ; used in 64-bit pointer arithmetic below.
+    movsxd   dst_strideq, dst_strided
+    movsxd   src_strideq, src_strided
+
+    mov      r6, srcq
+    mov      r7, dstq
+    mov      r8, wq             ; only the low 32 bits are consumed (sub wd, 8)
+    pxor     m2, m2
+    mova     m3, [clip_10bit]
+    mova     m4, [convert_to_unsigned_10bit]
+
+    .loop_h:
+    ; Rewind to the start of the current row and restore the width counter.
+    mov      srcq, r6
+    mov      dstq, r7
+    mov      wq, r8
+
+    .loop_w:
+    movu     m0, [srcq+0*mmsize]
+    movu     m1, [srcq+1*mmsize]
+
+    paddd    m0, m4
+    paddd    m1, m4
+    packusdw m0, m0, m1         ; signed dwords -> unsigned words, saturating
+    CLIPW    m0, m2, m3 ; packusdw saturates so it's fine
+
+    movu     [dstq], m0
+
+    add      srcq, 2*mmsize
+    add      dstq, 1*mmsize
+    sub      wd, 8
+    jg       .loop_w
+
+    ; Advance both row pointers by their (now sign-extended) strides.
+    add      r6, src_strideq
+    add      r7, dst_strideq
+    sub      hd, 1
+    jg       .loop_h
+
+    RET
+
+%endif
diff --git a/libavcodec/x86/diracdsp_init.c b/libavcodec/x86/diracdsp_init.c
index 26b885d530..43aab6a863 100644
--- a/libavcodec/x86/diracdsp_init.c
+++ b/libavcodec/x86/diracdsp_init.c
@@ -45,6 +45,9 @@ void ff_put_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, i
void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+#if ARCH_X86_64 /* the asm implementation is wrapped in %if ARCH_X86_64 == 1, so the symbol only exists on x86-64 */
+void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height);
+#endif
void ff_dequant_subband_32_sse4(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h);
@@ -189,5 +192,6 @@ void ff_diracdsp_init_x86(DiracDSPContext* c)
if (EXTERNAL_SSE4(mm_flags)) {
c->dequant_subband[1] = ff_dequant_subband_32_sse4;
+ c->put_signed_rect_clamped[1] = ff_put_signed_rect_clamped_10_sse4;
}
}