diff options
author | Rostislav Pehlivanov <rpehlivanov@ob-encoder.com> | 2016-06-23 18:06:55 +0100 |
---|---|---|
committer | Rostislav Pehlivanov <atomnuker@gmail.com> | 2016-07-11 23:33:24 +0100 |
commit | bd61f3c6bfb83d7691e124a02394ae76737c26f4 (patch) | |
tree | ce64a9726dc8247b20ec32ff323bb88fecf6e79a | |
parent | 80721cc1ff1f1c8c460c136184ed6416a73b4bfd (diff) | |
download | ffmpeg-bd61f3c6bfb83d7691e124a02394ae76737c26f4.tar.gz |
diracdsp: add SIMD for the 10 bit version of put_signed_rect_clamped
Signed-off-by: Rostislav Pehlivanov <rpehlivanov@obe.tv>
-rw-r--r-- | libavcodec/x86/diracdsp.asm | 42 | ||||
-rw-r--r-- | libavcodec/x86/diracdsp_init.c | 4 |
2 files changed, 46 insertions, 0 deletions
diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm
index 8e9f0fbf02..d86b5438c5 100644
--- a/libavcodec/x86/diracdsp.asm
+++ b/libavcodec/x86/diracdsp.asm
@@ -22,6 +22,8 @@
 
 SECTION_RODATA
 pw_7: times 8 dw 7
+convert_to_unsigned_10bit: times 4 dd 0x200
+clip_10bit: times 8 dw 0x3ff
 
 cextern pw_3
 cextern pw_16
@@ -300,3 +302,43 @@ cglobal dequant_subband_32, 7, 7, 4, src, dst, stride, qf, qs, tot_v, tot_h
     jg .loop_v
 
     RET
+
+%if ARCH_X86_64 == 1
+; void put_signed_rect_clamped_10(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height)
+cglobal put_signed_rect_clamped_10, 6, 9, 6, dst, dst_stride, src, src_stride, w, h
+    mov r6, srcq
+    mov r7, dstq
+    mov r8, wq
+    pxor m2, m2
+    mova m3, [clip_10bit]
+    mova m4, [convert_to_unsigned_10bit]
+
+    .loop_h:
+    mov srcq, r6
+    mov dstq, r7
+    mov wq, r8
+
+    .loop_w:
+    movu m0, [srcq+0*mmsize]
+    movu m1, [srcq+1*mmsize]
+
+    paddd m0, m4
+    paddd m1, m4
+    packusdw m0, m0, m1
+    CLIPW m0, m2, m3 ; packusdw saturates so it's fine
+
+    movu [dstq], m0
+
+    add srcq, 2*mmsize
+    add dstq, 1*mmsize
+    sub wd, 8
+    jg .loop_w
+
+    add r6, src_strideq
+    add r7, dst_strideq
+    sub hd, 1
+    jg .loop_h
+
+    RET
+
+%endif
diff --git a/libavcodec/x86/diracdsp_init.c b/libavcodec/x86/diracdsp_init.c
index 26b885d530..43aab6a863 100644
--- a/libavcodec/x86/diracdsp_init.c
+++ b/libavcodec/x86/diracdsp_init.c
@@ -45,6 +45,9 @@ void ff_put_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, i
 void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
 void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
 void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+#if ARCH_X86_64
+void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height);
+#endif
 
 void ff_dequant_subband_32_sse4(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h);
 
@@ -189,5 +192,6 @@ void ff_diracdsp_init_x86(DiracDSPContext* c)
 
     if (EXTERNAL_SSE4(mm_flags)) {
         c->dequant_subband[1] = ff_dequant_subband_32_sse4;
+        c->put_signed_rect_clamped[1] = ff_put_signed_rect_clamped_10_sse4;
     }
 }