aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorplepere <pierre-edouard.lepere@insa-rennes.fr>2014-06-16 14:47:21 +0200
committerMichael Niedermayer <michaelni@gmx.at>2014-06-25 14:49:44 +0200
commit942e22c651166e8aa67bfffa7a431970200d3203 (patch)
tree5021462f1fc42fd5720fcb5e008d1cfef33ab881
parenta30f1b15c7ee559fecd14b381d1f4352e394ab5d (diff)
downloadffmpeg-942e22c651166e8aa67bfffa7a431970200d3203.tar.gz
avcodec/x86/hevc: add avx2 dc idct
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
-rw-r--r--libavcodec/x86/hevc_idct.asm51
-rw-r--r--libavcodec/x86/hevcdsp.h6
-rw-r--r--libavcodec/x86/hevcdsp_init.c18
3 files changed, 72 insertions, 3 deletions
diff --git a/libavcodec/x86/hevc_idct.asm b/libavcodec/x86/hevc_idct.asm
index 6963dc78c5..31532ae907 100644
--- a/libavcodec/x86/hevc_idct.asm
+++ b/libavcodec/x86/hevc_idct.asm
@@ -20,12 +20,12 @@
; */
%include "libavutil/x86/x86util.asm"
-SECTION_RODATA
-max_pixels_10: times 8 dw ((1 << 10)-1)
+SECTION_RODATA 32 ; NOTE(review): the argument is presumably the section alignment in bytes (x86inc convention) - confirm
+max_pixels_10: times 16 dw ((1 << 10)-1) ; 16 words (32 bytes) of 1023 so a full ymm "mova m6, [max_pixels_10]" stays inside the table
dc_add_10: times 4 dd ((1 << 14-10) + 1)
-SECTION .text
+SECTION_TEXT 32
;the idct_dc_add macros and functions were largely inspired by x264 project's code in the h264_idct.asm file
@@ -41,6 +41,18 @@ SECTION .text
packuswb m1, m1
%endmacro
+%macro DC_ADD_INIT_AVX2 2 ; %1 = GPR holding the DC coefficient (clobbered), %2 = GPR that is multiplied by 3 into %1 (callers pass the stride)
+ add %1w, ((1 << 14-8) + 1) ; 16-bit add of the rounding term (64 + 1)
+ sar %1w, (15-8) ; arithmetic shift right by 7 -> rounded, signed dc
+ movd xm0, %1d
+ vpbroadcastw m0, xm0 ;SPLATW: replicate the low word (dc) into every word of m0
+ lea %1, [%2*3] ; %1 is free again: reuse it for 3 * %2
+ pxor m1, m1
+ psubw m1, m0 ; m1 = 0 - dc in every word
+ packuswb m0, m0 ; pack words to bytes with unsigned saturation: bytes of m0 = max(dc, 0)
+ packuswb m1, m1 ; likewise, bytes of m1 = max(-dc, 0)
+%endmacro
+
%macro DC_ADD_OP 4
%1 m2, [%2 ]
%1 m3, [%2+%3 ]
@@ -112,6 +124,19 @@ cglobal hevc_idct16_dc_add_8, 3, 4, 0
DC_ADD_OP mova, r0, r2, r3
RET
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+; void ff_hevc_idct32_dc_add_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_idct32_dc_add_8, 3, 4, 6
+ movsx r3, word [r1] ; sign-extend the first coefficient (the DC term) into r3
+ DC_ADD_INIT_AVX2 r3, r2 ; m0/m1 = saturated +dc/-dc bytes, r3 = 3 * r2
+ DC_ADD_OP mova, r0, r2, r3 ; first group of rows; DC_ADD_OP takes exactly 4 arguments
+%rep 7 ; 7 more groups, r0 advanced by 4 * r2 before each one
+ lea r0, [r0+r2*4]
+ DC_ADD_OP mova, r0, r2, r3
+%endrep
+ RET
+%endif ;HAVE_AVX2_EXTERNAL
;-----------------------------------------------------------------------------
; void ff_hevc_idct_dc_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
@@ -178,3 +203,23 @@ IDCT8_DC_ADD
INIT_XMM avx
IDCT8_DC_ADD
%endif
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal hevc_idct16_dc_add_10,3,4,7 ; presumably r0 = dst, r1 = block/coeffs, r2 = stride (see the prototype comment of the 10-bit section) - confirm
+ mov r1w, [r1] ; load the first 16-bit coefficient; the pointer in r1 is not needed afterwards
+ add r1w, ((1 << 4) + 1) ; rounding term (16 + 1)
+ sar r1w, 5 ; arithmetic shift -> rounded, signed dc
+ movd xm0, r1d
+ lea r1, [r2*3] ; r1 reused: 3 * r2, passed as third macro argument below
+ vpbroadcastw m0, xm0 ;SPLATW: replicate dc into every word of m0
+ mova m6, [max_pixels_10] ; every word = 1023; presumably the upper clamp used by IDCT_DC_ADD_OP_10 (macro not shown here)
+ IDCT_DC_ADD_OP_10 r0, r2, r1
+ lea r0, [r0+r2*4] ; step dst by 4 * r2 between macro invocations
+ IDCT_DC_ADD_OP_10 r0, r2, r1
+ lea r0, [r0+r2*4]
+ IDCT_DC_ADD_OP_10 r0, r2, r1
+ lea r0, [r0+r2*4]
+ IDCT_DC_ADD_OP_10 r0, r2, r1
+ RET
+%endif ;HAVE_AVX2_EXTERNAL
diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h
index 029492eca3..661a860bd8 100644
--- a/libavcodec/x86/hevcdsp.h
+++ b/libavcodec/x86/hevcdsp.h
@@ -133,6 +133,8 @@ idct_dc_proto(8, 8,mmxext);
idct_dc_proto(16,8, sse2);
idct_dc_proto(32,8, sse2);
+idct_dc_proto(32,8, avx2); // AVX2 32x32 8-bit DC add; presumably declares ff_hevc_idct32_dc_add_8_avx2 (macro body not shown here)
+
idct_dc_proto(4, 10,mmxext);
idct_dc_proto(8, 10, sse2);
@@ -142,6 +144,10 @@ idct_dc_proto(8, 10, avx);
idct_dc_proto(16,10, avx);
idct_dc_proto(32,10, avx);
+idct_dc_proto(16,10, avx2); // AVX2 16x16 10-bit DC add; presumably declares ff_hevc_idct16_dc_add_10_avx2 (macro body not shown here)
+idct_dc_proto(32,10, avx2); // AVX2 32x32 10-bit DC add; the function is a C wrapper in hevcdsp_init.c built from four 16x16 calls
+
+
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index 58a0891e5b..cad236ddad 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -92,6 +92,17 @@ void ff_hevc_idct32_dc_add_10_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t strid
}
#endif //HAVE_AVX_EXTERNAL
+#if HAVE_AVX2_EXTERNAL
+
+/*
+ * 32x32 DC add for 10-bit content, built from four calls to the 16x16 AVX2
+ * routine: top-left, top-right, bottom-left, bottom-right, in that order.
+ * The right-hand quadrants start 32 bytes into the row (presumably 16
+ * samples of 2 bytes each); the bottom quadrants start 16 rows down.
+ * The same coeffs pointer is handed to every call.
+ */
+void ff_hevc_idct32_dc_add_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+{
+    uint8_t *const upper = dst;
+    uint8_t *const lower = dst + 16 * stride;
+
+    ff_hevc_idct16_dc_add_10_avx2(upper,      coeffs, stride);
+    ff_hevc_idct16_dc_add_10_avx2(upper + 32, coeffs, stride);
+    ff_hevc_idct16_dc_add_10_avx2(lower,      coeffs, stride);
+    ff_hevc_idct16_dc_add_10_avx2(lower + 32, coeffs, stride);
+}
+#endif //HAVE_AVX2_EXTERNAL
+
#define mc_rep_func(name, bitd, step, W, opt) \
void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *_dst, ptrdiff_t dststride, \
uint8_t *_src, ptrdiff_t _srcstride, int height, \
@@ -438,6 +449,9 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth)
QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 8, sse4);
QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 8, sse4);
}
+ if (EXTERNAL_AVX2(mm_flags)) {
+ c->transform_dc_add[3] = ff_hevc_idct32_dc_add_8_avx2;
+ }
} else if (bit_depth == 10) {
if (EXTERNAL_MMXEXT(mm_flags)) {
c->transform_dc_add[0] = ff_hevc_idct4_dc_add_10_mmxext;
@@ -473,6 +487,10 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth)
c->transform_dc_add[2] = ff_hevc_idct16_dc_add_10_avx;
c->transform_dc_add[3] = ff_hevc_idct32_dc_add_10_avx;
}
+ if (EXTERNAL_AVX2(mm_flags)) {
+ c->transform_dc_add[2] = ff_hevc_idct16_dc_add_10_avx2;
+ c->transform_dc_add[3] = ff_hevc_idct32_dc_add_10_avx2;
+ }
}
}