author     James Almer <jamrial@gmail.com>            2014-05-24 01:30:28 -0300
committer  Michael Niedermayer <michaelni@gmx.at>     2014-05-24 21:46:25 +0200
commit     61eea421b23f124139683add1dadcd6036050dc6 (patch)
tree       b2b07dc620c95daa9685b045201156d817f00124 /libavcodec
parent     ddeb58b90c41d1eea21122f30c9d78b30744ea88 (diff)
download   ffmpeg-61eea421b23f124139683add1dadcd6036050dc6.tar.gz
x86/dsputilenc: port sum_abs_dctelem functions to yasm
Signed-off-by: James Almer <jamrial@gmail.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
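
For context, sum_abs_dctelem() computes the sum of the absolute values of the 64 coefficients of an 8x8 DCT block, truncated to 16 bits. A minimal scalar sketch of that contract (illustration only, not part of this commit; the _ref name is invented here):

#include <stdint.h>

/* Scalar model of the SIMD versions: sum of |coefficient| over the 64
 * int16_t elements of an 8x8 block. The final mask mirrors the
 * "and eax, 0xFFFF" at the end of the ported asm. */
static int sum_abs_dctelem_ref(const int16_t *block)
{
    unsigned sum = 0;
    for (int i = 0; i < 64; i++)
        sum += block[i] < 0 ? -block[i] : block[i];
    return sum & 0xFFFF;
}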
Diffstat (limited to 'libavcodec')
-rw-r--r--  libavcodec/x86/dsputilenc.asm    42
-rw-r--r--  libavcodec/x86/dsputilenc_mmx.c  130
2 files changed, 53 insertions, 119 deletions
diff --git a/libavcodec/x86/dsputilenc.asm b/libavcodec/x86/dsputilenc.asm
index 1f496ad32e..c06f28b978 100644
--- a/libavcodec/x86/dsputilenc.asm
+++ b/libavcodec/x86/dsputilenc.asm
@@ -487,3 +487,45 @@ cglobal pix_norm1, 2, 4
     movd    eax, m1
     RET
 
+%macro DCT_SAD4 1
+    mova      m2, [blockq+%1+0 ]
+    mova      m3, [blockq+%1+16]
+    mova      m4, [blockq+%1+32]
+    mova      m5, [blockq+%1+48]
+    ABS1_SUM  m2, m6, m0
+    ABS1_SUM  m3, m6, m1
+    ABS1_SUM  m4, m6, m0
+    ABS1_SUM  m5, m6, m1
+%endmacro
+
+;-----------------------------------------------
+;int ff_sum_abs_dctelem(int16_t *block)
+;-----------------------------------------------
+; %1 = number of xmm registers used
+
+%macro SUM_ABS_DCTELEM 1
+cglobal sum_abs_dctelem, 1, 1, %1, block
+    pxor    m0, m0
+    pxor    m1, m1
+    DCT_SAD4 0
+%if mmsize == 8
+    DCT_SAD4 8
+%endif
+    DCT_SAD4 64
+%if mmsize == 8
+    DCT_SAD4 72
+%endif
+    paddusw m0, m1
+    HSUM    m0, m1, eax
+    and     eax, 0xFFFF
+    RET
+%endmacro
+
+INIT_MMX mmx
+SUM_ABS_DCTELEM 0
+INIT_MMX mmxext
+SUM_ABS_DCTELEM 0
+INIT_XMM sse2
+SUM_ABS_DCTELEM 7
+INIT_XMM ssse3
+SUM_ABS_DCTELEM 6
diff --git a/libavcodec/x86/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c
index 1100fb07f7..e63d510ab9 100644
--- a/libavcodec/x86/dsputilenc_mmx.c
+++ b/libavcodec/x86/dsputilenc_mmx.c
@@ -38,6 +38,10 @@ void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1,
                         const uint8_t *s2, int stride);
 int ff_pix_sum16_mmx(uint8_t *pix, int line_size);
 int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
+int ff_sum_abs_dctelem_mmx(int16_t *block);
+int ff_sum_abs_dctelem_mmxext(int16_t *block);
+int ff_sum_abs_dctelem_sse2(int16_t *block);
+int ff_sum_abs_dctelem_ssse3(int16_t *block);
 
 #if HAVE_INLINE_ASM
 
@@ -759,118 +763,6 @@ static void sub_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *src1,
     *left = src2[w - 1];
 }
 
-#define MMABS_MMX(a,z)                          \
-    "pxor    " #z ", " #z "             \n\t"   \
-    "pcmpgtw " #a ", " #z "             \n\t"   \
-    "pxor    " #z ", " #a "             \n\t"   \
-    "psubw   " #z ", " #a "             \n\t"
-
-#define MMABS_MMXEXT(a, z)                      \
-    "pxor    " #z ", " #z "             \n\t"   \
-    "psubw   " #a ", " #z "             \n\t"   \
-    "pmaxsw  " #z ", " #a "             \n\t"
-
-#define MMABS_SSSE3(a,z)                        \
-    "pabsw   " #a ", " #a "             \n\t"
-
-#define MMABS_SUM(a,z, sum)                     \
-    MMABS(a,z)                                  \
-    "paddusw " #a ", " #sum "           \n\t"
-
-/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get
- * up to about 100k on extreme inputs. But that's very unlikely to occur in
- * natural video, and it's even more unlikely to not have any alternative
- * mvs/modes with lower cost. */
-#define HSUM_MMX(a, t, dst)                     \
-    "movq    " #a ", " #t "             \n\t"   \
-    "psrlq      $32, " #a "             \n\t"   \
-    "paddusw " #t ", " #a "             \n\t"   \
-    "movq    " #a ", " #t "             \n\t"   \
-    "psrlq      $16, " #a "             \n\t"   \
-    "paddusw " #t ", " #a "             \n\t"   \
-    "movd    " #a ", " #dst "           \n\t"   \
-
-#define HSUM_MMXEXT(a, t, dst)                  \
-    "pshufw   $0x0E, " #a ", " #t "     \n\t"   \
-    "paddusw " #t ", " #a "             \n\t"   \
-    "pshufw   $0x01, " #a ", " #t "     \n\t"   \
-    "paddusw " #t ", " #a "             \n\t"   \
-    "movd    " #a ", " #dst "           \n\t"   \
-
-#define HSUM_SSE2(a, t, dst)                    \
-    "movhlps " #a ", " #t "             \n\t"   \
-    "paddusw " #t ", " #a "             \n\t"   \
-    "pshuflw  $0x0E, " #a ", " #t "     \n\t"   \
-    "paddusw " #t ", " #a "             \n\t"   \
-    "pshuflw  $0x01, " #a ", " #t "     \n\t"   \
-    "paddusw " #t ", " #a "             \n\t"   \
-    "movd    " #a ", " #dst "           \n\t"   \
-
-#define DCT_SAD4(m, mm, o)                      \
-    "mov"#m" "#o" + 0(%1), " #mm "2     \n\t"   \
-    "mov"#m" "#o" + 16(%1), " #mm "3    \n\t"   \
-    "mov"#m" "#o" + 32(%1), " #mm "4    \n\t"   \
-    "mov"#m" "#o" + 48(%1), " #mm "5    \n\t"   \
-    MMABS_SUM(mm ## 2, mm ## 6, mm ## 0)        \
-    MMABS_SUM(mm ## 3, mm ## 7, mm ## 1)        \
-    MMABS_SUM(mm ## 4, mm ## 6, mm ## 0)        \
-    MMABS_SUM(mm ## 5, mm ## 7, mm ## 1)        \
-
-#define DCT_SAD_MMX                             \
-    "pxor    %%mm0, %%mm0               \n\t"   \
-    "pxor    %%mm1, %%mm1               \n\t"   \
-    DCT_SAD4(q, %%mm, 0)                        \
-    DCT_SAD4(q, %%mm, 8)                        \
-    DCT_SAD4(q, %%mm, 64)                       \
-    DCT_SAD4(q, %%mm, 72)                       \
-    "paddusw %%mm1, %%mm0               \n\t"   \
-    HSUM(%%mm0, %%mm1, %0)
-
-#define DCT_SAD_SSE2                            \
-    "pxor    %%xmm0, %%xmm0             \n\t"   \
-    "pxor    %%xmm1, %%xmm1             \n\t"   \
-    DCT_SAD4(dqa, %%xmm, 0)                     \
-    DCT_SAD4(dqa, %%xmm, 64)                    \
-    "paddusw %%xmm1, %%xmm0             \n\t"   \
-    HSUM(%%xmm0, %%xmm1, %0)
-
-#define DCT_SAD_FUNC(cpu)                           \
-static int sum_abs_dctelem_ ## cpu(int16_t *block)  \
-{                                                   \
-    int sum;                                        \
-    __asm__ volatile (                              \
-        DCT_SAD                                     \
-        :"=r"(sum)                                  \
-        :"r"(block));                               \
-    return sum & 0xFFFF;                            \
-}
-
-#define DCT_SAD         DCT_SAD_MMX
-#define HSUM(a, t, dst) HSUM_MMX(a, t, dst)
-#define MMABS(a, z)     MMABS_MMX(a, z)
-DCT_SAD_FUNC(mmx)
-#undef MMABS
-#undef HSUM
-
-#define HSUM(a, t, dst) HSUM_MMXEXT(a, t, dst)
-#define MMABS(a, z)     MMABS_MMXEXT(a, z)
-DCT_SAD_FUNC(mmxext)
-#undef HSUM
-#undef DCT_SAD
-
-#define DCT_SAD         DCT_SAD_SSE2
-#define HSUM(a, t, dst) HSUM_SSE2(a, t, dst)
-DCT_SAD_FUNC(sse2)
-#undef MMABS
-
-#if HAVE_SSSE3_INLINE
-#define MMABS(a, z)     MMABS_SSSE3(a, z)
-DCT_SAD_FUNC(ssse3)
-#undef MMABS
-#endif
-#undef HSUM
-#undef DCT_SAD
-
 static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2,
                                  int size)
 {
@@ -1012,8 +904,6 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
             c->fdct = ff_fdct_mmx;
 
         c->diff_bytes      = diff_bytes_mmx;
-        c->sum_abs_dctelem = sum_abs_dctelem_mmx;
-
         c->sse[0]  = sse16_mmx;
         c->sse[1]  = sse8_mmx;
         c->vsad[4] = vsad_intra16_mmx;
@@ -1041,7 +931,6 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
             (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX))
             c->fdct = ff_fdct_mmxext;
 
-        c->sum_abs_dctelem = sum_abs_dctelem_mmxext;
         c->vsad[4]         = vsad_intra16_mmxext;
 
         if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
@@ -1055,8 +944,6 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
         if (!high_bit_depth &&
             (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX))
             c->fdct = ff_fdct_sse2;
-
-        c->sum_abs_dctelem = sum_abs_dctelem_sse2;
     }
 
 #if HAVE_SSSE3_INLINE
@@ -1065,7 +952,6 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
             c->try_8x8basis = try_8x8basis_ssse3;
         }
         c->add_8x8basis    = add_8x8basis_ssse3;
-        c->sum_abs_dctelem = sum_abs_dctelem_ssse3;
     }
 #endif
 #endif /* HAVE_INLINE_ASM */
@@ -1073,15 +959,18 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
     if (EXTERNAL_MMX(cpu_flags)) {
         c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
         c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
+        c->sum_abs_dctelem   = ff_sum_abs_dctelem_mmx;
     }
 
     if (EXTERNAL_MMXEXT(cpu_flags)) {
         c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
         c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
+        c->sum_abs_dctelem   = ff_sum_abs_dctelem_mmxext;
     }
 
     if (EXTERNAL_SSE2(cpu_flags)) {
         c->sse[0] = ff_sse16_sse2;
+        c->sum_abs_dctelem = ff_sum_abs_dctelem_sse2;
 
 #if HAVE_ALIGNED_STACK
         c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
@@ -1089,9 +978,12 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
 #endif
     }
 
-    if (EXTERNAL_SSSE3(cpu_flags) && HAVE_ALIGNED_STACK) {
+    if (EXTERNAL_SSSE3(cpu_flags)) {
+        c->sum_abs_dctelem   = ff_sum_abs_dctelem_ssse3;
+#if HAVE_ALIGNED_STACK
         c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
         c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
+#endif
     }
 
     ff_dsputil_init_pix_mmx(c, avctx);
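
Two observations on the ported template (commentary, not part of the commit text): each INIT_*/SUM_ABS_DCTELEM pair instantiates one public symbol (ff_sum_abs_dctelem_mmx, _mmxext, _sse2, _ssse3, matching the new C prototypes), and the mmsize == 8 branches add the byte-offset 8 and 72 passes that the 64-bit MMX registers need to cover the full 128-byte block. The accumulation still uses paddusw, so the caveat in the removed FIXME comment carries over: per-lane partial sums saturate at 65535, while an extreme 8x8 block can reach roughly 100k. A small hypothetical demonstration of that ceiling:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    int16_t block[64];
    unsigned exact = 0;

    for (int i = 0; i < 64; i++)
        block[i] = (i & 1) ? -2000 : 2000;  /* extreme but valid coefficients */
    for (int i = 0; i < 64; i++)
        exact += block[i] < 0 ? -block[i] : block[i];

    /* exact == 128000, past the 16-bit ceiling: a paddusw-style saturating
     * accumulator tops out at 65535, so no 16-bit result can represent such
     * a block exactly; hence the FIXME's "saturates at 64k" note. */
    printf("exact = %u, masked to 16 bits = %u\n", exact, exact & 0xFFFF);
    return 0;
}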