diff options
author:    Timothy Gu <timothygu99@gmail.com>  2014-05-29 20:56:03 -0700
committer: Michael Niedermayer <michaelni@gmx.at>  2014-05-30 16:57:52 +0200
commit:    154cee9292840295bff07e31ebd92c8296d8aa2d (patch)
tree:      99f3d50f859d6086c38e9b899818b383f1c09938
parent:    0b6292b7b85ccc1437bddbd939b7cf1f8a964b17 (diff)
download:  ffmpeg-154cee9292840295bff07e31ebd92c8296d8aa2d.tar.gz
x86: dsputilenc: convert ff_sse{8, 16}_mmx() to yasm
Signed-off-by: Timothy Gu <timothygu99@gmail.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
 libavcodec/x86/dsputilenc.asm   |  48
 libavcodec/x86/dsputilenc_mmx.c | 145
 2 files changed, 48 insertions(+), 145 deletions(-)
diff --git a/libavcodec/x86/dsputilenc.asm b/libavcodec/x86/dsputilenc.asm index de36b95a30..0628550145 100644 --- a/libavcodec/x86/dsputilenc.asm +++ b/libavcodec/x86/dsputilenc.asm @@ -278,19 +278,27 @@ INIT_XMM ssse3 %define ABS_SUM_8x8 ABS_SUM_8x8_64 HADAMARD8_DIFF 9 -INIT_XMM sse2 -; int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, -; int line_size, int h); -cglobal sse16, 5, 5, 8 - shr r4d, 1 +; int ff_sse*_*(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, +; int line_size, int h) + +%macro SUM_SQUARED_ERRORS 1 +cglobal sse%1, 5,5,8, v, pix1, pix2, lsize, h +%if %1 == mmsize + shr hd, 1 +%endif pxor m0, m0 ; mm0 = 0 pxor m7, m7 ; mm7 holds the sum .next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned - movu m1, [r1 ] ; mm1 = pix1[0][0-15] - movu m2, [r2 ] ; mm2 = pix2[0][0-15] - movu m3, [r1+r3] ; mm3 = pix1[1][0-15] - movu m4, [r2+r3] ; mm4 = pix2[1][0-15] + movu m1, [pix1q] ; m1 = pix1[0][0-15], [0-7] for mmx + movu m2, [pix2q] ; m2 = pix2[0][0-15], [0-7] for mmx +%if %1 == mmsize + movu m3, [pix1q+lsizeq] ; m3 = pix1[1][0-15], [0-7] for mmx + movu m4, [pix2q+lsizeq] ; m4 = pix2[1][0-15], [0-7] for mmx +%else ; %1 / 2 == mmsize; mmx only + mova m3, [pix1q+8] ; m3 = pix1[0][8-15] + mova m4, [pix2q+8] ; m4 = pix2[0][8-15] +%endif ; todo: mm1-mm2, mm3-mm4 ; algo: subtract mm1 from mm2 with saturation and vice versa @@ -319,20 +327,34 @@ cglobal sse16, 5, 5, 8 pmaddwd m1, m1 pmaddwd m3, m3 - lea r1, [r1+r3*2] ; pix1 += 2*line_size - lea r2, [r2+r3*2] ; pix2 += 2*line_size - paddd m1, m2 paddd m3, m4 paddd m7, m1 paddd m7, m3 - dec r4 +%if %1 == mmsize + lea pix1q, [pix1q + 2*lsizeq] + lea pix2q, [pix2q + 2*lsizeq] +%else + add pix1q, lsizeq + add pix2q, lsizeq +%endif + dec hd jnz .next2lines HADDD m7, m1 movd eax, m7 ; return value RET +%endmacro + +INIT_MMX mmx +SUM_SQUARED_ERRORS 8 + +INIT_MMX mmx +SUM_SQUARED_ERRORS 16 + +INIT_XMM sse2 +SUM_SQUARED_ERRORS 16 INIT_MMX mmx ; void ff_get_pixels_mmx(int16_t *block, const 
uint8_t *pixels, int line_size) diff --git a/libavcodec/x86/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c index e12ebedfa6..13128d29ad 100644 --- a/libavcodec/x86/dsputilenc_mmx.c +++ b/libavcodec/x86/dsputilenc_mmx.c @@ -46,6 +46,10 @@ int ff_sum_abs_dctelem_mmx(int16_t *block); int ff_sum_abs_dctelem_mmxext(int16_t *block); int ff_sum_abs_dctelem_sse2(int16_t *block); int ff_sum_abs_dctelem_ssse3(int16_t *block); +int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + int line_size, int h); +int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + int line_size, int h); int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h); @@ -62,134 +66,7 @@ hadamard_func(ssse3) #if HAVE_INLINE_ASM -static int sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, - int line_size, int h) -{ - int tmp; - - __asm__ volatile ( - "movl %4, %%ecx \n" - "shr $1, %%ecx \n" - "pxor %%mm0, %%mm0 \n" /* mm0 = 0 */ - "pxor %%mm7, %%mm7 \n" /* mm7 holds the sum */ - "1: \n" - "movq (%0), %%mm1 \n" /* mm1 = pix1[0][0 - 7] */ - "movq (%1), %%mm2 \n" /* mm2 = pix2[0][0 - 7] */ - "movq (%0, %3), %%mm3 \n" /* mm3 = pix1[1][0 - 7] */ - "movq (%1, %3), %%mm4 \n" /* mm4 = pix2[1][0 - 7] */ - - /* todo: mm1-mm2, mm3-mm4 */ - /* algo: subtract mm1 from mm2 with saturation and vice versa */ - /* OR the results to get absolute difference */ - "movq %%mm1, %%mm5 \n" - "movq %%mm3, %%mm6 \n" - "psubusb %%mm2, %%mm1 \n" - "psubusb %%mm4, %%mm3 \n" - "psubusb %%mm5, %%mm2 \n" - "psubusb %%mm6, %%mm4 \n" - - "por %%mm1, %%mm2 \n" - "por %%mm3, %%mm4 \n" - - /* now convert to 16-bit vectors so we can square them */ - "movq %%mm2, %%mm1 \n" - "movq %%mm4, %%mm3 \n" - - "punpckhbw %%mm0, %%mm2 \n" - "punpckhbw %%mm0, %%mm4 \n" - "punpcklbw %%mm0, %%mm1 \n" /* mm1 now spread over (mm1, mm2) */ - "punpcklbw %%mm0, %%mm3 \n" /* mm4 now spread over (mm3, mm4) */ - - "pmaddwd %%mm2, %%mm2 \n" - "pmaddwd %%mm4, %%mm4 \n" - "pmaddwd %%mm1, %%mm1 \n" - 
"pmaddwd %%mm3, %%mm3 \n" - - "lea (%0, %3, 2), %0 \n" /* pix1 += 2 * line_size */ - "lea (%1, %3, 2), %1 \n" /* pix2 += 2 * line_size */ - - "paddd %%mm2, %%mm1 \n" - "paddd %%mm4, %%mm3 \n" - "paddd %%mm1, %%mm7 \n" - "paddd %%mm3, %%mm7 \n" - - "decl %%ecx \n" - "jnz 1b \n" - - "movq %%mm7, %%mm1 \n" - "psrlq $32, %%mm7 \n" /* shift hi dword to lo */ - "paddd %%mm7, %%mm1 \n" - "movd %%mm1, %2 \n" - : "+r" (pix1), "+r" (pix2), "=r" (tmp) - : "r" ((x86_reg) line_size), "m" (h) - : "%ecx"); - - return tmp; -} - -static int sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, - int line_size, int h) -{ - int tmp; - - __asm__ volatile ( - "movl %4, %%ecx\n" - "pxor %%mm0, %%mm0\n" /* mm0 = 0 */ - "pxor %%mm7, %%mm7\n" /* mm7 holds the sum */ - "1:\n" - "movq (%0), %%mm1\n" /* mm1 = pix1[0 - 7] */ - "movq (%1), %%mm2\n" /* mm2 = pix2[0 - 7] */ - "movq 8(%0), %%mm3\n" /* mm3 = pix1[8 - 15] */ - "movq 8(%1), %%mm4\n" /* mm4 = pix2[8 - 15] */ - - /* todo: mm1-mm2, mm3-mm4 */ - /* algo: subtract mm1 from mm2 with saturation and vice versa */ - /* OR the results to get absolute difference */ - "movq %%mm1, %%mm5\n" - "movq %%mm3, %%mm6\n" - "psubusb %%mm2, %%mm1\n" - "psubusb %%mm4, %%mm3\n" - "psubusb %%mm5, %%mm2\n" - "psubusb %%mm6, %%mm4\n" - - "por %%mm1, %%mm2\n" - "por %%mm3, %%mm4\n" - - /* now convert to 16-bit vectors so we can square them */ - "movq %%mm2, %%mm1\n" - "movq %%mm4, %%mm3\n" - - "punpckhbw %%mm0, %%mm2\n" - "punpckhbw %%mm0, %%mm4\n" - "punpcklbw %%mm0, %%mm1\n" /* mm1 now spread over (mm1, mm2) */ - "punpcklbw %%mm0, %%mm3\n" /* mm4 now spread over (mm3, mm4) */ - - "pmaddwd %%mm2, %%mm2\n" - "pmaddwd %%mm4, %%mm4\n" - "pmaddwd %%mm1, %%mm1\n" - "pmaddwd %%mm3, %%mm3\n" - - "add %3, %0\n" - "add %3, %1\n" - - "paddd %%mm2, %%mm1\n" - "paddd %%mm4, %%mm3\n" - "paddd %%mm1, %%mm7\n" - "paddd %%mm3, %%mm7\n" - - "decl %%ecx\n" - "jnz 1b\n" - - "movq %%mm7, %%mm1\n" - "psrlq $32, %%mm7\n" /* shift hi dword to lo */ - "paddd %%mm7, %%mm1\n" - 
"movd %%mm1, %2\n" - : "+r" (pix1), "+r" (pix2), "=r" (tmp) - : "r" ((x86_reg) line_size), "m" (h) - : "%ecx"); - - return tmp; -} +#if HAVE_YASM static int hf_noise8_mmx(uint8_t *pix1, int line_size, int h) { @@ -444,7 +321,7 @@ static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2, if (c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h); else - score1 = sse16_mmx(c, pix1, pix2, line_size, h); + score1 = ff_sse16_mmx(c, pix1, pix2, line_size, h); score2 = hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h); @@ -457,7 +334,7 @@ static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2, static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2, int line_size, int h) { - int score1 = sse8_mmx(c, pix1, pix2, line_size, h); + int score1 = ff_sse8_mmx(c, pix1, pix2, line_size, h); int score2 = hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h); @@ -467,6 +344,8 @@ static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2, return score1 + FFABS(score2) * 8; } +#endif /* HAVE_YASM */ + static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy, int line_size, int h) { @@ -808,12 +687,12 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx, (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX)) c->fdct = ff_fdct_mmx; - c->sse[0] = sse16_mmx; - c->sse[1] = sse8_mmx; c->vsad[4] = vsad_intra16_mmx; +#if HAVE_YASM c->nsse[0] = nsse16_mmx; c->nsse[1] = nsse8_mmx; +#endif /* HAVE_YASM */ if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { c->vsad[0] = vsad16_mmx; c->try_8x8basis = try_8x8basis_mmx; @@ -860,6 +739,8 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx, c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx; c->hadamard8_diff[1] = ff_hadamard8_diff_mmx; c->sum_abs_dctelem = ff_sum_abs_dctelem_mmx; + c->sse[0] = ff_sse16_mmx; + c->sse[1] = ff_sse8_mmx; } if (EXTERNAL_MMXEXT(cpu_flags)) { |