diff options
author | Michael Niedermayer <michaelni@gmx.at> | 2013-01-21 16:34:24 +0100 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2013-01-21 16:34:37 +0100 |
commit | f6a80d6e6318e929d22738c0f0ead30bb3ea28da (patch) | |
tree | bd148fe9f2b84dff03f2ff814044b4d8f8ac3682 | |
parent | cf4515ecd985aab899f24a6b7d18432001e670fa (diff) | |
parent | 9f00b1cbababa08dd220dbc0c74286a4707be746 (diff) | |
download | ffmpeg-f6a80d6e6318e929d22738c0f0ead30bb3ea28da.tar.gz |
Merge remote-tracking branch 'qatar/master'
* qatar/master:
dsputilenc: x86: Convert pixel inline asm to yasm
libgsm: detect libgsm header path
Merged-by: Michael Niedermayer <michaelni@gmx.at>
-rwxr-xr-x | configure | 5 | ||||
-rw-r--r-- | libavcodec/libgsm.c | 5 | ||||
-rw-r--r-- | libavcodec/x86/dsputilenc.asm | 152 | ||||
-rw-r--r-- | libavcodec/x86/dsputilenc_mmx.c | 201 |
4 files changed, 181 insertions, 182 deletions
@@ -1381,6 +1381,7 @@ HAVE_LIST=" gettimeofday glob gnu_as + gsm_h ibm_asm inet_aton io_h @@ -3839,7 +3840,9 @@ enabled libfdk_aac && require libfdk_aac fdk-aac/aacenc_lib.h aacEncOpen -lfdk- flite_libs="-lflite_cmu_time_awb -lflite_cmu_us_awb -lflite_cmu_us_kal -lflite_cmu_us_kal16 -lflite_cmu_us_rms -lflite_cmu_us_slt -lflite_usenglish -lflite_cmulex -lflite" enabled libflite && require2 libflite "flite/flite.h" flite_init $flite_libs enabled libfreetype && require_pkg_config freetype2 "ft2build.h freetype/freetype.h" FT_Init_FreeType -enabled libgsm && require libgsm gsm/gsm.h gsm_create -lgsm +enabled libgsm && { for gsm_hdr in "gsm.h" "gsm/gsm.h"; do + check_lib "${gsm_hdr}" gsm_create -lgsm && break; + done || die "ERROR: libgsm not found"; } enabled libilbc && require libilbc ilbc.h WebRtcIlbcfix_InitDecode -lilbc enabled libmodplug && require libmodplug libmodplug/modplug.h ModPlug_Load -lmodplug enabled libmp3lame && require "libmp3lame >= 3.98.3" lame/lame.h lame_set_VBR_quality -lmp3lame diff --git a/libavcodec/libgsm.c b/libavcodec/libgsm.c index f3cfb91f17..fd476e02e2 100644 --- a/libavcodec/libgsm.c +++ b/libavcodec/libgsm.c @@ -27,7 +27,12 @@ // The idiosyncrasies of GSM-in-WAV are explained at http://kbs.cs.tu-berlin.de/~jutta/toast.html +#include "config.h" +#if HAVE_GSM_H +#include <gsm.h> +#else #include <gsm/gsm.h> +#endif #include "libavutil/channel_layout.h" #include "libavutil/common.h" diff --git a/libavcodec/x86/dsputilenc.asm b/libavcodec/x86/dsputilenc.asm index b1dc99e2a2..55eacf5b96 100644 --- a/libavcodec/x86/dsputilenc.asm +++ b/libavcodec/x86/dsputilenc.asm @@ -333,3 +333,155 @@ cglobal sse16, 5, 5, 8 paddd m7, m1 movd eax, m7 ; return value RET + +INIT_MMX mmx +; get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size) +cglobal get_pixels, 3,4 + movsxdifnidn r2, r2d + add r0, 128 + mov r3, -128 + pxor m7, m7 +.loop: + mova m0, [r1] + mova m2, [r1+r2] + mova m1, m0 + mova m3, m2 + punpcklbw m0, m7 + punpckhbw m1, m7 + punpcklbw m2, m7 + punpckhbw m3, m7 + mova [r0+r3+ 0], m0 + mova [r0+r3+ 8], m1 + mova [r0+r3+16], m2 + mova [r0+r3+24], m3 + lea r1, [r1+r2*2] + add r3, 32 + js .loop + REP_RET + +INIT_XMM sse2 +cglobal get_pixels, 3, 4 + movsxdifnidn r2, r2d + lea r3, [r2*3] + pxor m4, m4 + movh m0, [r1] + movh m1, [r1+r2] + movh m2, [r1+r2*2] + movh m3, [r1+r3] + lea r1, [r1+r2*4] + punpcklbw m0, m4 + punpcklbw m1, m4 + punpcklbw m2, m4 + punpcklbw m3, m4 + mova [r0], m0 + mova [r0+0x10], m1 + mova [r0+0x20], m2 + mova [r0+0x30], m3 + movh m0, [r1] + movh m1, [r1+r2*1] + movh m2, [r1+r2*2] + movh m3, [r1+r3] + punpcklbw m0, m4 + punpcklbw m1, m4 + punpcklbw m2, m4 + punpcklbw m3, m4 + mova [r0+0x40], m0 + mova [r0+0x50], m1 + mova [r0+0x60], m2 + mova [r0+0x70], m3 + RET + +INIT_MMX mmx +; diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const unint8_t *s2, stride) +cglobal diff_pixels, 4,5 + movsxdifnidn r3, r3d + pxor m7, m7 + add r0, 128 + mov r4, -128 +.loop: + mova m0, [r1] + mova m2, [r2] + mova m1, m0 + mova m3, m2 + punpcklbw m0, m7 + punpckhbw m1, m7 + punpcklbw m2, m7 + punpckhbw m3, m7 + psubw m0, m2 + psubw m1, m3 + mova [r0+r4+0], m0 + mova [r0+r4+8], m1 + add r1, r3 + add r2, r3 + add r4, 16 + jne .loop + REP_RET + +INIT_MMX mmx +; pix_sum16_mmx(uint8_t * pix, int line_size) +cglobal pix_sum16, 2, 3 + movsxdifnidn r1, r1d + mov r2, r1 + neg r2 + shl r2, 4 + sub r0, r2 + pxor m7, m7 + pxor m6, m6 +.loop: + mova m0, [r0+r2+0] + mova m1, [r0+r2+0] + mova m2, [r0+r2+8] + mova m3, [r0+r2+8] + punpcklbw m0, m7 + punpckhbw m1, m7 + punpcklbw m2, m7 + punpckhbw m3, m7 + paddw m1, m0 + paddw m3, m2 + paddw m3, m1 + paddw m6, m3 + add r2, r1 + js .loop + mova m5, m6 + psrlq m6, 32 + paddw m6, m5 + mova m5, m6 + psrlq m6, 16 + paddw m6, m5 + movd eax, m6 + and eax, 0xffff + RET + +INIT_MMX mmx +; pix_norm1_mmx(uint8_t *pix, int line_size) +cglobal pix_norm1, 2, 4 + movsxdifnidn r1, r1d + mov r2, 16 + pxor m0, m0 + pxor m7, m7 +.loop: + mova m2, [r0+0] + mova m3, [r0+8] + mova m1, m2 + punpckhbw m1, m0 + punpcklbw m2, m0 + mova m4, m3 + punpckhbw m3, m0 + punpcklbw m4, m0 + pmaddwd m1, m1 + pmaddwd m2, m2 + pmaddwd m3, m3 + pmaddwd m4, m4 + paddd m2, m1 + paddd m4, m3 + paddd m7, m2 + add r0, r1 + paddd m7, m4 + dec r2 + jne .loop + mova m1, m7 + psrlq m7, 32 + paddd m1, m7 + movd eax, m1 + RET + diff --git a/libavcodec/x86/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c index 48fcce477d..8a50a26ce2 100644 --- a/libavcodec/x86/dsputilenc_mmx.c +++ b/libavcodec/x86/dsputilenc_mmx.c @@ -30,181 +30,14 @@ #include "libavcodec/mathops.h" #include "dsputil_mmx.h" +void ff_get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size); +void ff_get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size); +void ff_diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride); +int ff_pix_sum16_mmx(uint8_t * pix, int line_size); +int ff_pix_norm1_mmx(uint8_t *pix, int line_size); #if HAVE_INLINE_ASM -static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size) -{ - __asm__ volatile( - "mov $-128, %%"REG_a" \n\t" - "pxor %%mm7, %%mm7 \n\t" - ".p2align 4 \n\t" - "1: \n\t" - "movq (%0), %%mm0 \n\t" - "movq (%0, %2), %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "movq %%mm0, (%1, %%"REG_a") \n\t" - "movq %%mm1, 8(%1, %%"REG_a") \n\t" - "movq %%mm2, 16(%1, %%"REG_a") \n\t" - "movq %%mm3, 24(%1, %%"REG_a") \n\t" - "add %3, %0 \n\t" - "add $32, %%"REG_a" \n\t" - "js 1b \n\t" - : "+r" (pixels) - : "r" (block+64), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*2) - : "%"REG_a - ); -} - -static void get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size) -{ - __asm__ volatile( - "pxor %%xmm4, %%xmm4 \n\t" - "movq (%0), %%xmm0 \n\t" - "movq (%0, %2), %%xmm1 \n\t" - "movq (%0, %2,2), %%xmm2 \n\t" - "movq (%0, %3), %%xmm3 \n\t" - "lea (%0,%2,4), %0 \n\t" - "punpcklbw %%xmm4, %%xmm0 \n\t" - "punpcklbw %%xmm4, %%xmm1 \n\t" - "punpcklbw %%xmm4, %%xmm2 \n\t" - "punpcklbw %%xmm4, %%xmm3 \n\t" - "movdqa %%xmm0, (%1) \n\t" - "movdqa %%xmm1, 16(%1) \n\t" - "movdqa %%xmm2, 32(%1) \n\t" - "movdqa %%xmm3, 48(%1) \n\t" - "movq (%0), %%xmm0 \n\t" - "movq (%0, %2), %%xmm1 \n\t" - "movq (%0, %2,2), %%xmm2 \n\t" - "movq (%0, %3), %%xmm3 \n\t" - "punpcklbw %%xmm4, %%xmm0 \n\t" - "punpcklbw %%xmm4, %%xmm1 \n\t" - "punpcklbw %%xmm4, %%xmm2 \n\t" - "punpcklbw %%xmm4, %%xmm3 \n\t" - "movdqa %%xmm0, 64(%1) \n\t" - "movdqa %%xmm1, 80(%1) \n\t" - "movdqa %%xmm2, 96(%1) \n\t" - "movdqa %%xmm3, 112(%1) \n\t" - : "+r" (pixels) - : "r" (block), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3) - ); -} - -static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride) -{ - __asm__ volatile( - "pxor %%mm7, %%mm7 \n\t" - "mov $-128, %%"REG_a" \n\t" - ".p2align 4 \n\t" - "1: \n\t" - "movq (%0), %%mm0 \n\t" - "movq (%1), %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "psubw %%mm2, %%mm0 \n\t" - "psubw %%mm3, %%mm1 \n\t" - "movq %%mm0, (%2, %%"REG_a") \n\t" - "movq %%mm1, 8(%2, %%"REG_a") \n\t" - "add %3, %0 \n\t" - "add %3, %1 \n\t" - "add $16, %%"REG_a" \n\t" - "jnz 1b \n\t" - : "+r" (s1), "+r" (s2) - : "r" (block+64), "r" ((x86_reg)stride) - : "%"REG_a - ); -} - -static int pix_sum16_mmx(uint8_t * pix, int line_size){ - const int h=16; - int sum; - x86_reg index= -line_size*h; - - __asm__ volatile( - "pxor %%mm7, %%mm7 \n\t" - "pxor %%mm6, %%mm6 \n\t" - "1: \n\t" - "movq (%2, %1), %%mm0 \n\t" - "movq (%2, %1), %%mm1 \n\t" - "movq 8(%2, %1), %%mm2 \n\t" - "movq 8(%2, %1), %%mm3 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "paddw %%mm0, %%mm1 \n\t" - "paddw %%mm2, %%mm3 \n\t" - "paddw %%mm1, %%mm3 \n\t" - "paddw %%mm3, %%mm6 \n\t" - "add %3, %1 \n\t" - " js 1b \n\t" - "movq %%mm6, %%mm5 \n\t" - "psrlq $32, %%mm6 \n\t" - "paddw %%mm5, %%mm6 \n\t" - "movq %%mm6, %%mm5 \n\t" - "psrlq $16, %%mm6 \n\t" - "paddw %%mm5, %%mm6 \n\t" - "movd %%mm6, %0 \n\t" - "andl $0xFFFF, %0 \n\t" - : "=&r" (sum), "+r" (index) - : "r" (pix - index), "r" ((x86_reg)line_size) - ); - - return sum; -} - -static int pix_norm1_mmx(uint8_t *pix, int line_size) { - int tmp; - __asm__ volatile ( - "movl $16,%%ecx\n" - "pxor %%mm0,%%mm0\n" - "pxor %%mm7,%%mm7\n" - "1:\n" - "movq (%0),%%mm2\n" /* mm2 = pix[0-7] */ - "movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */ - - "movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */ - - "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */ - "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */ - - "movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[8-15] */ - "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */ - "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */ - - "pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */ - "pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */ - - "pmaddwd %%mm3,%%mm3\n" - "pmaddwd %%mm4,%%mm4\n" - - "paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2, - pix2^2+pix3^2+pix6^2+pix7^2) */ - "paddd %%mm3,%%mm4\n" - "paddd %%mm2,%%mm7\n" - - "add %2, %0\n" - "paddd %%mm4,%%mm7\n" - "dec %%ecx\n" - "jnz 1b\n" - - "movq %%mm7,%%mm1\n" - "psrlq $32, %%mm7\n" /* shift hi dword to lo */ - "paddd %%mm7,%%mm1\n" - "movd %%mm1,%1\n" - : "+r" (pix), "=r"(tmp) : "r" ((x86_reg)line_size) : "%ecx" ); - return tmp; -} - static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { int tmp; __asm__ volatile ( @@ -1112,10 +945,23 @@ hadamard_func(ssse3) void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx) { int mm_flags = av_get_cpu_flags(); - -#if HAVE_INLINE_ASM int bit_depth = avctx->bits_per_raw_sample; +#if HAVE_YASM + if (EXTERNAL_MMX(mm_flags)) { + if (bit_depth <= 8) + c->get_pixels = ff_get_pixels_mmx; + c->diff_pixels = ff_diff_pixels_mmx; + c->pix_sum = ff_pix_sum16_mmx; + + c->pix_norm1 = ff_pix_norm1_mmx; + } + if (EXTERNAL_SSE2(mm_flags)) + if (bit_depth <= 8) + c->get_pixels = ff_get_pixels_sse2; +#endif /* HAVE_YASM */ + +#if HAVE_INLINE_ASM if (mm_flags & AV_CPU_FLAG_MMX) { const int dct_algo = avctx->dct_algo; if (avctx->bits_per_raw_sample <= 8 && @@ -1129,15 +975,10 @@ void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx) } } - if (bit_depth <= 8) - c->get_pixels = get_pixels_mmx; - c->diff_pixels = diff_pixels_mmx; - c->pix_sum = pix_sum16_mmx; c->diff_bytes= diff_bytes_mmx; c->sum_abs_dctelem= sum_abs_dctelem_mmx; - c->pix_norm1 = pix_norm1_mmx; c->sse[0] = sse16_mmx; c->sse[1] = sse8_mmx; c->vsad[4]= vsad_intra16_mmx; @@ -1167,8 +1008,6 @@ void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx) } if(mm_flags & AV_CPU_FLAG_SSE2){ - if (bit_depth <= 8) - c->get_pixels = get_pixels_sse2; c->sum_abs_dctelem= sum_abs_dctelem_sse2; } |