diff options
author | Nick Kurshev <nickols_k@mail.ru> | 2001-07-30 09:04:34 +0000 |
---|---|---|
committer | Nick Kurshev <nickols_k@mail.ru> | 2001-07-30 09:04:34 +0000 |
commit | a9b3f63084e59715da3a1dd5b365a2638aa54a1e (patch) | |
tree | 8ef3b4c9b514609ef68105dfcdf0e0ad0d519791 /libavcodec/i386 | |
parent | 3bb4e23a8aa15fc93b91b92d8c6437358fd71113 (diff) | |
download | ffmpeg-a9b3f63084e59715da3a1dd5b365a2638aa54a1e.tar.gz |
Sync with mplayer's stuff
Originally committed as revision 14 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/i386')
-rw-r--r-- | libavcodec/i386/dsputil_mmx.c | 32 | ||||
-rw-r--r-- | libavcodec/i386/dsputil_mmx_avg.h | 2 | ||||
-rw-r--r-- | libavcodec/i386/mpegvideo.c | 239 |
3 files changed, 257 insertions, 16 deletions
diff --git a/libavcodec/i386/dsputil_mmx.c b/libavcodec/i386/dsputil_mmx.c index 18cb03e0e8..d4a07c3a77 100644 --- a/libavcodec/i386/dsputil_mmx.c +++ b/libavcodec/i386/dsputil_mmx.c @@ -30,8 +30,10 @@ int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); /* pixel operations */ -static const unsigned short mm_wone[4] __attribute__ ((aligned(8))) = { 0x1, 0x1, 0x1, 0x1 }; -static const unsigned short mm_wtwo[4] __attribute__ ((aligned(8))) = { 0x2, 0x2, 0x2, 0x2 }; +static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001; +static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002; +//static const unsigned short mm_wone[4] __attribute__ ((aligned(8))) = { 0x1, 0x1, 0x1, 0x1 }; +//static const unsigned short mm_wtwo[4] __attribute__ ((aligned(8))) = { 0x2, 0x2, 0x2, 0x2 }; /***********************************/ /* 3Dnow specific */ @@ -215,7 +217,7 @@ static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, __asm __volatile( "pxor %%mm7, %%mm7\n\t" "movq %0, %%mm4\n\t" - ::"m"(mm_wone[0]):"memory"); + ::"m"(mm_wone):"memory"); do { __asm __volatile( "movq %1, %%mm0\n\t" @@ -250,7 +252,7 @@ static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, __asm __volatile( "pxor %%mm7, %%mm7\n\t" "movq %0, %%mm4\n\t" - ::"m"(mm_wone[0]):"memory"); + ::"m"(mm_wone):"memory"); do { __asm __volatile( "movq %1, %%mm0\n\t" @@ -287,7 +289,7 @@ static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, __asm __volatile( "pxor %%mm7, %%mm7\n\t" "movq %0, %%mm6\n\t" - ::"m"(mm_wtwo[0]):"memory"); + ::"m"(mm_wtwo):"memory"); do { __asm __volatile( "movq %1, %%mm0\n\t" @@ -399,7 +401,7 @@ static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int __asm __volatile( "pxor %%mm7, %%mm7\n\t" "movq %0, %%mm6\n\t" - ::"m"(mm_wone[0]):"memory"); + 
::"m"(mm_wone):"memory"); do { __asm __volatile( "movq %1, %%mm0\n\t" @@ -448,7 +450,7 @@ static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int __asm __volatile( "pxor %%mm7, %%mm7\n\t" "movq %0, %%mm6\n\t" - ::"m"(mm_wone[0]):"memory"); + ::"m"(mm_wone):"memory"); do { __asm __volatile( "movq %0, %%mm0\n\t" @@ -485,7 +487,7 @@ static void avg_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_si __asm __volatile( "pxor %%mm7, %%mm7\n\t" "movq %0, %%mm6\n\t" - ::"m"(mm_wone[0]):"memory"); + ::"m"(mm_wone):"memory"); do { __asm __volatile( "movq %1, %%mm1\n\t" @@ -531,7 +533,7 @@ static void avg_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_si __asm __volatile( "pxor %%mm7, %%mm7\n\t" "movq %0, %%mm6\n\t" - ::"m"(mm_wone[0]):"memory"); + ::"m"(mm_wone):"memory"); do { __asm __volatile( "movq %1, %%mm1\n\t" @@ -577,7 +579,7 @@ static void avg_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_s __asm __volatile( "pxor %%mm7, %%mm7\n\t" "movq %0, %%mm6\n\t" - ::"m"(mm_wtwo[0]):"memory"); + ::"m"(mm_wtwo):"memory"); do { __asm __volatile( "movq %1, %%mm0\n\t" @@ -621,7 +623,7 @@ static void avg_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_s "movq %%mm0, %0\n\t" :"=m"(*p) :"m"(*pix), - "m"(*(pix+line_size)), "m"(mm_wone[0]) + "m"(*(pix+line_size)), "m"(mm_wone) :"memory"); pix += line_size; p += line_size ; @@ -748,7 +750,7 @@ static void avg_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int __asm __volatile( "pxor %%mm7, %%mm7\n\t" "movq %0, %%mm6\n\t" - ::"m"(mm_wone[0]):"memory"); + ::"m"(mm_wone):"memory"); do { __asm __volatile( "movq %1, %%mm0\n\t" @@ -832,7 +834,7 @@ static void sub_pixels_x2_mmx( DCTELEM *block, const UINT8 *pixels, int line_si __asm __volatile( "pxor %%mm7, %%mm7\n\t" "movq %0, %%mm6" - ::"m"(mm_wone[0]):"memory"); + ::"m"(mm_wone):"memory"); do { __asm __volatile( "movq %0, %%mm0\n\t" @@ -872,7 +874,7 @@ static void sub_pixels_y2_mmx( DCTELEM *block, const 
UINT8 *pixels, int line_si __asm __volatile( "pxor %%mm7, %%mm7\n\t" "movq %0, %%mm6" - ::"m"(mm_wone[0]):"memory"); + ::"m"(mm_wone):"memory"); do { __asm __volatile( "movq %0, %%mm0\n\t" @@ -912,7 +914,7 @@ static void sub_pixels_xy2_mmx( DCTELEM *block, const UINT8 *pixels, int line __asm __volatile( "pxor %%mm7, %%mm7\n\t" "movq %0, %%mm6\n\t" - ::"m"(mm_wtwo[0]):"memory"); + ::"m"(mm_wtwo):"memory"); do { __asm __volatile( "movq %1, %%mm0\n\t" diff --git a/libavcodec/i386/dsputil_mmx_avg.h b/libavcodec/i386/dsputil_mmx_avg.h index 99e806a714..5cd640f713 100644 --- a/libavcodec/i386/dsputil_mmx_avg.h +++ b/libavcodec/i386/dsputil_mmx_avg.h @@ -243,7 +243,7 @@ static void DEF(avg_pixels_xy2)( UINT8 *block, const UINT8 *pixels, int line_si __asm __volatile( "pxor %%mm7, %%mm7\n\t" "movq %0, %%mm6\n\t" - ::"m"(mm_wtwo[0]):"memory"); + ::"m"(mm_wtwo):"memory"); do { __asm __volatile( "movq %1, %%mm0\n\t" diff --git a/libavcodec/i386/mpegvideo.c b/libavcodec/i386/mpegvideo.c new file mode 100644 index 0000000000..d9e5e5c114 --- /dev/null +++ b/libavcodec/i386/mpegvideo.c @@ -0,0 +1,239 @@ +/* + * The simplest mpeg encoder (well, it was the simplest!) + * Copyright (c) 2000,2001 Gerard Lantau. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + * Optimized for ia32 cpus by Nick Kurshev <nickols_k@mail.ru> + */ + +void MPV_frame_start(MpegEncContext *s) +{ + if (s->pict_type == B_TYPE) { + __asm __volatile( + "movl (%1), %%eax\n\t" + "movl 4(%1), %%edx\n\t" + "movl 8(%1), %%ecx\n\t" + "movl %%eax, (%0)\n\t" + "movl %%edx, 4(%0)\n\t" + "movl %%ecx, 8(%0)\n\t" + : + :"r"(s->current_picture), "r"(s->aux_picture) + :"eax","edx","ecx","memory"); + } else { + /* swap next and last */ + __asm __volatile( + "movl (%1), %%eax\n\t" + "movl 4(%1), %%edx\n\t" + "movl 8(%1), %%ecx\n\t" + "xchgl (%0), %%eax\n\t" + "xchgl 4(%0), %%edx\n\t" + "xchgl 8(%0), %%ecx\n\t" + "movl %%eax, (%1)\n\t" + "movl %%edx, 4(%1)\n\t" + "movl %%ecx, 8(%1)\n\t" + "movl %%eax, (%2)\n\t" + "movl %%edx, 4(%2)\n\t" + "movl %%ecx, 8(%2)\n\t" + : + :"r"(s->last_picture), "r"(s->next_picture), "r"(s->current_picture) + :"eax","edx","ecx","memory"); + } +} + +static void dct_unquantize(MpegEncContext *s, DCTELEM *block, int n, int qscale); + +#ifdef HAVE_MMX +static const unsigned long long int mm_wabs __attribute__ ((aligned(8))) = 0xffffffffffffffffULL; +static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL; + +/* + NK: + Note: looking at PARANOID: + "enable all paranoid tests for rounding, overflows, etc..." + +#ifdef PARANOID + if (level < -2048 || level > 2047) + fprintf(stderr, "unquant error %d %d\n", i, level); +#endif + We can assume that the result of the two multiplications can't be greater than 0xFFFF + i.e. is 16-bit, so we use only the PMULLW instruction here and can avoid + a complex multiplication.
+===================================================== + Full formula for multiplication of 2 integer numbers + which are represented as high:low words: + input: value1 = high1:low1 + value2 = high2:low2 + output: value3 = value1*value2 + value3=high3:low3 (on overflow: modulus 2^32 wrap-around) + this means that for 0x123456 * 0x123456 the correct result is 0x14b66cb0ce4 + but this algorithm will compute only 0x66cb0ce4 + this is limited by the 16-bit size of the operands + --------------------------------- + tlow1 = high1*low2 + tlow2 = high2*low1 + tlow1 = tlow1 + tlow2 + high3:low3 = low1*low2 + high3 += tlow1 +*/ +#ifdef BIN_PORTABILITY +static void dct_unquantize_mmx +#else +#define HAVE_DCT_UNQUANTIZE 1 +static void dct_unquantize +#endif +(MpegEncContext *s,DCTELEM *block, int n, int qscale) +{ + int i, level; + const UINT16 *quant_matrix; + if (s->mb_intra) { + if (n < 4) + block[0] = block[0] * s->y_dc_scale; + else + block[0] = block[0] * s->c_dc_scale; + if (s->out_format == FMT_H263) { + i = 1; + goto unquant_even; + } + /* XXX: only mpeg1 */ + quant_matrix = s->intra_matrix; + i=1; + /* Align on 4 elements boundary */ + while(i&3) + { + level = block[i]; + if (level) { + if (level < 0) level = -level; + level = (int)(level * qscale * quant_matrix[i]) >> 3; + level = (level - 1) | 1; + if (block[i] < 0) level = -level; + block[i] = level; + } + i++; + } + __asm __volatile( + "movd %0, %%mm6\n\t" /* mm6 = qscale | 0 */ + "punpckldq %%mm6, %%mm6\n\t" /* mm6 = qscale | qscale */ + "movq %2, %%mm4\n\t" + "movq %%mm6, %%mm7\n\t" + "movq %1, %%mm5\n\t" + "packssdw %%mm6, %%mm7\n\t" /* mm7 = qscale | qscale | qscale | qscale */ + "pxor %%mm6, %%mm6\n\t" + ::"g"(qscale),"m"(mm_wone),"m"(mm_wabs):"memory"); + for(;i<64;i+=4) { + __asm __volatile( + "movq %1, %%mm0\n\t" + "movq %%mm7, %%mm1\n\t" + "movq %%mm0, %%mm2\n\t" + "movq %%mm0, %%mm3\n\t" + "pcmpgtw %%mm6, %%mm2\n\t" + "pmullw %2, %%mm1\n\t" + "pandn %%mm4, %%mm2\n\t" + "por %%mm5, %%mm2\n\t" + "pmullw %%mm2, %%mm0\n\t" /* 
mm0 = abs(block[i]). */ + + "pcmpeqw %%mm6, %%mm3\n\t" + "pmullw %%mm0, %%mm1\n\t" + "psraw $3, %%mm1\n\t" + "psubw %%mm5, %%mm1\n\t" /* block[i] --; */ + "pandn %%mm4, %%mm3\n\t" /* fake of pcmpneqw : mm0 != 0 then mm1 = -1 */ + "por %%mm5, %%mm1\n\t" /* block[i] |= 1 */ + "pmullw %%mm2, %%mm1\n\t" /* change signs again */ + + "pand %%mm3, %%mm1\n\t" /* nullify if was zero */ + "movq %%mm1, %0" + :"=m"(block[i]) + :"m"(block[i]), "m"(quant_matrix[i]) + :"memory"); + } + } else { + i = 0; + unquant_even: + quant_matrix = s->non_intra_matrix; + /* Align on 4 elements boundary */ + while(i&3) + { + level = block[i]; + if (level) { + if (level < 0) level = -level; + level = (((level << 1) + 1) * qscale * + ((int) quant_matrix[i])) >> 4; + level = (level - 1) | 1; + if(block[i] < 0) level = -level; + block[i] = level; + } + i++; + } + __asm __volatile( + "movd %0, %%mm6\n\t" /* mm6 = qscale | 0 */ + "punpckldq %%mm6, %%mm6\n\t" /* mm6 = qscale | qscale */ + "movq %2, %%mm4\n\t" + "movq %%mm6, %%mm7\n\t" + "movq %1, %%mm5\n\t" + "packssdw %%mm6, %%mm7\n\t" /* mm7 = qscale | qscale | qscale | qscale */ + "pxor %%mm6, %%mm6\n\t" + ::"g"(qscale),"m"(mm_wone),"m"(mm_wabs):"memory"); + for(;i<64;i+=4) { + __asm __volatile( + "movq %1, %%mm0\n\t" + "movq %%mm7, %%mm1\n\t" + "movq %%mm0, %%mm2\n\t" + "movq %%mm0, %%mm3\n\t" + "pcmpgtw %%mm6, %%mm2\n\t" + "pmullw %2, %%mm1\n\t" + "pandn %%mm4, %%mm2\n\t" + "por %%mm5, %%mm2\n\t" + "pmullw %%mm2, %%mm0\n\t" /* mm0 = abs(block[i]). 
*/ + "psllw $1, %%mm0\n\t" /* block[i] <<= 1 */ + "paddw %%mm5, %%mm0\n\t" /* block[i] ++ */ + + "pmullw %%mm0, %%mm1\n\t" + "psraw $4, %%mm1\n\t" + "pcmpeqw %%mm6, %%mm3\n\t" + "psubw %%mm5, %%mm1\n\t" /* block[i] --; */ + "pandn %%mm4, %%mm3\n\t" /* fake of pcmpneqw : mm0 != 0 then mm1 = -1 */ + "por %%mm5, %%mm1\n\t" /* block[i] |= 1 */ + "pmullw %%mm2, %%mm1\n\t" /* change signs again */ + + "pand %%mm3, %%mm1\n\t" /* nullify if was zero */ + "movq %%mm1, %0" + :"=m"(block[i]) + :"m"(block[i]), "m"(quant_matrix[i]) + :"memory"); + } + } +} + +#ifdef BIN_PORTABILITY +static void (*dct_unquantize_ptr)(MpegEncContext *s, + DCTELEM *block, int n, int qscale); + +void MPV_common_init_mmx(void) +{ + int mm_flags; + mm_flags = mm_support(); + if (mm_flags & MM_MMX) { + dct_unquantize_ptr = dct_unquantize_mmx; + } + else { + dct_unquantize_ptr = dct_unquantize; + } +} + +#define DCT_UNQUANTIZE(a,b,c,d) (*dct_unquantize_ptr)(a,b,c,d) +#else +#define DCT_UNQUANTIZE(a,b,c,d) dct_unquantize(a,b,c,d) +#endif /* BIN_PORTABILITY */ +#endif /* HAVE_MMX */ |