author | Michael Niedermayer <michaelni@gmx.at> | 2007-08-26 01:11:02 +0000 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2007-08-26 01:11:02 +0000 |
commit | 629750290f6122a72e68c34cf94f521a90def2ef (patch) | |
tree | 275364443b6b114e745eb4a11c54d5e424680d44 /libavcodec/i386/snowdsp_mmx.c | |
parent | b696a4c91d5f2b30310eb762beeb225292d94b44 (diff) | |
download | ffmpeg-629750290f6122a72e68c34cf94f521a90def2ef.tar.gz | |
Avoid overflow in the 3rd lifting step; this now needs MMX2 at minimum.
(A patch for plain MMX support is welcome ...)
Originally committed as revision 10226 to svn://svn.ffmpeg.org/ffmpeg/trunk
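For context, the step being vectorized is the liftS-style update whose boundary case appears verbatim in the diff below (`b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS)`). The following scalar sketch of the presumed interior form shows where a 16-bit SIMD lane can overflow; the helper name and the exact interior formula are inferred from that boundary case, not taken from the patch itself:

```c
#include <stdint.h>

typedef int16_t IDWTELEM;  /* 16-bit wavelet coefficients, as in snow */

/* Presumed interior form of the 3rd lifting step (only the boundary case
 * is quoted in the diff).  In C the sum widens to int, but in a 16-bit
 * SIMD lane even the partial sum ref[i] + ref[i+1] can already exceed the
 * int16_t range -- that is the overflow this commit avoids.  W_BO and
 * W_BS are the snow wavelet rounding offset and shift. */
static void lift3_scalar(IDWTELEM *b, const IDWTELEM *ref, int w,
                         int W_BO, int W_BS)
{
    for (int i = 0; i < w; i++)
        b[i] = b[i] + ((ref[i] + ref[i + 1] + W_BO + 4 * b[i]) >> W_BS);
}
```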
Diffstat (limited to 'libavcodec/i386/snowdsp_mmx.c')
-rw-r--r-- | libavcodec/i386/snowdsp_mmx.c | 105 |
1 file changed, 75 insertions(+), 30 deletions(-)
diff --git a/libavcodec/i386/snowdsp_mmx.c b/libavcodec/i386/snowdsp_mmx.c
index 6deaad2336..e7f4b2953b 100644
--- a/libavcodec/i386/snowdsp_mmx.c
+++ b/libavcodec/i386/snowdsp_mmx.c
@@ -111,22 +111,29 @@ void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, int width){
         i = 0;
         asm volatile(
-        "psllw $2, %%xmm7 \n\t"
+        "psllw $13, %%xmm7 \n\t"
+        "pcmpeqw %%xmm6, %%xmm6 \n\t"
+        "psrlw $13, %%xmm6 \n\t"
+        "paddw %%xmm7, %%xmm6 \n\t"
         ::);
         for(; i<w_l-15; i+=16){
             asm volatile(
-        "movdqu (%1), %%xmm1 \n\t"
-        "movdqu 16(%1), %%xmm5 \n\t"
-        "movdqu 2(%1), %%xmm0 \n\t"
-        "movdqu 18(%1), %%xmm4 \n\t" //FIXME try aligned reads and shifts
-        "paddw %%xmm1, %%xmm0 \n\t"
-        "paddw %%xmm5, %%xmm4 \n\t"
-        "paddw %%xmm7, %%xmm0 \n\t"
-        "paddw %%xmm7, %%xmm4 \n\t"
+        "movdqu (%1), %%xmm0 \n\t"
+        "movdqu 16(%1), %%xmm4 \n\t"
+        "movdqu 2(%1), %%xmm1 \n\t"
+        "movdqu 18(%1), %%xmm5 \n\t" //FIXME try aligned reads and shifts
+        "paddw %%xmm6, %%xmm0 \n\t"
+        "paddw %%xmm6, %%xmm4 \n\t"
+        "paddw %%xmm7, %%xmm1 \n\t"
+        "paddw %%xmm7, %%xmm5 \n\t"
+        "pavgw %%xmm1, %%xmm0 \n\t"
+        "pavgw %%xmm5, %%xmm4 \n\t"
+        "psubw %%xmm7, %%xmm0 \n\t"
+        "psubw %%xmm7, %%xmm4 \n\t"
+        "psraw $1, %%xmm0 \n\t"
+        "psraw $1, %%xmm4 \n\t"
         "movdqa (%0), %%xmm1 \n\t"
         "movdqa 16(%0), %%xmm5 \n\t"
-        "psraw $2, %%xmm0 \n\t"
-        "psraw $2, %%xmm4 \n\t"
         "paddw %%xmm1, %%xmm0 \n\t"
         "paddw %%xmm5, %%xmm4 \n\t"
         "psraw $2, %%xmm0 \n\t"
         "psraw $2, %%xmm4 \n\t"
@@ -288,18 +295,27 @@ void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, int width){
         i = 1;
         b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS);
         asm volatile(
-        "psllw $2, %%mm7 \n\t"
+        "psllw $13, %%mm7 \n\t"
+        "pcmpeqw %%mm6, %%mm6 \n\t"
+        "psrlw $13, %%mm6 \n\t"
+        "paddw %%mm7, %%mm6 \n\t"
         ::);
         for(; i<w_l-7; i+=8){
             asm volatile(
         "movq (%1), %%mm0 \n\t"
         "movq 8(%1), %%mm4 \n\t"
-        "paddw 2(%1), %%mm0 \n\t"
-        "paddw 10(%1), %%mm4 \n\t"
-        "paddw %%mm7, %%mm0 \n\t"
-        "paddw %%mm7, %%mm4 \n\t"
-        "psraw $2, %%mm0 \n\t"
-        "psraw $2, %%mm4 \n\t"
+        "movq 2(%1), %%mm1 \n\t"
+        "movq 10(%1), %%mm5 \n\t"
+        "paddw %%mm6, %%mm0 \n\t"
+        "paddw %%mm6, %%mm4 \n\t"
+        "paddw %%mm7, %%mm1 \n\t"
+        "paddw %%mm7, %%mm5 \n\t"
+        "pavgw %%mm1, %%mm0 \n\t"
+        "pavgw %%mm5, %%mm4 \n\t"
+        "psubw %%mm7, %%mm0 \n\t"
+        "psubw %%mm7, %%mm4 \n\t"
+        "psraw $1, %%mm0 \n\t"
+        "psraw $1, %%mm4 \n\t"
         "movq (%0), %%mm1 \n\t"
         "movq 8(%0), %%mm5 \n\t"
         "paddw %%mm1, %%mm0 \n\t"
@@ -467,16 +483,31 @@ void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
         snow_vertical_compose_sse2_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
         snow_vertical_compose_sse2_store(REG_S,"xmm0","xmm2","xmm4","xmm6")
         "mov %2, %%"REG_a" \n\t"
-        snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6")
-        snow_vertical_compose_sse2_sra("2","xmm0","xmm2","xmm4","xmm6")
+
+        "pcmpeqw %%xmm7, %%xmm7 \n\t"
+        "pcmpeqw %%xmm5, %%xmm5 \n\t"
+        "psllw $15, %%xmm7 \n\t"
+        "psrlw $13, %%xmm5 \n\t"
+        "paddw %%xmm7, %%xmm5 \n\t"
+        snow_vertical_compose_sse2_r2r_add("xmm5","xmm5","xmm5","xmm5","xmm0","xmm2","xmm4","xmm6")
+        "movq (%%"REG_a",%%"REG_d",2), %%xmm1 \n\t"
+        "movq 8(%%"REG_a",%%"REG_d",2), %%xmm3 \n\t"
+        "paddw %%xmm7, %%xmm1 \n\t"
+        "paddw %%xmm7, %%xmm3 \n\t"
+        "pavgw %%xmm1, %%xmm0 \n\t"
+        "pavgw %%xmm3, %%xmm2 \n\t"
+        "movq 16(%%"REG_a",%%"REG_d",2), %%xmm1 \n\t"
+        "movq 24(%%"REG_a",%%"REG_d",2), %%xmm3 \n\t"
+        "paddw %%xmm7, %%xmm1 \n\t"
+        "paddw %%xmm7, %%xmm3 \n\t"
+        "pavgw %%xmm1, %%xmm4 \n\t"
+        "pavgw %%xmm3, %%xmm6 \n\t"
+        snow_vertical_compose_sse2_sub("xmm7","xmm7","xmm7","xmm7","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_sra("1","xmm0","xmm2","xmm4","xmm6")
         snow_vertical_compose_sse2_add(REG_c,"xmm0","xmm2","xmm4","xmm6")
-        "pcmpeqd %%xmm1, %%xmm1 \n\t"
-        "psllw $15, %%xmm1 \n\t"
-        "psrlw $14, %%xmm1 \n\t"
         "mov %1, %%"REG_S" \n\t"
-        snow_vertical_compose_sse2_r2r_add("xmm1","xmm1","xmm1","xmm1","xmm0","xmm2","xmm4","xmm6")
         snow_vertical_compose_sse2_sra("2","xmm0","xmm2","xmm4","xmm6")
         snow_vertical_compose_sse2_add(REG_c,"xmm0","xmm2","xmm4","xmm6")
         snow_vertical_compose_sse2_store(REG_c,"xmm0","xmm2","xmm4","xmm6")
@@ -569,16 +600,30 @@ void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, I
         snow_vertical_compose_mmx_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
         snow_vertical_compose_mmx_store(REG_S,"mm0","mm2","mm4","mm6")
         "mov %2, %%"REG_a" \n\t"
-        snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6")
-        snow_vertical_compose_mmx_sra("2","mm0","mm2","mm4","mm6")
+        "pcmpeqw %%mm7, %%mm7 \n\t"
+        "pcmpeqw %%mm5, %%mm5 \n\t"
+        "psllw $15, %%mm7 \n\t"
+        "psrlw $13, %%mm5 \n\t"
+        "paddw %%mm7, %%mm5 \n\t"
+        snow_vertical_compose_mmx_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6")
+        "movq (%%"REG_a",%%"REG_d",2), %%mm1 \n\t"
+        "movq 8(%%"REG_a",%%"REG_d",2), %%mm3 \n\t"
+        "paddw %%mm7, %%mm1 \n\t"
+        "paddw %%mm7, %%mm3 \n\t"
+        "pavgw %%mm1, %%mm0 \n\t"
+        "pavgw %%mm3, %%mm2 \n\t"
+        "movq 16(%%"REG_a",%%"REG_d",2), %%mm1 \n\t"
+        "movq 24(%%"REG_a",%%"REG_d",2), %%mm3 \n\t"
+        "paddw %%mm7, %%mm1 \n\t"
+        "paddw %%mm7, %%mm3 \n\t"
+        "pavgw %%mm1, %%mm4 \n\t"
+        "pavgw %%mm3, %%mm6 \n\t"
+        snow_vertical_compose_sse2_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_mmx_sra("1","mm0","mm2","mm4","mm6")
         snow_vertical_compose_mmx_add(REG_c,"mm0","mm2","mm4","mm6")
-        "pcmpeqw %%mm1, %%mm1 \n\t"
-        "psllw $15, %%mm1 \n\t"
-        "psrlw $14, %%mm1 \n\t"
         "mov %1, %%"REG_S" \n\t"
-        snow_vertical_compose_mmx_r2r_add("mm1","mm1","mm1","mm1","mm0","mm2","mm4","mm6")
         snow_vertical_compose_mmx_sra("2","mm0","mm2","mm4","mm6")
         snow_vertical_compose_mmx_add(REG_c,"mm0","mm2","mm4","mm6")
         snow_vertical_compose_mmx_store(REG_c,"mm0","mm2","mm4","mm6")
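What makes the rewrite above overflow-safe is `pavgw`, which computes an unsigned rounded average with full intermediate precision; since `pavgw` only exists from MMX2/SSE onward, that is why the routine now needs MMX2 at minimum. Below is a minimal sketch of the underlying trick using SSE2 intrinsics; the helper name is illustrative, and it shows only the plain signed rounded average, whereas the patch additionally folds its rounding offsets into the bias constants held in xmm6/xmm7 (mm6/mm7):

```c
#include <emmintrin.h>  /* SSE2; _mm_avg_epu16() compiles to pavgw */

/* Rounded average (a + b + 1) >> 1 of signed 16-bit lanes without
 * overflowing 16 bits: bias both inputs into the unsigned range, let
 * pavgw average them without losing the carry bit, then remove the bias. */
static inline __m128i avg_round_epi16(__m128i a, __m128i b)
{
    const __m128i bias = _mm_set1_epi16((short)0x8000);
    __m128i ua = _mm_add_epi16(a, bias);   /* signed -> unsigned (wraps mod 2^16) */
    __m128i ub = _mm_add_epi16(b, bias);
    __m128i av = _mm_avg_epu16(ua, ub);    /* (ua + ub + 1) >> 1, no overflow */
    return _mm_sub_epi16(av, bias);        /* back to the signed range */
}
```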