aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/i386
diff options
context:
space:
mode:
authorLoren Merritt <lorenm@u.washington.edu>2006-08-03 03:18:47 +0000
committerLoren Merritt <lorenm@u.washington.edu>2006-08-03 03:18:47 +0000
commit2dac4acfc0f2abbe28082cdb5c3ed775a78d2867 (patch)
treeae3bf6a7ddd9bb5bf29a305eef842488629965d9 /libavcodec/i386
parent7bf0049623652b92a566999d37f0b481c2056d6e (diff)
downloadffmpeg-2dac4acfc0f2abbe28082cdb5c3ed775a78d2867.tar.gz
sse & sse2 implementations of vorbis channel coupling.
9% faster vorbis (on a K8). Originally committed as revision 5898 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/i386')
-rw-r--r--libavcodec/i386/dsputil_mmx.c58
1 files changed, 58 insertions, 0 deletions
diff --git a/libavcodec/i386/dsputil_mmx.c b/libavcodec/i386/dsputil_mmx.c
index ec6b2ad1a7..afcb02e4db 100644
--- a/libavcodec/i386/dsputil_mmx.c
+++ b/libavcodec/i386/dsputil_mmx.c
@@ -2711,6 +2711,59 @@ static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
}
#endif
+static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
+{
+ int i;
+ asm volatile("pxor %%mm7, %%mm7":);
+ for(i=0; i<blocksize; i+=2) {
+ asm volatile(
+ "movq %0, %%mm0 \n\t"
+ "movq %1, %%mm1 \n\t"
+ "movq %%mm0, %%mm2 \n\t"
+ "movq %%mm1, %%mm3 \n\t"
+ "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
+ "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
+ "pslld $31, %%mm2 \n\t" // keep only the sign bit
+ "pxor %%mm2, %%mm1 \n\t"
+ "movq %%mm3, %%mm4 \n\t"
+ "pand %%mm1, %%mm3 \n\t"
+ "pandn %%mm1, %%mm4 \n\t"
+ "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
+ "pfsub %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
+ "movq %%mm3, %1 \n\t"
+ "movq %%mm0, %0 \n\t"
+ :"+m"(mag[i]), "+m"(ang[i])
+ ::"memory"
+ );
+ }
+ asm volatile("emms");
+}
+static void vorbis_inverse_coupling_sse2(float *mag, float *ang, int blocksize)
+{
+ int i;
+ for(i=0; i<blocksize; i+=4) {
+ asm volatile(
+ "movaps %0, %%xmm0 \n\t"
+ "movaps %1, %%xmm1 \n\t"
+ "pxor %%xmm2, %%xmm2 \n\t"
+ "pxor %%xmm3, %%xmm3 \n\t"
+ "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
+ "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
+ "pslld $31, %%xmm2 \n\t" // keep only the sign bit
+ "pxor %%xmm2, %%xmm1 \n\t"
+ "movaps %%xmm3, %%xmm4 \n\t"
+ "pand %%xmm1, %%xmm3 \n\t"
+ "pandn %%xmm1, %%xmm4 \n\t"
+ "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
+ "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
+ "movaps %%xmm3, %1 \n\t"
+ "movaps %%xmm0, %0 \n\t"
+ :"+m"(mag[i]), "+m"(ang[i])
+ ::"memory"
+ );
+ }
+}
+
#ifdef CONFIG_SNOW_ENCODER
extern void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width);
extern void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width);
@@ -3137,6 +3190,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
}
#endif
+
+ if(mm_flags & MM_SSE2)
+ c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse2;
+ else if(mm_flags & MM_SSE)
+ c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
}
#ifdef CONFIG_ENCODERS