aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/x86/dsputil_mmx.c
diff options
context:
space:
mode:
authorLoren Merritt <lorenm@u.washington.edu>2009-12-03 18:53:12 +0000
committerLoren Merritt <lorenm@u.washington.edu>2009-12-03 18:53:12 +0000
commitb10fa1bb8b9d4074b1d4dd7364ca236a574c9951 (patch)
treeb7f75f84fb13d8a68ee8079da1f03089cc33b9c5 /libavcodec/x86/dsputil_mmx.c
parentf415be684dda3958332d19e0854c080378eef4c2 (diff)
downloadffmpeg-b10fa1bb8b9d4074b1d4dd7364ca236a574c9951.tar.gz
port ape dsp functions from sse2 to mmx
now requires yasm Originally committed as revision 20722 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/x86/dsputil_mmx.c')
-rw-r--r--libavcodec/x86/dsputil_mmx.c93
1 files changed, 18 insertions, 75 deletions
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index af33707df0..93d4af5565 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2384,6 +2384,12 @@ static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
+void ff_add_int16_mmx2(int16_t * v1, int16_t * v2, int order);
+void ff_add_int16_sse2(int16_t * v1, int16_t * v2, int order);
+void ff_sub_int16_mmx2(int16_t * v1, int16_t * v2, int order);
+void ff_sub_int16_sse2(int16_t * v1, int16_t * v2, int order);
+int32_t ff_scalarproduct_int16_mmx2(int16_t * v1, int16_t * v2, int order, int shift);
+int32_t ff_scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift);
void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
@@ -2507,78 +2513,6 @@ void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, ui
int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
-static void add_int16_sse2(int16_t * v1, int16_t * v2, int order)
-{
- x86_reg o = -(order << 1);
- v1 += order;
- v2 += order;
- __asm__ volatile(
- "1: \n\t"
- "movdqu (%1,%2), %%xmm0 \n\t"
- "movdqu 16(%1,%2), %%xmm1 \n\t"
- "paddw (%0,%2), %%xmm0 \n\t"
- "paddw 16(%0,%2), %%xmm1 \n\t"
- "movdqa %%xmm0, (%0,%2) \n\t"
- "movdqa %%xmm1, 16(%0,%2) \n\t"
- "add $32, %2 \n\t"
- "js 1b \n\t"
- : "+r"(v1), "+r"(v2), "+r"(o)
- );
-}
-
-static void sub_int16_sse2(int16_t * v1, int16_t * v2, int order)
-{
- x86_reg o = -(order << 1);
- v1 += order;
- v2 += order;
- __asm__ volatile(
- "1: \n\t"
- "movdqa (%0,%2), %%xmm0 \n\t"
- "movdqa 16(%0,%2), %%xmm2 \n\t"
- "movdqu (%1,%2), %%xmm1 \n\t"
- "movdqu 16(%1,%2), %%xmm3 \n\t"
- "psubw %%xmm1, %%xmm0 \n\t"
- "psubw %%xmm3, %%xmm2 \n\t"
- "movdqa %%xmm0, (%0,%2) \n\t"
- "movdqa %%xmm2, 16(%0,%2) \n\t"
- "add $32, %2 \n\t"
- "js 1b \n\t"
- : "+r"(v1), "+r"(v2), "+r"(o)
- );
-}
-
-static int32_t scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift)
-{
- int res = 0;
- DECLARE_ALIGNED_16(xmm_reg, sh);
- x86_reg o = -(order << 1);
-
- v1 += order;
- v2 += order;
- sh.a = shift;
- __asm__ volatile(
- "pxor %%xmm7, %%xmm7 \n\t"
- "1: \n\t"
- "movdqu (%0,%3), %%xmm0 \n\t"
- "movdqu 16(%0,%3), %%xmm1 \n\t"
- "pmaddwd (%1,%3), %%xmm0 \n\t"
- "pmaddwd 16(%1,%3), %%xmm1 \n\t"
- "paddd %%xmm0, %%xmm7 \n\t"
- "paddd %%xmm1, %%xmm7 \n\t"
- "add $32, %3 \n\t"
- "js 1b \n\t"
- "movhlps %%xmm7, %%xmm2 \n\t"
- "paddd %%xmm2, %%xmm7 \n\t"
- "psrad %4, %%xmm7 \n\t"
- "pshuflw $0x4E, %%xmm7,%%xmm2 \n\t"
- "paddd %%xmm2, %%xmm7 \n\t"
- "movd %%xmm7, %2 \n\t"
- : "+r"(v1), "+r"(v2), "=r"(res), "+r"(o)
- : "m"(sh)
- );
- return res;
-}
-
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
mm_flags = mm_support();
@@ -3015,6 +2949,13 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
}
}
+ if(mm_flags & FF_MM_MMX2){
+#if HAVE_YASM
+ c->add_int16 = ff_add_int16_mmx2;
+ c->sub_int16 = ff_sub_int16_mmx2;
+ c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
+#endif
+ }
if(mm_flags & FF_MM_SSE){
c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
c->ac3_downmix = ac3_downmix_sse;
@@ -3033,9 +2974,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
c->float_to_int16 = float_to_int16_sse2;
c->float_to_int16_interleave = float_to_int16_interleave_sse2;
- c->add_int16 = add_int16_sse2;
- c->sub_int16 = sub_int16_sse2;
- c->scalarproduct_int16 = scalarproduct_int16_sse2;
+#if HAVE_YASM
+ c->add_int16 = ff_add_int16_sse2;
+ c->sub_int16 = ff_sub_int16_sse2;
+ c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
+#endif
}
}