aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/x86
diff options
context:
space:
mode:
authorChristophe Gisquet <christophe.gisquet@gmail.com>2014-05-22 17:48:18 +0000
committerMichael Niedermayer <michaelni@gmx.at>2014-05-22 20:17:49 +0200
commitc081ca851c9b84a41fafbce22c47a9b30b7bf8ad (patch)
treeeb7ad0723126fb7834ec4814e6bc917fc9ec65a7 /libavcodec/x86
parent17ac9980555a0afc5b4633a538c5170c8ceacc04 (diff)
downloadffmpeg-c081ca851c9b84a41fafbce22c47a9b30b7bf8ad.tar.gz
x86: hpeldsp: avg_pixels_xy2 for mmx2&3dnow
This is a port of the inline assembly of the mmx version to use the pavg(us|)b instruction. 8 16 mmx 1498 4355 mmx2 1242 3509 Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec/x86')
-rw-r--r--libavcodec/x86/hpeldsp.asm77
-rw-r--r--libavcodec/x86/hpeldsp_init.c9
2 files changed, 86 insertions, 0 deletions
diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index 937b59bf4f..2adead218c 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -29,6 +29,7 @@
SECTION_RODATA
cextern pb_1
+cextern pw_2
SECTION_TEXT
@@ -494,3 +495,79 @@ INIT_MMX mmxext
AVG_APPROX_PIXELS8_XY2
INIT_MMX 3dnow
AVG_APPROX_PIXELS8_XY2
+
+
+; void ff_avg_pixels16_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+%macro AVG_PIXELS_XY2 0
+%if cpuflag(sse2)
+cglobal avg_pixels16_xy2, 4,5,8
+%else
+cglobal avg_pixels8_xy2, 4,5
+%endif
+ pxor m7, m7
+ mova m6, [pw_2]
+ movu m0, [r1]
+ movu m4, [r1+1]
+ mova m1, m0
+ mova m5, m4
+ punpcklbw m0, m7
+ punpcklbw m4, m7
+ punpckhbw m1, m7
+ punpckhbw m5, m7
+ paddusw m4, m0
+ paddusw m5, m1
+ xor r4, r4
+ add r1, r2
+.loop:
+ movu m0, [r1+r4]
+ movu m2, [r1+r4+1]
+ mova m1, m0
+ mova m3, m2
+ punpcklbw m0, m7
+ punpcklbw m2, m7
+ punpckhbw m1, m7
+ punpckhbw m3, m7
+ paddusw m0, m2
+ paddusw m1, m3
+ paddusw m4, m6
+ paddusw m5, m6
+ paddusw m4, m0
+ paddusw m5, m1
+ psrlw m4, 2
+ psrlw m5, 2
+ mova m3, [r0+r4]
+ packuswb m4, m5
+ PAVGB m4, m3
+ mova [r0+r4], m4
+ add r4, r2
+
+ movu m2, [r1+r4]
+ movu m4, [r1+r4+1]
+ mova m3, m2
+ mova m5, m4
+ punpcklbw m2, m7
+ punpcklbw m4, m7
+ punpckhbw m3, m7
+ punpckhbw m5, m7
+ paddusw m4, m2
+ paddusw m5, m3
+ paddusw m0, m6
+ paddusw m1, m6
+ paddusw m0, m4
+ paddusw m1, m5
+ psrlw m0, 2
+ psrlw m1, 2
+ mova m3, [r0+r4]
+ packuswb m0, m1
+ PAVGB m0, m3
+ mova [r0+r4], m0
+ add r4, r2
+ sub r3d, 2
+ jnz .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+AVG_PIXELS_XY2
+INIT_MMX 3dnow
+AVG_PIXELS_XY2
diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index 6e9f66340c..5e2ecb53a8 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -74,6 +74,10 @@ void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
+void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
void ff_avg_approx_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_approx_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
@@ -156,6 +160,7 @@ CALL_2X_PIXELS_EXPORT(ff_put_pixels16_xy2_mmx, ff_put_pixels8_xy2_mmx, 8)
CALL_2X_PIXELS(avg_pixels16 ## CPUEXT, ff_avg_pixels8 ## CPUEXT, 8) \
CALL_2X_PIXELS(avg_pixels16_x2 ## CPUEXT, ff_avg_pixels8_x2 ## CPUEXT, 8) \
CALL_2X_PIXELS(avg_pixels16_y2 ## CPUEXT, ff_avg_pixels8_y2 ## CPUEXT, 8) \
+ CALL_2X_PIXELS(avg_pixels16_xy2 ## CPUEXT, ff_avg_pixels8_xy2 ## CPUEXT, 8) \
CALL_2X_PIXELS(avg_approx_pixels16_xy2## CPUEXT, ff_avg_approx_pixels8_xy2## CPUEXT, 8)
HPELDSP_AVG_PIXELS16(_3dnow)
@@ -209,6 +214,7 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int cpu_flags)
c->avg_pixels_tab[0][0] = avg_pixels16_mmxext;
c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext;
c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext;
+ c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;
c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;
@@ -216,6 +222,7 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int cpu_flags)
c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
+ c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
if (!(flags & CODEC_FLAG_BITEXACT)) {
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext;
@@ -243,6 +250,7 @@ static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags, int cpu_flags)
c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
+ c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;
@@ -250,6 +258,7 @@ static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags, int cpu_flags)
c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;
+ c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
if (!(flags & CODEC_FLAG_BITEXACT)){
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;