diff options
author | Christophe Gisquet <christophe.gisquet@gmail.com> | 2014-05-22 17:48:18 +0000 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2014-05-22 20:17:49 +0200 |
commit | c081ca851c9b84a41fafbce22c47a9b30b7bf8ad (patch) | |
tree | eb7ad0723126fb7834ec4814e6bc917fc9ec65a7 /libavcodec/x86 | |
parent | 17ac9980555a0afc5b4633a538c5170c8ceacc04 (diff) | |
download | ffmpeg-c081ca851c9b84a41fafbce22c47a9b30b7bf8ad.tar.gz |
x86: hpeldsp: avg_pixels_xy2 for mmx2&3dnow
This is a port of the inline assembly of the mmx version, changed to use the
pavgb (mmxext) or pavgusb (3dnow) instruction.
           8     16
mmx     1498   4355
mmx2    1242   3509
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec/x86')
-rw-r--r-- | libavcodec/x86/hpeldsp.asm | 77 | ||||
-rw-r--r-- | libavcodec/x86/hpeldsp_init.c | 9 |
2 files changed, 86 insertions, 0 deletions
diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm index 937b59bf4f..2adead218c 100644 --- a/libavcodec/x86/hpeldsp.asm +++ b/libavcodec/x86/hpeldsp.asm @@ -29,6 +29,7 @@ SECTION_RODATA cextern pb_1 +cextern pw_2 SECTION_TEXT @@ -494,3 +495,79 @@ INIT_MMX mmxext AVG_APPROX_PIXELS8_XY2 INIT_MMX 3dnow AVG_APPROX_PIXELS8_XY2 + + +; void ff_avg_pixels16_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) +%macro AVG_PIXELS_XY2 0 +%if cpuflag(sse2) +cglobal avg_pixels16_xy2, 4,5,8 +%else +cglobal avg_pixels8_xy2, 4,5 +%endif + pxor m7, m7 + mova m6, [pw_2] + movu m0, [r1] + movu m4, [r1+1] + mova m1, m0 + mova m5, m4 + punpcklbw m0, m7 + punpcklbw m4, m7 + punpckhbw m1, m7 + punpckhbw m5, m7 + paddusw m4, m0 + paddusw m5, m1 + xor r4, r4 + add r1, r2 +.loop: + movu m0, [r1+r4] + movu m2, [r1+r4+1] + mova m1, m0 + mova m3, m2 + punpcklbw m0, m7 + punpcklbw m2, m7 + punpckhbw m1, m7 + punpckhbw m3, m7 + paddusw m0, m2 + paddusw m1, m3 + paddusw m4, m6 + paddusw m5, m6 + paddusw m4, m0 + paddusw m5, m1 + psrlw m4, 2 + psrlw m5, 2 + mova m3, [r0+r4] + packuswb m4, m5 + PAVGB m4, m3 + mova [r0+r4], m4 + add r4, r2 + + movu m2, [r1+r4] + movu m4, [r1+r4+1] + mova m3, m2 + mova m5, m4 + punpcklbw m2, m7 + punpcklbw m4, m7 + punpckhbw m3, m7 + punpckhbw m5, m7 + paddusw m4, m2 + paddusw m5, m3 + paddusw m0, m6 + paddusw m1, m6 + paddusw m0, m4 + paddusw m1, m5 + psrlw m0, 2 + psrlw m1, 2 + mova m3, [r0+r4] + packuswb m0, m1 + PAVGB m0, m3 + mova [r0+r4], m0 + add r4, r2 + sub r3d, 2 + jnz .loop + REP_RET +%endmacro + +INIT_MMX mmxext +AVG_PIXELS_XY2 +INIT_MMX 3dnow +AVG_PIXELS_XY2 diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c index 6e9f66340c..5e2ecb53a8 100644 --- a/libavcodec/x86/hpeldsp_init.c +++ b/libavcodec/x86/hpeldsp_init.c @@ -74,6 +74,10 @@ void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels, 
ptrdiff_t line_size, int h); +void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); +void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); void ff_avg_approx_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); void ff_avg_approx_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels, @@ -156,6 +160,7 @@ CALL_2X_PIXELS_EXPORT(ff_put_pixels16_xy2_mmx, ff_put_pixels8_xy2_mmx, 8) CALL_2X_PIXELS(avg_pixels16 ## CPUEXT, ff_avg_pixels8 ## CPUEXT, 8) \ CALL_2X_PIXELS(avg_pixels16_x2 ## CPUEXT, ff_avg_pixels8_x2 ## CPUEXT, 8) \ CALL_2X_PIXELS(avg_pixels16_y2 ## CPUEXT, ff_avg_pixels8_y2 ## CPUEXT, 8) \ + CALL_2X_PIXELS(avg_pixels16_xy2 ## CPUEXT, ff_avg_pixels8_xy2 ## CPUEXT, 8) \ CALL_2X_PIXELS(avg_approx_pixels16_xy2## CPUEXT, ff_avg_approx_pixels8_xy2## CPUEXT, 8) HPELDSP_AVG_PIXELS16(_3dnow) @@ -209,6 +214,7 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int cpu_flags) c->avg_pixels_tab[0][0] = avg_pixels16_mmxext; c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext; c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext; + c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext; c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext; c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext; @@ -216,6 +222,7 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int cpu_flags) c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext; c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext; c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext; + c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext; if (!(flags & CODEC_FLAG_BITEXACT)) { c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext; @@ -243,6 +250,7 @@ static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags, int cpu_flags) c->avg_pixels_tab[0][0] = avg_pixels16_3dnow; c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow; c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow; + 
c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow; c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow; c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow; @@ -250,6 +258,7 @@ static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags, int cpu_flags) c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow; c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow; c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow; + c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow; if (!(flags & CODEC_FLAG_BITEXACT)){ c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow; |