author     Christophe Gisquet <christophe.gisquet@gmail.com>   2014-05-22 17:48:19 +0000
committer  Michael Niedermayer <michaelni@gmx.at>              2014-05-24 03:29:48 +0200
commit     f0aca50e0b21d7c97b091f8e551719e0da574e12 (patch)
tree       11152fa7e47b24ce24513ea75ccafdde41f22c82 /libavcodec
parent     9eaa8c22bc40ce3c5c6911e65f4e021587088881 (diff)
download   ffmpeg-f0aca50e0b21d7c97b091f8e551719e0da574e12.tar.gz
x86: hpeldsp: implement SSE2 versions
These are mostly used by codecs older than H.264, e.g. MPEG-2.
put16 versions (lower is better):
          mmx   mmx2   sse2
    x2:  1888   1185    552
    y2:  1778   1092    510

avg16 xy2: 3509 (mmx2) -> 2169 (sse2)
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
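The commit message does not say how the numbers above were measured; in FFmpeg they typically come from the START_TIMER/STOP_TIMER macros in libavutil/timer.h wrapped around the call under test. As a minimal, hypothetical stand-alone sketch only (time_one_call is illustrative and not part of this patch), one could read the x86 time-stamp counter around a single call:

/* Hypothetical timing sketch, NOT the harness behind the numbers above.
 * Reads the TSC before and after one halfpel call. */
#include <stdint.h>
#include <stddef.h>
#include <x86intrin.h>   /* __rdtsc(), GCC/Clang */

/* Same signature as the ff_{put,avg}_pixels16_* functions in the diff. */
typedef void (*op_pixels_func)(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);

static uint64_t time_one_call(op_pixels_func fn, uint8_t *dst,
                              const uint8_t *src, ptrdiff_t stride, int h)
{
    uint64_t t0 = __rdtsc();   /* serializing fences omitted for brevity */
    fn(dst, src, stride, h);
    return __rdtsc() - t0;     /* elapsed time-stamp-counter ticks */
}

In practice one would run the function many times and report a trimmed average, which is what FFmpeg's own timer macros do.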
Diffstat (limited to 'libavcodec')
 libavcodec/x86/hpeldsp.asm    | 115
 libavcodec/x86/hpeldsp_init.c |  15
 2 files changed, 100 insertions(+), 30 deletions(-)
diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index 2adead218c..1d26c4516e 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -35,21 +35,39 @@ SECTION_TEXT
 ; void ff_put_pixels8_x2(uint8_t *block, const uint8_t *pixels,
 ;                        ptrdiff_t line_size, int h)
 %macro PUT_PIXELS8_X2 0
+%if cpuflag(sse2)
+cglobal put_pixels16_x2, 4,5,4
+%else
 cglobal put_pixels8_x2, 4,5
+%endif
     lea          r4, [r2*2]
 .loop:
-    mova         m0, [r1]
-    mova         m1, [r1+r2]
-    PAVGB        m0, [r1+1]
-    PAVGB        m1, [r1+r2+1]
+    movu         m0, [r1+1]
+    movu         m1, [r1+r2+1]
+%if cpuflag(sse2)
+    movu         m2, [r1]
+    movu         m3, [r1+r2]
+    pavgb        m0, m2
+    pavgb        m1, m3
+%else
+    PAVGB        m0, [r1]
+    PAVGB        m1, [r1+r2]
+%endif
     mova       [r0], m0
     mova    [r0+r2], m1
     add          r1, r4
     add          r0, r4
-    mova         m0, [r1]
-    mova         m1, [r1+r2]
-    PAVGB        m0, [r1+1]
-    PAVGB        m1, [r1+r2+1]
+    movu         m0, [r1+1]
+    movu         m1, [r1+r2+1]
+%if cpuflag(sse2)
+    movu         m2, [r1]
+    movu         m3, [r1+r2]
+    pavgb        m0, m2
+    pavgb        m1, m3
+%else
+    PAVGB        m0, [r1]
+    PAVGB        m1, [r1+r2]
+%endif
     add          r1, r4
     mova       [r0], m0
     mova    [r0+r2], m1
@@ -107,6 +125,9 @@ INIT_MMX mmxext
 PUT_PIXELS_16
 INIT_MMX 3dnow
 PUT_PIXELS_16
+; The 8_X2 macro can easily be used here
+INIT_XMM sse2
+PUT_PIXELS8_X2
 
 ; void ff_put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels,
 ;                               ptrdiff_t line_size, int h)
@@ -199,20 +220,24 @@ PUT_NO_RND_PIXELS8_X2_EXACT
 ; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels,
 ;                        ptrdiff_t line_size, int h)
 %macro PUT_PIXELS8_Y2 0
+%if cpuflag(sse2)
+cglobal put_pixels16_y2, 4,5,3
+%else
 cglobal put_pixels8_y2, 4,5
+%endif
     lea          r4, [r2*2]
-    mova         m0, [r1]
+    movu         m0, [r1]
     sub          r0, r2
 .loop:
-    mova         m1, [r1+r2]
-    mova         m2, [r1+r4]
+    movu         m1, [r1+r2]
+    movu         m2, [r1+r4]
     add          r1, r4
     PAVGB        m0, m1
     PAVGB        m1, m2
     mova    [r0+r2], m0
     mova    [r0+r4], m1
-    mova         m1, [r1+r2]
-    mova         m0, [r1+r4]
+    movu         m1, [r1+r2]
+    movu         m0, [r1+r4]
     add          r0, r4
     add          r1, r4
     PAVGB        m2, m1
@@ -229,6 +254,9 @@ INIT_MMX mmxext
 PUT_PIXELS8_Y2
 INIT_MMX 3dnow
 PUT_PIXELS8_Y2
+; actually, put_pixels16_y2_sse2
+INIT_XMM sse2
+PUT_PIXELS8_Y2
 
 ; void ff_put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels,
 ;                               ptrdiff_t line_size, int h)
@@ -352,34 +380,50 @@ AVG_PIXELS8
 %endmacro
 
 %macro AVG_PIXELS8_X2 0
+%if cpuflag(sse2)
+cglobal avg_pixels16_x2, 4,5,4
+%else
 cglobal avg_pixels8_x2, 4,5
+%endif
     lea          r4, [r2*2]
 %if notcpuflag(mmxext)
     pcmpeqd      m5, m5
     paddb        m5, m5
 %endif
 .loop:
-    mova         m0, [r1]
-    mova         m2, [r1+r2]
+    movu         m0, [r1]
+    movu         m2, [r1+r2]
 %if notcpuflag(mmxext)
     PAVGB_MMX    [r1+1], m0, m3, m5
     PAVGB_MMX    [r1+r2+1], m2, m4, m5
     PAVGB_MMX    [r0], m0, m3, m5
     PAVGB_MMX    [r0+r2], m2, m4, m5
 %else
+%if cpuflag(sse2)
+    movu         m1, [r1+1]
+    movu         m3, [r1+r2+1]
+    pavgb        m0, m1
+    pavgb        m2, m3
+%else
     PAVGB        m0, [r1+1]
     PAVGB        m2, [r1+r2+1]
+%endif
     PAVGB        m0, [r0]
     PAVGB        m2, [r0+r2]
 %endif
     add          r1, r4
     mova       [r0], m0
     mova    [r0+r2], m2
-    mova         m0, [r1]
-    mova         m2, [r1+r2]
+    movu         m0, [r1]
+    movu         m2, [r1+r2]
 %if notcpuflag(mmxext)
     PAVGB_MMX    [r1+1], m0, m3, m5
     PAVGB_MMX    [r1+r2+1], m2, m4, m5
+%elif cpuflag(sse2)
+    movu         m1, [r1+1]
+    movu         m3, [r1+r2+1]
+    pavgb        m0, m1
+    pavgb        m2, m3
 %else
     PAVGB        m0, [r1+1]
     PAVGB        m2, [r1+r2+1]
@@ -389,6 +433,9 @@ cglobal avg_pixels8_x2, 4,5
 %if notcpuflag(mmxext)
     PAVGB_MMX    [r0], m0, m3, m5
     PAVGB_MMX    [r0+r2], m2, m4, m5
+%elif cpuflag(sse2)
+    pavgb        m0, [r0]
+    pavgb        m2, [r0+r2]
 %else
     PAVGB        m0, [r0]
     PAVGB        m2, [r0+r2]
@@ -407,36 +454,39 @@ INIT_MMX mmxext
 AVG_PIXELS8_X2
 INIT_MMX 3dnow
 AVG_PIXELS8_X2
+; actually avg_pixels16_x2
+INIT_XMM sse2
+AVG_PIXELS8_X2
 
 ; void ff_avg_pixels8_y2(uint8_t *block, const uint8_t *pixels,
 ;                        ptrdiff_t line_size, int h)
 %macro AVG_PIXELS8_Y2 0
+%if cpuflag(sse2)
+cglobal avg_pixels16_y2, 4,5,3
+%else
 cglobal avg_pixels8_y2, 4,5
+%endif
     lea          r4, [r2*2]
-    mova         m0, [r1]
+    movu         m0, [r1]
     sub          r0, r2
 .loop:
-    mova         m1, [r1+r2]
-    mova         m2, [r1+r4]
+    movu         m1, [r1+r2]
+    movu         m2, [r1+r4]
     add          r1, r4
     PAVGB        m0, m1
     PAVGB        m1, m2
-    mova         m3, [r0+r2]
-    mova         m4, [r0+r4]
-    PAVGB        m0, m3
-    PAVGB        m1, m4
+    PAVGB        m0, [r0+r2]
+    PAVGB        m1, [r0+r4]
     mova    [r0+r2], m0
     mova    [r0+r4], m1
-    mova         m1, [r1+r2]
-    mova         m0, [r1+r4]
+    movu         m1, [r1+r2]
+    movu         m0, [r1+r4]
     PAVGB        m2, m1
     PAVGB        m1, m0
     add          r0, r4
     add          r1, r4
-    mova         m3, [r0+r2]
-    mova         m4, [r0+r4]
-    PAVGB        m2, m3
-    PAVGB        m1, m4
+    PAVGB        m2, [r0+r2]
+    PAVGB        m1, [r0+r4]
     mova    [r0+r2], m2
     mova    [r0+r4], m1
     add          r0, r4
@@ -449,6 +499,9 @@ INIT_MMX mmxext
 AVG_PIXELS8_Y2
 INIT_MMX 3dnow
 AVG_PIXELS8_Y2
+; actually avg_pixels16_y2
+INIT_XMM sse2
+AVG_PIXELS8_Y2
 
 ; void ff_avg_pixels8_xy2(uint8_t *block, const uint8_t *pixels,
 ;                         ptrdiff_t line_size, int h)
@@ -571,3 +624,5 @@ INIT_MMX mmxext
 AVG_PIXELS_XY2
 INIT_MMX 3dnow
 AVG_PIXELS_XY2
+INIT_XMM sse2
+AVG_PIXELS_XY2
diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index 5e2ecb53a8..05bd561f59 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -40,6 +40,16 @@ void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                                ptrdiff_t line_size, int h);
 void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
+void ff_put_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h);
+void ff_avg_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h);
+void ff_put_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h);
+void ff_avg_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h);
+void ff_avg_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
 void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                                      ptrdiff_t line_size, int h);
 void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
@@ -284,7 +294,12 @@ static void hpeldsp_init_sse2(HpelDSPContext *c, int flags, int cpu_flags)
         // these functions are slower than mmx on AMD, but faster on Intel
         c->put_pixels_tab[0][0]        = ff_put_pixels16_sse2;
         c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
+        c->put_pixels_tab[0][1]        = ff_put_pixels16_x2_sse2;
+        c->put_pixels_tab[0][2]        = ff_put_pixels16_y2_sse2;
         c->avg_pixels_tab[0][0]        = ff_avg_pixels16_sse2;
+        c->avg_pixels_tab[0][1]        = ff_avg_pixels16_x2_sse2;
+        c->avg_pixels_tab[0][2]        = ff_avg_pixels16_y2_sse2;
+        c->avg_pixels_tab[0][3]        = ff_avg_pixels16_xy2_sse2;
     }
 #endif /* HAVE_SSE2_EXTERNAL */
 }
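The heavy lifting in the new kernels is the SSE2 pavgb instruction, which averages 16 unsigned bytes at once with rounding: per byte, (a + b + 1) >> 1. That is exactly the rounded half-pel interpolation the x2 (horizontal) and y2 (vertical) cases need. As a reading aid only, here is a hypothetical C-intrinsics rendering of what ff_put_pixels16_x2_sse2 computes; the real asm unrolls two rows per loop iteration and is not generated from this code:

#include <stdint.h>
#include <stddef.h>
#include <emmintrin.h>   /* SSE2 intrinsics */

/* Reading aid, not the actual implementation: one row per iteration.
 * pavgb == _mm_avg_epu8: per-byte rounded average (a + b + 1) >> 1. */
static void put_pixels16_x2_ref(uint8_t *block, const uint8_t *pixels,
                                ptrdiff_t line_size, int h)
{
    for (int i = 0; i < h; i++) {
        /* movu loads: pixels + 1 can never be 16-byte aligned, so the
         * asm has to use unaligned loads for the source. */
        __m128i a = _mm_loadu_si128((const __m128i *)pixels);
        __m128i b = _mm_loadu_si128((const __m128i *)(pixels + 1));
        /* mova store: the destination block is assumed 16-byte aligned. */
        _mm_store_si128((__m128i *)block, _mm_avg_epu8(a, b));
        pixels += line_size;
        block  += line_size;
    }
}

The avg_ variants perform the same interpolation and then pavgb the result once more against the destination, which is how hpeldsp implements motion-compensated averaging. On the init side, the table indices follow hpeldsp's convention: the first index selects the block width (0 = 16 pixels wide) and the second the half-pel case (0 = full-pel, 1 = x2, 2 = y2, 3 = xy2), which is why the patch fills [0][1], [0][2] and [0][3].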