diff options
author | Daniel Kang <daniel.d.kang@gmail.com> | 2011-07-11 17:26:43 -0400 |
---|---|---|
committer | Ronald S. Bultje <rsbultje@gmail.com> | 2011-07-13 18:44:51 -0700 |
commit | ac4a85f47642b8d84c369bdb49b8d58611f1ca5e (patch) | |
tree | 3ce28f24cf8920eeb5c7a8c0fd9820c298737da2 /libavcodec | |
parent | e358f7ee90fec591348ca05dff94ebaf4c1a098b (diff) | |
download | ffmpeg-ac4a85f47642b8d84c369bdb49b8d58611f1ca5e.tar.gz |
H.264: Add more x86 assembly for 10-bit H.264 predict functions
Mainly ported from 8-bit H.264 predict.
Some code ported from x264. LGPL ok by author.
Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
Diffstat (limited to 'libavcodec')
-rw-r--r-- | libavcodec/x86/h264_intrapred_10bit.asm | 231 | ||||
-rw-r--r-- | libavcodec/x86/h264_intrapred_init.c | 23 |
2 files changed, 253 insertions, 1 deletions
diff --git a/libavcodec/x86/h264_intrapred_10bit.asm b/libavcodec/x86/h264_intrapred_10bit.asm
index d57fc79754..24a7bfa875 100644
--- a/libavcodec/x86/h264_intrapred_10bit.asm
+++ b/libavcodec/x86/h264_intrapred_10bit.asm
@@ -29,11 +29,19 @@ SECTION_RODATA
 
+pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4   ; per-column weights (x-3) used by pred8x8_plane; kept in rodata, not in .text
+pw_m3:        times 8 dw -3                  ; multiplied by c to get the row-0 vertical term
+pw_pixel_max: times 8 dw ((1 << 10)-1)       ; 1023, upper clip bound passed to CLIPW
+pw_512:       times 8 dw 512                 ; fill value stored by the *_128_dc predictors
+pd_17:        times 4 dd 17                  ; plane slope scale: (17*x + 16) >> 5
+pd_16:        times 4 dd 16                  ; rounding term added before the >> 5
+
 SECTION .text
 
+cextern pw_16
 cextern pw_8
 cextern pw_4
 cextern pw_2
 cextern pw_1
 
 ; dest, left, right, src
 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
 %macro PRED4x4_LOWPASS 4
@@ -464,7 +472,92 @@ PRED8x8_TOP_DC mmxext, pshufw
 INIT_XMM
 PRED8x8_TOP_DC sse2  , pshuflw
 
+;-----------------------------------------------------------------------------
+; void pred8x8_plane(pixel *src, int stride)
+;-----------------------------------------------------------------------------
+INIT_XMM
+cglobal pred8x8_plane_10_sse2, 2,7,7
+    sub        r0, r1                   ; r0 = src - stride (row above the block)
+    lea        r2, [r1+r1*2]            ; r2 = 3*stride
+    lea        r3, [r0+r1*4]            ; r3 = src + 3*stride
+    mova       m2, [r0]                 ; the 8 words of the row above
+    pmaddwd    m2, [pw_m32101234]       ; weight columns by -3..4, summed in pairs
+    HADDD      m2, m1                   ; NOTE(review): presumably x86util's horizontal dword add, m1 as scratch
+    movd       m0, [r0-4]               ; words src[-stride-2], src[-stride-1]
+    psrld      m0, 14                   ; 4*src[-stride-1]; assumes samples < 1<<14 so the low word shifts out
+    psubw      m2, m0 ; H
+    movd       m0, [r3+r1*4-4]          ; words src[7*stride-2], src[7*stride-1]
+    movd       m1, [r0+12]              ; words src[-stride+6], src[-stride+7]
+    paddw      m0, m1                   ; word 1 = src[7*stride-1] + src[-stride+7]
+    psllw      m0, 4 ; 16*(src[7*stride-1] + src[-stride+7])
+    movzx      r4d, word [r3+r1*1-2] ; src[4*stride-1]
+    movzx      r5d, word [r0+r2*1-2] ; src[2*stride-1]
+    sub        r4d, r5d                 ; 1*(left[4] - left[2])
+    movzx      r6d, word [r3+r1*2-2] ; src[5*stride-1]
+    movzx      r5d, word [r0+r1*2-2] ; src[1*stride-1]
+    sub        r6d, r5d
+    lea        r4d, [r4+r6*2]           ; + 2*(left[5] - left[1])
+    movzx      r5d, word [r3+r2*1-2] ; src[6*stride-1]
+    movzx      r6d, word [r0+r1*1-2] ; src[0*stride-1]
+    sub        r5d, r6d
+    lea        r5d, [r5+r5*2]           ; 3*(left[6] - left[0])
+    add        r4d, r5d
+    movzx      r6d, word [r3+r1*4-2] ; src[7*stride-1]
+    movzx      r5d, word [r0+r1*0-2] ; src[ -stride-1]
+    sub        r6d, r5d
+    lea        r4d, [r4+r6*4]           ; + 4*(left[7] - top-left)
+    movd       m3, r4d ; V
+    punpckldq  m2, m3                   ; dword 0 = H, dword 1 = V
+    pmaddwd    m2, [pd_17]              ; low word of each dword times 17 (high word times 0)
+    paddd      m2, [pd_16]              ; + 16 (rounding)
+    psrad      m2, 5 ; b, c
+    mova       m3, [pw_pixel_max]       ; upper clip bound
+    pxor       m1, m1                   ; lower clip bound (0)
+    SPLATW     m0, m0, 1                ; broadcast word 1: 16*(src[7*stride-1] + src[-stride+7])
+    SPLATW     m4, m2, 2                ; broadcast word 2: c
+    SPLATW     m2, m2, 0                ; broadcast word 0: b
+    pmullw     m2, [pw_m32101234] ; b
+    pmullw     m5, m4, [pw_m3] ; c
+    paddw      m5, [pw_16]              ; -3*c + 16: vertical term of row 0 plus rounding
+    mov        r2d, 8                   ; 8 output rows
+    add        r0, r1                   ; r0 = src (row 0)
+.loop:
+    paddsw     m6, m2, m5               ; horizontal + vertical term, signed saturation
+    paddsw     m6, m0                   ; + 16*(src[7*stride-1] + src[-stride+7])
+    psraw      m6, 5
+    CLIPW      m6, m1, m3               ; NOTE(review): presumably clamps to [m1, m3] = [0, pw_pixel_max]
+    mova       [r0], m6                 ; store one row of 8 samples
+    paddw      m5, m4                   ; next row: vertical term grows by c
+    add        r0, r1
+    dec        r2d
+    jg         .loop
+    REP_RET
+
+
 
+;-----------------------------------------------------------------------------
+; void pred8x8l_128_dc(pixel *src, int has_topleft, int has_topright, int stride)
+;-----------------------------------------------------------------------------
+%macro PRED8x8L_128_DC 1
+cglobal pred8x8l_128_dc_10_%1, 4,4
+    mova       m0, [pw_512]             ; fill value for every sample
+    lea        r1, [r3+r3*2]            ; r1 = 3*stride (has_topleft argument is overwritten)
+    lea        r2, [r0+r3*4]            ; r2 = src + 4*stride (has_topright argument is overwritten)
+    MOV8       r0+r3*0, m0, m0          ; rows 0-3 via r0
+    MOV8       r0+r3*1, m0, m0
+    MOV8       r0+r3*2, m0, m0
+    MOV8       r0+r1*1, m0, m0
+    MOV8       r2+r3*0, m0, m0          ; rows 4-7 via r2
+    MOV8       r2+r3*1, m0, m0
+    MOV8       r2+r3*2, m0, m0
+    MOV8       r2+r1*1, m0, m0
+    RET
+%endmacro
+
+INIT_MMX
+PRED8x8L_128_DC mmxext
+INIT_XMM
+PRED8x8L_128_DC sse2
 
 ;-----------------------------------------------------------------------------
 ; void pred8x8l_top_dc(pixel *src, int has_topleft, int has_topright, int stride)
@@ -1258,7 +1351,7 @@ cglobal pred16x16_horizontal_10_%1, 2,3
     MOV16      r0+r1*1, m1, m1, m1, m1
     lea        r0, [r0+r1*2]
     dec        r2
-    jge        .vloop
+    jg         .vloop                   ; stop when the counter reaches 0; jge ran one extra pass
     REP_RET
 %endmacro
 
@@ -1266,3 +1359,139 @@ INIT_MMX
 PRED16x16_HORIZONTAL mmxext
 INIT_XMM
 PRED16x16_HORIZONTAL sse2
+
+;-----------------------------------------------------------------------------
+; void pred16x16_dc(pixel *src, int stride)
+;-----------------------------------------------------------------------------
+%macro PRED16x16_DC 1
+cglobal pred16x16_dc_10_%1, 2,7
+    mov        r4, r0                   ; keep src for the store loop
+    sub        r0, r1                   ; r0 = src - stride (row above the block)
+    mova       m0, [r0+0]
+    paddw      m0, [r0+mmsize]          ; add the next mmsize bytes of the top row
+%if mmsize==8
+    paddw      m0, [r0+16]              ; MMX only: the remaining 16 bytes of the 32-byte row
+    paddw      m0, [r0+24]
+%endif
+    HADDW      m0, m2                   ; NOTE(review): presumably horizontal word add (m2 scratch) -> sum of 16 top samples
+
+    sub        r0, 2                    ; one sample to the left: r0 = &src[-stride-1]
+    movzx      r3d, word [r0+r1*1]      ; left sample of row 0
+    movzx      r5d, word [r0+r1*2]      ; left sample of row 1
+%rep 7
+    lea        r0, [r0+r1*2]            ; advance two rows
+    movzx      r2d, word [r0+r1*1]
+    add        r3d, r2d                 ; even rows accumulate in r3d
+    movzx      r2d, word [r0+r1*2]
+    add        r5d, r2d                 ; odd rows accumulate in r5d
+%endrep
+    lea        r3d, [r3+r5+16]          ; sum of 16 left samples + 16 (rounding)
+
+    movd       m1, r3d
+    paddw      m0, m1                   ; top sum + left sum + 16
+    psrlw      m0, 5                    ; / 32
+    SPLATW     m0, m0                   ; broadcast the DC value
+    mov        r3d, 8                   ; 8 passes, 2 rows each
+.loop:
+    MOV16      r4+r1*0, m0, m0, m0, m0
+    MOV16      r4+r1*1, m0, m0, m0, m0
+    lea        r4, [r4+r1*2]
+    dec        r3d
+    jg         .loop
+    REP_RET
+%endmacro
+
+INIT_MMX
+PRED16x16_DC mmxext
+INIT_XMM
+PRED16x16_DC sse2
+
+;-----------------------------------------------------------------------------
+; void pred16x16_top_dc(pixel *src, int stride)
+;-----------------------------------------------------------------------------
+%macro PRED16x16_TOP_DC 1
+cglobal pred16x16_top_dc_10_%1, 2,3
+    sub        r0, r1                   ; r0 = src - stride (row above the block)
+    mova       m0, [r0+0]
+    paddw      m0, [r0+mmsize]          ; add the next mmsize bytes of the top row
+%if mmsize==8
+    paddw      m0, [r0+16]              ; MMX only: the remaining 16 bytes of the 32-byte row
+    paddw      m0, [r0+24]
+%endif
+    HADDW      m0, m2                   ; NOTE(review): presumably horizontal word add (m2 scratch) -> sum of 16 top samples
+
+    SPLATW     m0, m0                   ; broadcast the sum
+    paddw      m0, [pw_8]               ; + 8 (rounding)
+    psrlw      m0, 4                    ; / 16
+    mov        r2d, 8                   ; 8 passes, 2 rows each
+.loop:
+    MOV16      r0+r1*1, m0, m0, m0, m0  ; r0 still points one row above the row being written
+    MOV16      r0+r1*2, m0, m0, m0, m0
+    lea        r0, [r0+r1*2]
+    dec        r2d
+    jg         .loop
+    REP_RET
+%endmacro
+
+INIT_MMX
+PRED16x16_TOP_DC mmxext
+INIT_XMM
+PRED16x16_TOP_DC sse2
+
+;-----------------------------------------------------------------------------
+; void pred16x16_left_dc(pixel *src, int stride)
+;-----------------------------------------------------------------------------
+%macro PRED16x16_LEFT_DC 1
+cglobal pred16x16_left_dc_10_%1, 2,7
+    mov        r4, r0                   ; keep src for the store loop
+
+    sub        r0, 2                    ; one sample to the left: r0 = &src[-1]
+    movzx      r5d, word [r0+r1*0]      ; left sample of row 0
+    movzx      r6d, word [r0+r1*1]      ; left sample of row 1
+%rep 7
+    lea        r0, [r0+r1*2]            ; advance two rows
+    movzx      r2d, word [r0+r1*0]
+    movzx      r3d, word [r0+r1*1]
+    add        r5d, r2d                 ; even rows accumulate in r5d
+    add        r6d, r3d                 ; odd rows accumulate in r6d
+%endrep
+    lea        r2d, [r5+r6+8]           ; sum of 16 left samples + 8 (rounding)
+    shr        r2d, 4                   ; / 16
+
+    movd       m0, r2d
+    SPLATW     m0, m0                   ; broadcast the DC value
+    mov        r3d, 8                   ; 8 passes, 2 rows each
+.loop:
+    MOV16      r4+r1*0, m0, m0, m0, m0
+    MOV16      r4+r1*1, m0, m0, m0, m0
+    lea        r4, [r4+r1*2]
+    dec        r3d
+    jg         .loop
+    REP_RET
+%endmacro
+
+INIT_MMX
+PRED16x16_LEFT_DC mmxext
+INIT_XMM
+PRED16x16_LEFT_DC sse2
+
+;-----------------------------------------------------------------------------
+; void pred16x16_128_dc(pixel *src, int stride)
+;-----------------------------------------------------------------------------
+%macro PRED16x16_128_DC 1
+cglobal pred16x16_128_dc_10_%1, 2,3
+    mova       m0, [pw_512]             ; fill value for every sample
+    mov        r2d, 8                   ; 8 passes, 2 rows each
+.loop:
+    MOV16      r0+r1*0, m0, m0, m0, m0
+    MOV16      r0+r1*1, m0, m0, m0, m0
+    lea        r0, [r0+r1*2]
+    dec        r2d
+    jg         .loop
+    REP_RET
+%endmacro
+
+INIT_MMX
+PRED16x16_128_DC mmxext
+INIT_XMM
+PRED16x16_128_DC sse2
diff --git a/libavcodec/x86/h264_intrapred_init.c b/libavcodec/x86/h264_intrapred_init.c
index a0c5164db1..62e4c8796b 100644
--- a/libavcodec/x86/h264_intrapred_init.c
+++ b/libavcodec/x86/h264_intrapred_init.c
@@ -47,6 +47,7 @@ PRED8x8(dc, 10, mmxext)
 PRED8x8(dc, 10, sse2)
 PRED8x8(top_dc, 10, mmxext)
 PRED8x8(top_dc, 10, sse2)
+PRED8x8(plane, 10, sse2)
 PRED8x8(vertical, 10, sse2)
 PRED8x8(horizontal, 10, sse2)
 
@@ -55,6 +56,8 @@ void ff_pred8x8l_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, int has_tople
 
 PRED8x8L(dc, 10, sse2)
 PRED8x8L(dc, 10, ssse3)
+PRED8x8L(128_dc, 10, mmxext)
+PRED8x8L(128_dc, 10, sse2)
 PRED8x8L(top_dc, 10, sse2)
 PRED8x8L(top_dc, 10, ssse3)
 PRED8x8L(vertical, 10, sse2)
@@ -73,6 +76,14 @@ PRED8x8L(horizontal_up, 10, ssse3)
 #define PRED16x16(TYPE, DEPTH, OPT)\
 void ff_pred16x16_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, int stride);
 
+PRED16x16(dc, 10, mmxext)
+PRED16x16(dc, 10, sse2)
+PRED16x16(top_dc, 10, mmxext)
+PRED16x16(top_dc, 10, sse2)
+PRED16x16(128_dc, 10, mmxext)
+PRED16x16(128_dc, 10, sse2)
+PRED16x16(left_dc, 10, mmxext)
+PRED16x16(left_dc, 10, sse2)
 PRED16x16(vertical, 10, mmxext)
 PRED16x16(vertical, 10, sse2)
 PRED16x16(horizontal, 10, mmxext)
@@ -289,6 +300,12 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth
             h->pred8x8[DC_PRED8x8          ] = ff_pred8x8_dc_10_mmxext;
             h->pred8x8[TOP_DC_PRED8x8      ] = ff_pred8x8_top_dc_10_mmxext;
 
+            h->pred8x8l[DC_128_PRED        ] = ff_pred8x8l_128_dc_10_mmxext;
+
+            h->pred16x16[DC_PRED8x8        ] = ff_pred16x16_dc_10_mmxext;
+            h->pred16x16[TOP_DC_PRED8x8    ] = ff_pred16x16_top_dc_10_mmxext;
+            h->pred16x16[DC_128_PRED8x8    ] = ff_pred16x16_128_dc_10_mmxext;
+            h->pred16x16[LEFT_DC_PRED8x8   ] = ff_pred16x16_left_dc_10_mmxext;
             h->pred16x16[VERT_PRED8x8      ] = ff_pred16x16_vertical_10_mmxext;
             h->pred16x16[HOR_PRED8x8       ] = ff_pred16x16_horizontal_10_mmxext;
         }
@@ -301,18 +318,24 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth
 
             h->pred8x8[DC_PRED8x8          ] = ff_pred8x8_dc_10_sse2;
             h->pred8x8[TOP_DC_PRED8x8      ] = ff_pred8x8_top_dc_10_sse2;
+            h->pred8x8[PLANE_PRED8x8       ] = ff_pred8x8_plane_10_sse2;
             h->pred8x8[VERT_PRED8x8        ] = ff_pred8x8_vertical_10_sse2;
             h->pred8x8[HOR_PRED8x8         ] = ff_pred8x8_horizontal_10_sse2;
 
             h->pred8x8l[VERT_PRED           ] = ff_pred8x8l_vertical_10_sse2;
             h->pred8x8l[HOR_PRED            ] = ff_pred8x8l_horizontal_10_sse2;
             h->pred8x8l[DC_PRED             ] = ff_pred8x8l_dc_10_sse2;
+            h->pred8x8l[DC_128_PRED         ] = ff_pred8x8l_128_dc_10_sse2;
             h->pred8x8l[TOP_DC_PRED         ] = ff_pred8x8l_top_dc_10_sse2;
             h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_sse2;
             h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_sse2;
             h->pred8x8l[VERT_RIGHT_PRED     ] = ff_pred8x8l_vertical_right_10_sse2;
             h->pred8x8l[HOR_UP_PRED         ] = ff_pred8x8l_horizontal_up_10_sse2;
 
+            h->pred16x16[DC_PRED8x8        ] = ff_pred16x16_dc_10_sse2;
+            h->pred16x16[TOP_DC_PRED8x8    ] = ff_pred16x16_top_dc_10_sse2;
+            h->pred16x16[DC_128_PRED8x8    ] = ff_pred16x16_128_dc_10_sse2;
+            h->pred16x16[LEFT_DC_PRED8x8   ] = ff_pred16x16_left_dc_10_sse2;
             h->pred16x16[VERT_PRED8x8      ] = ff_pred16x16_vertical_10_sse2;
             h->pred16x16[HOR_PRED8x8       ] = ff_pred16x16_horizontal_10_sse2;
         }