author     James Almer <jamrial@gmail.com>    2015-12-27 17:44:48 -0300
committer  James Almer <jamrial@gmail.com>    2015-12-28 17:20:12 -0300
commit     6e243d17e911bf20cb0f0dd3eccf10245dfaf2cb (patch)
tree       a47ae62ad01918d4c1793a4d0e8cc0a6c326830a /libavfilter
parent     4020787b5bbd2ac159c860d1d1ec983837bf31bb (diff)
download   ffmpeg-6e243d17e911bf20cb0f0dd3eccf10245dfaf2cb.tar.gz
x86/vf_stereo3d: optimize register usage
Reviewed-by: Paul B Mahol <onemda@gmail.com>
Signed-off-by: James Almer <jamrial@gmail.com>
Diffstat (limited to 'libavfilter')
-rw-r--r--   libavfilter/x86/vf_stereo3d.asm   164
1 file changed, 86 insertions, 78 deletions
diff --git a/libavfilter/x86/vf_stereo3d.asm b/libavfilter/x86/vf_stereo3d.asm
index 94a0473290..29a8c56202 100644
--- a/libavfilter/x86/vf_stereo3d.asm
+++ b/libavfilter/x86/vf_stereo3d.asm
@@ -37,125 +37,133 @@ ex_b: db 2,-1,-1,-1,5,-1,-1,-1,8,-1,-1,-1,11,-1,-1,-1
 SECTION .text
 INIT_XMM sse4
-cglobal anaglyph, 11, 13, 16, 2*6*mmsize, dst, lsrc, rsrc, dst_linesize, l_linesize, r_linesize, width, height, ana_matrix_r, ana_matrix_g, ana_matrix_b
-    movu m13, [ana_matrix_rq+ 0]
-    movq m15, [ana_matrix_rq+16]
-    pshufd m10, m13, q0000
-    pshufd m11, m13, q1111
-    pshufd m12, m13, q2222
-    pshufd m13, m13, q3333
-    pshufd m14, m15, q0000
-    pshufd m15, m15, q1111
-    mova [rsp+mmsize*0], m10
-    mova [rsp+mmsize*1], m11
-    mova [rsp+mmsize*2], m12
-    mova [rsp+mmsize*3], m13
-    mova [rsp+mmsize*4], m14
-    mova [rsp+mmsize*5], m15
-
-    movu m13, [ana_matrix_gq+ 0]
-    movq m15, [ana_matrix_gq+16]
-    pshufd m10, m13, q0000
-    pshufd m11, m13, q1111
-    pshufd m12, m13, q2222
-    pshufd m13, m13, q3333
-    pshufd m14, m15, q0000
-    pshufd m15, m15, q1111
-    mova [rsp+mmsize*6 ], m10
-    mova [rsp+mmsize*7 ], m11
-    mova [rsp+mmsize*8 ], m12
-    mova [rsp+mmsize*9 ], m13
-    mova [rsp+mmsize*10], m14
-    mova [rsp+mmsize*11], m15
-
-    movu m13, [ana_matrix_bq+ 0]
-    movq m15, [ana_matrix_bq+16]
-    pshufd m10, m13, q0000
-    pshufd m11, m13, q1111
-    pshufd m12, m13, q2222
-    pshufd m13, m13, q3333
-    pshufd m14, m15, q0000
-    pshufd m15, m15, q1111
+cglobal anaglyph, 6, 10, 14, 2*6*mmsize, dst, lsrc, rsrc, dst_linesize, l_linesize, r_linesize, width, height, o, cnt
+%define ana_matrix_rq r6q
+%define ana_matrix_gq r7q
+%define ana_matrix_bq r8q
+    mov ana_matrix_rq, r8m
+    mov ana_matrix_gq, r9m
+    mov ana_matrix_bq, r10m
+    movu m3, [ana_matrix_rq+ 0]
+    movq m5, [ana_matrix_rq+16]
+    pshufd m0, m3, q0000
+    pshufd m1, m3, q1111
+    pshufd m2, m3, q2222
+    pshufd m3, m3, q3333
+    pshufd m4, m5, q0000
+    pshufd m5, m5, q1111
+    mova [rsp+mmsize*0], m0
+    mova [rsp+mmsize*1], m1
+    mova [rsp+mmsize*2], m2
+    mova [rsp+mmsize*3], m3
+    mova [rsp+mmsize*4], m4
+    mova [rsp+mmsize*5], m5
+
+    movu m3, [ana_matrix_gq+ 0]
+    movq m5, [ana_matrix_gq+16]
+    pshufd m0, m3, q0000
+    pshufd m1, m3, q1111
+    pshufd m2, m3, q2222
+    pshufd m3, m3, q3333
+    pshufd m4, m5, q0000
+    pshufd m5, m5, q1111
+    mova [rsp+mmsize*6 ], m0
+    mova [rsp+mmsize*7 ], m1
+    mova [rsp+mmsize*8 ], m2
+    mova [rsp+mmsize*9 ], m3
+    mova [rsp+mmsize*10], m4
+    mova [rsp+mmsize*11], m5
+
+    movu m11, [ana_matrix_bq+ 0]
+    movq m13, [ana_matrix_bq+16]
+    pshufd m8, m11, q0000
+    pshufd m9, m11, q1111
+    pshufd m10, m11, q2222
+    pshufd m11, m11, q3333
+    pshufd m12, m13, q0000
+    pshufd m13, m13, q1111
+    mov widthd, dword widthm
+    mov heightd, dword heightm
+
 .nextrow:
-    mov r11q, widthq
-    mov r12q, 0
-    %define o r12q
+    mov od, widthd
+    xor cntd, cntd
 .loop:
-    movu m0, [lsrcq+o+0]
+    movu m0, [lsrcq+cntq]
     pshufb m1, m0, [ex_r]
     pshufb m2, m0, [ex_g]
     pshufb m3, m0, [ex_b]
-    movu m0, [rsrcq+o+0]
+    movu m0, [rsrcq+cntq]
     pshufb m4, m0, [ex_r]
     pshufb m5, m0, [ex_g]
-    pshufb m6, m0, [ex_b]
+    pshufb m0, [ex_b]
     pmulld m1, [rsp+mmsize*0]
     pmulld m2, [rsp+mmsize*1]
     pmulld m3, [rsp+mmsize*2]
     pmulld m4, [rsp+mmsize*3]
     pmulld m5, [rsp+mmsize*4]
-    pmulld m6, [rsp+mmsize*5]
+    pmulld m0, [rsp+mmsize*5]
     paddd m1, m2
     paddd m3, m4
-    paddd m5, m6
+    paddd m5, m0
     paddd m1, m3
     paddd m1, m5
-    movu m0, [lsrcq+o+0]
+    movu m0, [lsrcq+cntq]
     pshufb m7, m0, [ex_r]
     pshufb m2, m0, [ex_g]
     pshufb m3, m0, [ex_b]
-    movu m0, [rsrcq+o+0]
+    movu m0, [rsrcq+cntq]
     pshufb m4, m0, [ex_r]
     pshufb m5, m0, [ex_g]
-    pshufb m6, m0, [ex_b]
+    pshufb m0, [ex_b]
     pmulld m7, [rsp+mmsize*6]
     pmulld m2, [rsp+mmsize*7]
     pmulld m3, [rsp+mmsize*8]
     pmulld m4, [rsp+mmsize*9]
     pmulld m5, [rsp+mmsize*10]
-    pmulld m6, [rsp+mmsize*11]
+    pmulld m0, [rsp+mmsize*11]
     paddd m7, m2
     paddd m3, m4
-    paddd m5, m6
+    paddd m5, m0
     paddd m7, m3
     paddd m7, m5
-    movu m0, [lsrcq+o+0]
-    pshufb m8, m0, [ex_r]
-    pshufb m2, m0, [ex_g]
-    pshufb m3, m0, [ex_b]
-    movu m0, [rsrcq+o+0]
-    pshufb m4, m0, [ex_r]
-    pshufb m5, m0, [ex_g]
-    pshufb m6, m0, [ex_b]
-    pmulld m8, m10
-    pmulld m2, m11
-    pmulld m3, m12
-    pmulld m4, m13
-    pmulld m5, m14
-    pmulld m6, m15
-    paddd m8, m2
-    paddd m3, m4
-    paddd m5, m6
-    paddd m8, m3
-    paddd m8, m5
+    movu m0, [lsrcq+cntq]
+    pshufb m2, m0, [ex_r]
+    pshufb m3, m0, [ex_g]
+    pshufb m4, m0, [ex_b]
+    movu m0, [rsrcq+cntq]
+    pshufb m5, m0, [ex_r]
+    pshufb m6, m0, [ex_g]
+    pshufb m0, [ex_b]
+    pmulld m2, m8
+    pmulld m3, m9
+    pmulld m4, m10
+    pmulld m5, m11
+    pmulld m6, m12
+    pmulld m0, m13
+    paddd m2, m3
+    paddd m4, m5
+    paddd m6, m0
+    paddd m2, m4
+    paddd m2, m6
     psrld m1, 16
     psrld m7, 16
-    psrld m8, 16
+    psrld m2, 16
     packusdw m1, m7
-    packusdw m8, m8
-    packuswb m1, m8
+    packusdw m2, m2
+    packuswb m1, m2
     pshufb m1, [shuf]
-    movq [dstq+o+0], m1
+    movq [dstq+cntq+0], m1
     psrldq m1, 8
-    movd [dstq+o+8], m1
-    add r12d, 12
-    sub r11d, 4
+    movd [dstq+cntq+8], m1
+    add cntd, 12
+    sub od, 4
     jg .loop
     add dstq, dst_linesizeq
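For readers who don't want to decode the SIMD: the loop above vectorizes the filter's per-channel anaglyph dot product, one 6-coefficient matrix row per output channel applied to the left and right source pixels. Below is a minimal scalar sketch of that math, assuming packed 3-byte RGB pixels (the ex_r/ex_g/ex_b shuffle masks pull every third byte) and 16.16 fixed-point coefficients (the psrld 16 followed by saturating packs). The C function and parameter names here are illustrative only, not taken from the FFmpeg sources.

#include <stdint.h>
#include <stdio.h>

/* Clamp to 0..255, as the saturating packusdw/packuswb sequence does. */
static uint8_t clip_uint8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/*
 * One output channel: a 6-term dot product over the left and right
 * source pixels with 16.16 fixed-point coefficients, then >> 16,
 * mirroring the per-row pmulld/paddd/psrld sequence in the assembly.
 */
static uint8_t ana_channel(const int coef[6],
                           const uint8_t *left, const uint8_t *right)
{
    int sum = coef[0] * left[0]  + coef[1] * left[1]  + coef[2] * left[2] +
              coef[3] * right[0] + coef[4] * right[1] + coef[5] * right[2];
    return clip_uint8(sum >> 16);
}

/* One output pixel (3 bytes) from one left/right pixel pair. */
static void ana_pixel(uint8_t dst[3],
                      const int mat_r[6], const int mat_g[6], const int mat_b[6],
                      const uint8_t *left, const uint8_t *right)
{
    dst[0] = ana_channel(mat_r, left, right);
    dst[1] = ana_channel(mat_g, left, right);
    dst[2] = ana_channel(mat_b, left, right);
}

int main(void)
{
    /* Toy red/cyan matrices: red from the left eye, green/blue from the right. */
    const int mat_r[6] = { 65536, 0, 0, 0, 0, 0 };
    const int mat_g[6] = { 0, 0, 0, 0, 65536, 0 };
    const int mat_b[6] = { 0, 0, 0, 0, 0, 65536 };
    const uint8_t left[3]  = { 200, 40, 40 };
    const uint8_t right[3] = { 40, 180, 90 };
    uint8_t out[3];

    ana_pixel(out, mat_r, mat_g, mat_b, left, right);
    printf("%u %u %u\n", out[0], out[1], out[2]);  /* prints: 200 180 90 */
    return 0;
}

The assembly processes four such pixels (12 bytes) per iteration, which is why the loop counter advances by 12 while the width counter drops by 4.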