diff options
author | Henrik Gramner <henrik@gramner.com> | 2017-09-17 22:52:13 -0300 |
---|---|---|
committer | James Almer <jamrial@gmail.com> | 2017-09-18 23:24:55 -0300 |
commit | 18821e3ba1baa8e0fe037e11c77459ebc73f7e37 (patch) | |
tree | 0245e666aa1daed56c171e7ee3e50f39b6e89af3 | |
parent | 3ffd3b7f5f13080cdba7e8d6b5d9dd7c33ff2345 (diff) | |
download | ffmpeg-18821e3ba1baa8e0fe037e11c77459ebc73f7e37.tar.gz |
x86/exrdsp: optimize ff_reorder_pixels_avx2()
Tested with "checkasm --test=exrdsp -bench"
Before:
reorder_pixels_c: 5187.8
reorder_pixels_sse2: 377.0
reorder_pixels_avx2: 331.3
After:
reorder_pixels_c: 5181.5
reorder_pixels_sse2: 377.0
reorder_pixels_avx2: 313.8
Signed-off-by: James Almer <jamrial@gmail.com>
-rw-r--r-- | libavcodec/x86/exrdsp.asm | 13 |
1 files changed, 6 insertions, 7 deletions
diff --git a/libavcodec/x86/exrdsp.asm b/libavcodec/x86/exrdsp.asm index b91a7be20d..06c629e59e 100644 --- a/libavcodec/x86/exrdsp.asm +++ b/libavcodec/x86/exrdsp.asm @@ -39,16 +39,15 @@ cglobal reorder_pixels, 3,4,3, dst, src1, size, src2 neg sizeq ; size = offset for dst, src1, src2 .loop: -%if cpuflag(avx2) - vpermq m0, [src1q + sizeq], 0xd8; load first part - vpermq m1, [src2q + sizeq], 0xd8; load second part -%else mova m0, [src1q+sizeq] ; load first part movu m1, [src2q+sizeq] ; load second part -%endif SBUTTERFLY bw, 0, 1, 2 ; interleaved - mova [dstq+2*sizeq ], m0 ; copy to dst - mova [dstq+2*sizeq+mmsize], m1 + mova [dstq+2*sizeq ], xm0 ; copy to dst + mova [dstq+2*sizeq+16], xm1 +%if cpuflag(avx2) + vperm2i128 m0, m0, m1, q0301 + mova [dstq+2*sizeq+32], m0 +%endif add sizeq, mmsize jl .loop RET |