diff options
author | Clément Bœsch <u@pkh.me> | 2014-08-14 22:30:55 +0200 |
---|---|---|
committer | Clément Bœsch <u@pkh.me> | 2014-08-23 10:18:53 +0200 |
commit | 45c7f3997ea11c3d1007b2126b1c0049a8c27105 (patch) | |
tree | 4b197592039c1b5cfd458db0e12b604f94e0dca4 /libavutil/x86 | |
parent | c82a288f8747a92278ba2e1a8c30380c18254bbd (diff) | |
download | ffmpeg-45c7f3997ea11c3d1007b2126b1c0049a8c27105.tar.gz |
avutil/pixelutils: faster pixelutils_sad_[au]_16x16
~560 → ~500 decicycles
This follows the comments from Michael in
https://ffmpeg.org/pipermail/ffmpeg-devel/2014-August/160599.html
Using 2 registers for the accumulator didn't help. On the other hand,
some re-ordering between the movs and psadbw allowed going from ~538 to ~500 decicycles.
Diffstat (limited to 'libavutil/x86')
-rw-r--r-- | libavutil/x86/pixelutils.asm | 14 |
1 files changed, 9 insertions, 5 deletions
diff --git a/libavutil/x86/pixelutils.asm b/libavutil/x86/pixelutils.asm
index 8ab0a18355..15213d92d8 100644
--- a/libavutil/x86/pixelutils.asm
+++ b/libavutil/x86/pixelutils.asm
@@ -134,16 +134,20 @@ cglobal pixelutils_sad_16x16, 4,4,5, src1, stride1, src2, stride2
 %macro SAD_XMM_16x16 1
 INIT_XMM sse2
 cglobal pixelutils_sad_%1_16x16, 4,4,3, src1, stride1, src2, stride2
-    pxor        m2, m2
-%rep 8
-    mov%1       m0, [src2q]
+    mov%1       m2, [src2q]
+    psadbw      m2, [src1q]
     mov%1       m1, [src2q + stride2q]
-    psadbw      m0, [src1q]
     psadbw      m1, [src1q + stride1q]
-    paddw       m2, m0
     paddw       m2, m1
+%rep 7
     lea         src1q, [src1q + 2*stride1q]
     lea         src2q, [src2q + 2*stride2q]
+    mov%1       m0, [src2q]
+    psadbw      m0, [src1q]
+    mov%1       m1, [src2q + stride2q]
+    psadbw      m1, [src1q + stride1q]
+    paddw       m2, m0
+    paddw       m2, m1
 %endrep
     movhlps     m0, m2
     paddw       m2, m0