diff options
author | James Darnley <jdarnley@obe.tv> | 2017-06-02 15:20:19 +0200 |
---|---|---|
committer | James Darnley <jdarnley@obe.tv> | 2017-06-28 17:27:35 +0200 |
commit | d7246ea9f229db64ed909d7446196128d6f53de0 (patch) | |
tree | 62bf59089de984c19802bac0daf9b3dfb52230ff /libavcodec/x86/simple_idct10.asm | |
parent | 8b19467d07d5782b4140f61363f24361efb87ff6 (diff) | |
download | ffmpeg-d7246ea9f229db64ed909d7446196128d6f53de0.tar.gz |
avcodec/x86: add an 8-bit simple IDCT function based on the x86-64 high depth functions
Includes add/put functions
Rounding contributed by Ronald S. Bultje
Diffstat (limited to 'libavcodec/x86/simple_idct10.asm')
-rw-r--r-- | libavcodec/x86/simple_idct10.asm | 92 |
1 files changed, 92 insertions, 0 deletions
diff --git a/libavcodec/x86/simple_idct10.asm b/libavcodec/x86/simple_idct10.asm
index b492303a57..069bb61378 100644
--- a/libavcodec/x86/simple_idct10.asm
+++ b/libavcodec/x86/simple_idct10.asm
@@ -31,11 +31,14 @@ SECTION_RODATA
 
 cextern pw_2
 cextern pw_16
+cextern pw_32
 cextern pw_1023
 cextern pw_4095
+pd_round_11: times 4 dd 1<<(11-1)
 pd_round_12: times 4 dd 1<<(12-1)
 pd_round_15: times 4 dd 1<<(15-1)
 pd_round_19: times 4 dd 1<<(19-1)
+pd_round_20: times 4 dd 1<<(20-1)
 
 %macro CONST_DEC 3
 const %1
@@ -77,8 +80,97 @@ CONST_DEC w3_min_w7_lo, W3sh2_lo, -W7sh2
 
 SECTION .text
 
+%macro STORE_HI_LO 12
+    movq %1, %9
+    movq %3, %10
+    movq %5, %11
+    movq %7, %12
+    movhps %2, %9
+    movhps %4, %10
+    movhps %6, %11
+    movhps %8, %12
+%endmacro
+
+%macro LOAD_ZXBW_8 16
+    pmovzxbw %1, %9
+    pmovzxbw %2, %10
+    pmovzxbw %3, %11
+    pmovzxbw %4, %12
+    pmovzxbw %5, %13
+    pmovzxbw %6, %14
+    pmovzxbw %7, %15
+    pmovzxbw %8, %16
+%endmacro
+
+%macro LOAD_ZXBW_4 9
+    movh %1, %5
+    movh %2, %6
+    movh %3, %7
+    movh %4, %8
+    punpcklbw %1, %9
+    punpcklbw %2, %9
+    punpcklbw %3, %9
+    punpcklbw %4, %9
+%endmacro
+
+%define PASS4ROWS(base, stride, stride3) \
+    [base], [base + stride], [base + 2*stride], [base + stride3]
+
 %macro idct_fn 0
 
+define_constants _lo
+
+cglobal simple_idct8, 1, 1, 16, 32, block
+    IDCT_FN "", 11, pw_32, 20, "store"
+RET
+
+cglobal simple_idct8_put, 3, 4, 16, 32, pixels, lsize, block
+    IDCT_FN "", 11, pw_32, 20
+    lea r3, [3*lsizeq]
+    lea r2, [pixelsq + r3]
+    packuswb m8, m0
+    packuswb m1, m2
+    packuswb m4, m11
+    packuswb m9, m10
+    STORE_HI_LO PASS8ROWS(pixelsq, r2, lsizeq, r3), m8, m1, m4, m9
+RET
+
+cglobal simple_idct8_add, 3, 4, 16, 32, pixels, lsize, block
+    IDCT_FN "", 11, pw_32, 20
+    lea r2, [3*lsizeq]
+    %if cpuflag(sse4)
+        lea r3, [pixelsq + r2]
+        LOAD_ZXBW_8 m3, m5, m6, m7, m12, m13, m14, m15, PASS8ROWS(pixelsq, r3, lsizeq, r2)
+        paddsw m8, m3
+        paddsw m0, m5
+        paddsw m1, m6
+        paddsw m2, m7
+        paddsw m4, m12
+        paddsw m11, m13
+        paddsw m9, m14
+        paddsw m10, m15
+    %else
+        pxor m12, m12
+        LOAD_ZXBW_4 m3, m5, m6, m7, PASS4ROWS(pixelsq, lsizeq, r2), m12
+        paddsw m8, m3
+        paddsw m0, m5
+        paddsw m1, m6
+        paddsw m2, m7
+        lea r3, [pixelsq + 4*lsizeq]
+        LOAD_ZXBW_4 m3, m5, m6, m7, PASS4ROWS(r3, lsizeq, r2), m12
+        paddsw m4, m3
+        paddsw m11, m5
+        paddsw m9, m6
+        paddsw m10, m7
+        lea r3, [pixelsq + r2]
+    %endif
+    packuswb m8, m0
+    packuswb m1, m2
+    packuswb m4, m11
+    packuswb m9, m10
+    STORE_HI_LO PASS8ROWS(pixelsq, r3, lsizeq, r2), m8, m1, m4, m9
+RET
+
 define_constants _hi
 
 cglobal simple_idct10, 1, 1, 16, block