diff options
author | James Almer <jamrial@gmail.com> | 2019-02-20 15:42:01 -0300 |
---|---|---|
committer | James Almer <jamrial@gmail.com> | 2019-02-20 15:42:01 -0300 |
commit | e4e04dce1fab81bcdef82e60184d50c73d212c6a (patch) | |
tree | be2ac46125bd9142bd8c86f60dc7616d48fa2709 /libavcodec/aarch64/h264dsp_neon.S | |
parent | 4dc1f06f0c84ebbd8b26cd77679450903244a3e8 (diff) | |
parent | 28a8b5413b64b831dfb8650208bccd8b78360484 (diff) | |
download | ffmpeg-e4e04dce1fab81bcdef82e60184d50c73d212c6a.tar.gz |
Merge commit '28a8b5413b64b831dfb8650208bccd8b78360484'
* commit '28a8b5413b64b831dfb8650208bccd8b78360484':
h264/aarch64: add intra loop filter neon asm
Merged-by: James Almer <jamrial@gmail.com>
Diffstat (limited to 'libavcodec/aarch64/h264dsp_neon.S')
-rw-r--r-- | libavcodec/aarch64/h264dsp_neon.S | 297 |
1 files changed, 297 insertions, 0 deletions
diff --git a/libavcodec/aarch64/h264dsp_neon.S b/libavcodec/aarch64/h264dsp_neon.S
index 3b759d4d4a..80ac09d2be 100644
--- a/libavcodec/aarch64/h264dsp_neon.S
+++ b/libavcodec/aarch64/h264dsp_neon.S
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
+ * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
  *
  * This file is part of FFmpeg.
  *
@@ -181,6 +182,203 @@ function ff_h264_h_loop_filter_luma_neon, export=1
         ret
 endfunc
 
+// Shared intra prologue: returns from the enclosing function when alpha (w2) and beta (w3) are both 0.
+.macro h264_loop_filter_start_intra
+        orr             w4, w2, w3                      // w4 = alpha | beta
+        cbnz            w4, 1f                          // a non-zero threshold: keep going
+        ret                                             // both zero: leave the enclosing function
+1:
+        sxtw            x1, w1                          // sign-extend w1 so x1 can serve as the 64-bit row step
+        dup             v30.16b, w2                     // alpha
+        dup             v31.16b, w3                     // beta
+.endm
+// In: v4..v7 = p3..p0, v0..v3 = q0..q3, v30/v31 = alpha/beta. Updates v5..v7, v0..v2; clobbers x4, v16..v31; exits via 9f.
+.macro h264_loop_filter_luma_intra
+        uabd            v16.16b, v7.16b, v0.16b         // abs(p0 - q0)
+        uabd            v17.16b, v6.16b, v7.16b         // abs(p1 - p0)
+        uabd            v18.16b, v1.16b, v0.16b         // abs(q1 - q0)
+        cmhi            v19.16b, v30.16b, v16.16b       // < alpha
+        cmhi            v17.16b, v31.16b, v17.16b       // < beta
+        cmhi            v18.16b, v31.16b, v18.16b       // < beta
+        // if_2: abs(p0 - q0) < (alpha >> 2) + 2, left in v16
+        movi            v29.16b, #2
+        ushr            v30.16b, v30.16b, #2            // alpha >> 2
+        add             v30.16b, v30.16b, v29.16b       // (alpha >> 2) + 2
+        cmhi            v16.16b, v30.16b, v16.16b       // < (alpha >> 2) + 2
+        // if_1 (v19) = all three tests above; bail out to 9f when no lane passes
+        and             v19.16b, v19.16b, v17.16b
+        and             v19.16b, v19.16b, v18.16b
+        shrn            v20.8b, v19.8h, #4              // squeeze the 128-bit mask into 64 bits
+        mov             x4, v20.d[0]                    // so one general register can test all 16 lanes
+        cbz             x4, 9f                          // label 9 is supplied by the function using this macro
+        // 16-bit sums: v20:v21 = 2*p1 + p0 + q1, v22:v23 = 2*q1 + q0 + p1
+        ushll           v20.8h, v6.8b, #1
+        ushll           v22.8h, v1.8b, #1
+        ushll2          v21.8h, v6.16b, #1
+        ushll2          v23.8h, v1.16b, #1
+        uaddw           v20.8h, v20.8h, v7.8b
+        uaddw           v22.8h, v22.8h, v0.8b
+        uaddw2          v21.8h, v21.8h, v7.16b
+        uaddw2          v23.8h, v23.8h, v0.16b
+        uaddw           v20.8h, v20.8h, v1.8b
+        uaddw           v22.8h, v22.8h, v6.8b
+        uaddw2          v21.8h, v21.8h, v1.16b
+        uaddw2          v23.8h, v23.8h, v6.16b
+
+        rshrn           v24.8b, v20.8h, #2              // p0'_1
+        rshrn           v25.8b, v22.8h, #2              // q0'_1
+        rshrn2          v24.16b, v21.8h, #2             // p0'_1
+        rshrn2          v25.16b, v23.8h, #2             // q0'_1
+
+        uabd            v17.16b, v5.16b, v7.16b         // abs(p2 - p0)
+        uabd            v18.16b, v2.16b, v0.16b         // abs(q2 - q0)
+        cmhi            v17.16b, v31.16b, v17.16b       // < beta
+        cmhi            v18.16b, v31.16b, v18.16b       // < beta
+
+        and             v17.16b, v16.16b, v17.16b       // if_2 && if_3
+        and             v18.16b, v16.16b, v18.16b       // if_2 && if_4
+        // alpha/beta are dead from here: v30/v31 are reused for the weak-filter lane masks
+        not             v30.16b, v17.16b
+        not             v31.16b, v18.16b
+
+        and             v30.16b, v30.16b, v19.16b       // if_1 && !(if_2 && if_3)
+        and             v31.16b, v31.16b, v19.16b       // if_1 && !(if_2 && if_4)
+
+        and             v17.16b, v19.16b, v17.16b       // if_1 && if_2 && if_3
+        and             v18.16b, v19.16b, v18.16b       // if_1 && if_2 && if_4
+
+        //calc p, v7, v6, v5, v4, v17, v7, v6, v5, v4
+        uaddl           v26.8h, v5.8b, v7.8b            // p2 + p0
+        uaddl2          v27.8h, v5.16b, v7.16b
+        uaddw           v26.8h, v26.8h, v0.8b           // p2 + p0 + q0
+        uaddw2          v27.8h, v27.8h, v0.16b
+        add             v20.8h, v20.8h, v26.8h
+        add             v21.8h, v21.8h, v27.8h
+        uaddw           v20.8h, v20.8h, v0.8b           // p2 + 2*p1 + 2*p0 + 2*q0 + q1
+        uaddw2          v21.8h, v21.8h, v0.16b
+        rshrn           v20.8b, v20.8h, #3              // p0'_2
+        rshrn2          v20.16b, v21.8h, #3             // p0'_2
+        uaddw           v26.8h, v26.8h, v6.8b           // p2 + p1 + p0 + q0
+        uaddw2          v27.8h, v27.8h, v6.16b
+        rshrn           v21.8b, v26.8h, #2              // p1'_2
+        rshrn2          v21.16b, v27.8h, #2             // p1'_2
+        uaddl           v28.8h, v4.8b, v5.8b            // p3 + p2
+        uaddl2          v29.8h, v4.16b, v5.16b
+        shl             v28.8h, v28.8h, #1              // 2*(p3 + p2)
+        shl             v29.8h, v29.8h, #1
+        add             v28.8h, v28.8h, v26.8h          // 2*p3 + 3*p2 + p1 + p0 + q0
+        add             v29.8h, v29.8h, v27.8h
+        rshrn           v19.8b, v28.8h, #3              // p2'_2
+        rshrn2          v19.16b, v29.8h, #3             // p2'_2
+
+        //calc q, v0, v1, v2, v3, v18, v0, v1, v2, v3
+        uaddl           v26.8h, v2.8b, v0.8b            // q2 + q0
+        uaddl2          v27.8h, v2.16b, v0.16b
+        uaddw           v26.8h, v26.8h, v7.8b           // q2 + q0 + p0
+        uaddw2          v27.8h, v27.8h, v7.16b
+        add             v22.8h, v22.8h, v26.8h
+        add             v23.8h, v23.8h, v27.8h
+        uaddw           v22.8h, v22.8h, v7.8b           // q2 + 2*q1 + 2*q0 + 2*p0 + p1
+        uaddw2          v23.8h, v23.8h, v7.16b
+        rshrn           v22.8b, v22.8h, #3              // q0'_2
+        rshrn2          v22.16b, v23.8h, #3             // q0'_2
+        uaddw           v26.8h, v26.8h, v1.8b           // q2 + q1 + q0 + p0
+        uaddw2          v27.8h, v27.8h, v1.16b
+        rshrn           v23.8b, v26.8h, #2              // q1'_2
+        rshrn2          v23.16b, v27.8h, #2             // q1'_2
+        uaddl           v28.8h, v2.8b, v3.8b            // q2 + q3
+        uaddl2          v29.8h, v2.16b, v3.16b
+        shl             v28.8h, v28.8h, #1              // 2*(q2 + q3)
+        shl             v29.8h, v29.8h, #1
+        add             v28.8h, v28.8h, v26.8h          // 2*q3 + 3*q2 + q1 + q0 + p0
+        add             v29.8h, v29.8h, v27.8h
+        rshrn           v26.8b, v28.8h, #3              // q2'_2
+        rshrn2          v26.16b, v29.8h, #3             // q2'_2
+        // bit only replaces lanes whose mask is set, so unfiltered pixels keep their value
+        bit             v7.16b, v24.16b, v30.16b        // p0'_1
+        bit             v0.16b, v25.16b, v31.16b        // q0'_1
+        bit             v7.16b, v20.16b, v17.16b        // p0'_2
+        bit             v6.16b, v21.16b, v17.16b        // p1'_2
+        bit             v5.16b, v19.16b, v17.16b        // p2'_2
+        bit             v0.16b, v22.16b, v18.16b        // q0'_2
+        bit             v1.16b, v23.16b, v18.16b        // q1'_2
+        bit             v2.16b, v26.16b, v18.16b        // q2'_2
+.endm
+// Filters across the row boundary at x0: 16 bytes per row, rows q0..q3 from x0 on, p3..p0 before it; rewrites p2..q2.
+function ff_h264_v_loop_filter_luma_intra_neon, export=1
+        h264_loop_filter_start_intra
+
+        ld1             {v0.16b}, [x0], x1              // q0
+        ld1             {v1.16b}, [x0], x1              // q1
+        ld1             {v2.16b}, [x0], x1              // q2
+        ld1             {v3.16b}, [x0], x1              // q3
+        sub             x0, x0, x1, lsl #3              // back 8 rows, to the p3 row
+        ld1             {v4.16b}, [x0], x1              // p3
+        ld1             {v5.16b}, [x0], x1              // p2
+        ld1             {v6.16b}, [x0], x1              // p1
+        ld1             {v7.16b}, [x0]                  // p0
+
+        h264_loop_filter_luma_intra
+
+        sub             x0, x0, x1, lsl #1              // x0 was left on the p0 row: back 2 rows to p2
+        st1             {v5.16b}, [x0], x1              // p2
+        st1             {v6.16b}, [x0], x1              // p1
+        st1             {v7.16b}, [x0], x1              // p0
+        st1             {v0.16b}, [x0], x1              // q0
+        st1             {v1.16b}, [x0], x1              // q1
+        st1             {v2.16b}, [x0]                  // q2
+9:                                                      // early-out target of the 9f branch in the filter macro
+        ret
+endfunc
+// Same filter across a column boundary: 16 rows of 8 bytes starting at x0 - 4 are loaded, filtered and written back.
+function ff_h264_h_loop_filter_luma_intra_neon, export=1
+        h264_loop_filter_start_intra
+
+        sub             x0, x0, #4                      // start 4 bytes before x0
+        ld1             {v4.8b}, [x0], x1               // rows 0..7 fill the low halves
+        ld1             {v5.8b}, [x0], x1
+        ld1             {v6.8b}, [x0], x1
+        ld1             {v7.8b}, [x0], x1
+        ld1             {v0.8b}, [x0], x1
+        ld1             {v1.8b}, [x0], x1
+        ld1             {v2.8b}, [x0], x1
+        ld1             {v3.8b}, [x0], x1
+        ld1             {v4.d}[1], [x0], x1             // rows 8..15 fill the high halves
+        ld1             {v5.d}[1], [x0], x1
+        ld1             {v6.d}[1], [x0], x1
+        ld1             {v7.d}[1], [x0], x1
+        ld1             {v0.d}[1], [x0], x1
+        ld1             {v1.d}[1], [x0], x1
+        ld1             {v2.d}[1], [x0], x1
+        ld1             {v3.d}[1], [x0], x1
+        // transpose_8x16B is defined outside this patch; presumably leaves v4..v7 = p3..p0, v0..v3 = q0..q3 - confirm
+        transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23
+
+        h264_loop_filter_luma_intra
+
+        transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23
+
+        sub             x0, x0, x1, lsl #4              // rewind the 16 rows stepped over by the loads
+        st1             {v4.8b}, [x0], x1
+        st1             {v5.8b}, [x0], x1
+        st1             {v6.8b}, [x0], x1
+        st1             {v7.8b}, [x0], x1
+        st1             {v0.8b}, [x0], x1
+        st1             {v1.8b}, [x0], x1
+        st1             {v2.8b}, [x0], x1
+        st1             {v3.8b}, [x0], x1
+        st1             {v4.d}[1], [x0], x1
+        st1             {v5.d}[1], [x0], x1
+        st1             {v6.d}[1], [x0], x1
+        st1             {v7.d}[1], [x0], x1
+        st1             {v0.d}[1], [x0], x1
+        st1             {v1.d}[1], [x0], x1
+        st1             {v2.d}[1], [x0], x1
+        st1             {v3.d}[1], [x0], x1
+9:                                                      // early-out target of the 9f branch in the filter macro
+        ret
+endfunc
+
 .macro h264_loop_filter_chroma
         dup             v22.8B, w2                      // alpha
         dup             v23.8B, w3                      // beta
@@ -266,6 +464,105 @@ function ff_h264_h_loop_filter_chroma_neon, export=1
         ret
 endfunc
 
+// In: v18 = p1, v16 = p0, v17 = q0, v19 = q1, v30/v31 = alpha/beta. Updates v16, v17; clobbers x2; exits via 9f.
+.macro h264_loop_filter_chroma_intra
+        uabd            v26.8b, v16.8b, v17.8b          // abs(p0 - q0)
+        uabd            v27.8b, v18.8b, v16.8b          // abs(p1 - p0)
+        uabd            v28.8b, v19.8b, v17.8b          // abs(q1 - q0)
+        cmhi            v26.8b, v30.8b, v26.8b          // < alpha
+        cmhi            v27.8b, v31.8b, v27.8b          // < beta
+        cmhi            v28.8b, v31.8b, v28.8b          // < beta
+        and             v26.8b, v26.8b, v27.8b
+        and             v26.8b, v26.8b, v28.8b          // v26 = lanes passing all three tests
+        mov             x2, v26.d[0]                    // x2 = the 64-bit lane mask
+        // 2*p1 and 2*q1 are started before the branch; they go unused when it is taken
+        ushll           v4.8h, v18.8b, #1
+        ushll           v6.8h, v19.8b, #1
+        cbz             x2, 9f                          // label 9 is supplied by the function using this macro
+        uaddl           v20.8h, v16.8b, v19.8b          // p0 + q1
+        uaddl           v22.8h, v17.8b, v18.8b          // q0 + p1
+        add             v20.8h, v20.8h, v4.8h           // 2*p1 + p0 + q1
+        add             v22.8h, v22.8h, v6.8h           // 2*q1 + q0 + p1
+        uqrshrn         v24.8b, v20.8h, #2              // rounded >> 2: new p0
+        uqrshrn         v25.8b, v22.8h, #2              // rounded >> 2: new q0
+        bit             v16.8b, v24.8b, v26.8b          // replace only the lanes set in the mask
+        bit             v17.8b, v25.8b, v26.8b
+.endm
+// Filters across the row boundary at x0: 8 bytes per row, rows x0 - 2*x1 .. x0 + x1 are p1, p0, q0, q1; rewrites p0, q0.
+function ff_h264_v_loop_filter_chroma_intra_neon, export=1
+        h264_loop_filter_start_intra
+
+        sub             x0, x0, x1, lsl #1              // two rows back
+        ld1             {v18.8b}, [x0], x1              // p1, as named by the filter macro
+        ld1             {v16.8b}, [x0], x1              // p0
+        ld1             {v17.8b}, [x0], x1              // q0
+        ld1             {v19.8b}, [x0]                  // q1
+
+        h264_loop_filter_chroma_intra
+
+        sub             x0, x0, x1, lsl #1              // x0 was left on the q1 row: back 2 rows to p0
+        st1             {v16.8b}, [x0], x1
+        st1             {v17.8b}, [x0], x1
+
+9:                                                      // early-out target of the 9f branch in the filter macro
+        ret
+endfunc
+// Column-boundary variant over 4 rows: x4 reads each row from x0 - 2, x0 - 1 is where the two filtered bytes go back.
+function ff_h264_h_loop_filter_chroma_mbaff_intra_neon, export=1
+        h264_loop_filter_start_intra
+
+        sub             x4, x0, #2                      // load pointer
+        sub             x0, x0, #1                      // store pointer
+        ld1             {v18.8b}, [x4], x1
+        ld1             {v16.8b}, [x4], x1
+        ld1             {v17.8b}, [x4], x1
+        ld1             {v19.8b}, [x4], x1
+        // transpose_4x8B is defined outside this patch; presumably leaves v18/v16/v17/v19 = p1/p0/q0/q1 - confirm
+        transpose_4x8B  v18, v16, v17, v19, v26, v27, v28, v29
+
+        h264_loop_filter_chroma_intra
+
+        st2             {v16.b,v17.b}[0], [x0], x1      // one (v16, v17) byte pair per row
+        st2             {v16.b,v17.b}[1], [x0], x1
+        st2             {v16.b,v17.b}[2], [x0], x1
+        st2             {v16.b,v17.b}[3], [x0], x1
+
+9:                                                      // early-out target of the 9f branch in the filter macro
+        ret
+endfunc
+// Column-boundary variant over 8 rows: rows 4..7 are loaded into the upper 32-bit lane of the same four registers.
+function ff_h264_h_loop_filter_chroma_intra_neon, export=1
+        h264_loop_filter_start_intra
+
+        sub             x4, x0, #2                      // load pointer
+        sub             x0, x0, #1                      // store pointer
+        ld1             {v18.8b}, [x4], x1              // rows 0..3: 8 bytes loaded, upper 4 overwritten below
+        ld1             {v16.8b}, [x4], x1
+        ld1             {v17.8b}, [x4], x1
+        ld1             {v19.8b}, [x4], x1
+        ld1             {v18.s}[1], [x4], x1            // rows 4..7: 4 bytes each into lane s[1]
+        ld1             {v16.s}[1], [x4], x1
+        ld1             {v17.s}[1], [x4], x1
+        ld1             {v19.s}[1], [x4]
+
+        transpose_4x8B  v18, v16, v17, v19, v26, v27, v28, v29
+
+        h264_loop_filter_chroma_intra
+
+        st2             {v16.b,v17.b}[0], [x0], x1      // one (v16, v17) byte pair per row
+        st2             {v16.b,v17.b}[1], [x0], x1
+        st2             {v16.b,v17.b}[2], [x0], x1
+        st2             {v16.b,v17.b}[3], [x0], x1
+        st2             {v16.b,v17.b}[4], [x0], x1
+        st2             {v16.b,v17.b}[5], [x0], x1
+        st2             {v16.b,v17.b}[6], [x0], x1
+        st2             {v16.b,v17.b}[7], [x0], x1
+
+9:                                                      // early-out target of the 9f branch in the filter macro
+        ret
+endfunc
+
+
 .macro biweight_16 macs, macd
         dup             v0.16B, w5
         dup             v1.16B, w6
|