aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/aarch64
diff options
context:
space:
mode:
authorJanne Grunau <janne-libav@jannau.net>2014-01-10 14:07:24 +0100
committerJanne Grunau <janne-libav@jannau.net>2014-01-15 12:31:07 +0100
commitf896bca03fc63b93851c1c14c9321c20b3cd44a6 (patch)
tree082492458722a4699eca8d2b795299a703d3df34 /libavcodec/aarch64
parent36e3b1f2fd262028834a9d7b1eb533c1218ee6c2 (diff)
downloadffmpeg-f896bca03fc63b93851c1c14c9321c20b3cd44a6.tar.gz
aarch64: h264 (bi)weight NEON optimizations
Ported from ARMv7 NEON.
Diffstat (limited to 'libavcodec/aarch64')
-rw-r--r--libavcodec/aarch64/h264dsp_init_aarch64.c25
-rw-r--r--libavcodec/aarch64/h264dsp_neon.S239
2 files changed, 264 insertions, 0 deletions
diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c
index 307b30cd38..b106f11134 100644
--- a/libavcodec/aarch64/h264dsp_init_aarch64.c
+++ b/libavcodec/aarch64/h264dsp_init_aarch64.c
@@ -34,6 +34,23 @@ void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
int beta, int8_t *tc0);
+void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height,
+ int log2_den, int weight, int offset);
+void ff_weight_h264_pixels_8_neon(uint8_t *dst, int stride, int height,
+ int log2_den, int weight, int offset);
+void ff_weight_h264_pixels_4_neon(uint8_t *dst, int stride, int height,
+ int log2_den, int weight, int offset);
+
+void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, int stride,
+ int height, int log2_den, int weightd,
+ int weights, int offset);
+void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, int stride,
+ int height, int log2_den, int weightd,
+ int weights, int offset);
+void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, int stride,
+ int height, int log2_den, int weightd,
+ int weights, int offset);
+
void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, int stride);
void ff_h264_idct_dc_add_neon(uint8_t *dst, int16_t *block, int stride);
void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset,
@@ -63,6 +80,14 @@ av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
+ c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon;
+ c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon;
+ c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_4_neon;
+
+ c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16_neon;
+ c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_8_neon;
+ c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_4_neon;
+
c->h264_idct_add = ff_h264_idct_add_neon;
c->h264_idct_dc_add = ff_h264_idct_dc_add_neon;
c->h264_idct_add16 = ff_h264_idct_add16_neon;
diff --git a/libavcodec/aarch64/h264dsp_neon.S b/libavcodec/aarch64/h264dsp_neon.S
index 777ddefa2a..9b4610a4d4 100644
--- a/libavcodec/aarch64/h264dsp_neon.S
+++ b/libavcodec/aarch64/h264dsp_neon.S
@@ -257,3 +257,242 @@ function ff_h264_h_loop_filter_chroma_neon, export=1
ret
endfunc
+
+.macro biweight_16 macs, macd
+ dup v0.16B, w5
+ dup v1.16B, w6
+ mov v4.16B, v16.16B
+ mov v6.16B, v16.16B
+1: subs w3, w3, #2
+ ld1 {v20.16B}, [x0], x2
+ \macd v4.8H, v0.8B, v20.8B
+ \macd\()2 v6.8H, v0.16B, v20.16B
+ ld1 {v22.16B}, [x1], x2
+ \macs v4.8H, v1.8B, v22.8B
+ \macs\()2 v6.8H, v1.16B, v22.16B
+ mov v24.16B, v16.16B
+ ld1 {v28.16B}, [x0], x2
+ mov v26.16B, v16.16B
+ \macd v24.8H, v0.8B, v28.8B
+ \macd\()2 v26.8H, v0.16B, v28.16B
+ ld1 {v30.16B}, [x1], x2
+ \macs v24.8H, v1.8B, v30.8B
+ \macs\()2 v26.8H, v1.16B, v30.16B
+ sshl v4.8H, v4.8H, v18.8H
+ sshl v6.8H, v6.8H, v18.8H
+ sqxtun v4.8B, v4.8H
+ sqxtun2 v4.16B, v6.8H
+ sshl v24.8H, v24.8H, v18.8H
+ sshl v26.8H, v26.8H, v18.8H
+ sqxtun v24.8B, v24.8H
+ sqxtun2 v24.16B, v26.8H
+ mov v6.16B, v16.16B
+ st1 {v4.16B}, [x7], x2
+ mov v4.16B, v16.16B
+ st1 {v24.16B}, [x7], x2
+ b.ne 1b
+ ret
+.endm
+
+.macro biweight_8 macs, macd
+ dup v0.8B, w5
+ dup v1.8B, w6
+ mov v2.16B, v16.16B
+ mov v20.16B, v16.16B
+1: subs w3, w3, #2
+ ld1 {v4.8B}, [x0], x2
+ \macd v2.8H, v0.8B, v4.8B
+ ld1 {v5.8B}, [x1], x2
+ \macs v2.8H, v1.8B, v5.8B
+ ld1 {v6.8B}, [x0], x2
+ \macd v20.8H, v0.8B, v6.8B
+ ld1 {v7.8B}, [x1], x2
+ \macs v20.8H, v1.8B, v7.8B
+ sshl v2.8H, v2.8H, v18.8H
+ sqxtun v2.8B, v2.8H
+ sshl v20.8H, v20.8H, v18.8H
+ sqxtun v4.8B, v20.8H
+ mov v20.16B, v16.16B
+ st1 {v2.8B}, [x7], x2
+ mov v2.16B, v16.16B
+ st1 {v4.8B}, [x7], x2
+ b.ne 1b
+ ret
+.endm
+
+.macro biweight_4 macs, macd
+ dup v0.8B, w5
+ dup v1.8B, w6
+ mov v2.16B, v16.16B
+ mov v20.16B,v16.16B
+1: subs w3, w3, #4
+ ld1 {v4.S}[0], [x0], x2
+ ld1 {v4.S}[1], [x0], x2
+ \macd v2.8H, v0.8B, v4.8B
+ ld1 {v5.S}[0], [x1], x2
+ ld1 {v5.S}[1], [x1], x2
+ \macs v2.8H, v1.8B, v5.8B
+ b.lt 2f
+ ld1 {v6.S}[0], [x0], x2
+ ld1 {v6.S}[1], [x0], x2
+ \macd v20.8H, v0.8B, v6.8B
+ ld1 {v7.S}[0], [x1], x2
+ ld1 {v7.S}[1], [x1], x2
+ \macs v20.8H, v1.8B, v7.8B
+ sshl v2.8H, v2.8H, v18.8H
+ sqxtun v2.8B, v2.8H
+ sshl v20.8H, v20.8H, v18.8H
+ sqxtun v4.8B, v20.8H
+ mov v20.16B, v16.16B
+ st1 {v2.S}[0], [x7], x2
+ st1 {v2.S}[1], [x7], x2
+ mov v2.16B, v16.16B
+ st1 {v4.S}[0], [x7], x2
+ st1 {v4.S}[1], [x7], x2
+ b.ne 1b
+ ret
+2: sshl v2.8H, v2.8H, v18.8H
+ sqxtun v2.8B, v2.8H
+ st1 {v2.S}[0], [x7], x2
+ st1 {v2.S}[1], [x7], x2
+ ret
+.endm
+
+.macro biweight_func w
+function ff_biweight_h264_pixels_\w\()_neon, export=1
+ sxtw x2, w2
+ lsr w8, w5, #31
+ add w7, w7, #1
+ eor w8, w8, w6, lsr #30
+ orr w7, w7, #1
+ dup v18.8H, w4
+ lsl w7, w7, w4
+ not v18.16B, v18.16B
+ dup v16.8H, w7
+ mov x7, x0
+ cbz w8, 10f
+ subs w8, w8, #1
+ b.eq 20f
+ subs w8, w8, #1
+ b.eq 30f
+ b 40f
+10: biweight_\w umlal, umlal
+20: neg w5, w5
+ biweight_\w umlal, umlsl
+30: neg w5, w5
+ neg w6, w6
+ biweight_\w umlsl, umlsl
+40: neg w6, w6
+ biweight_\w umlsl, umlal
+endfunc
+.endm
+
+ biweight_func 16
+ biweight_func 8
+ biweight_func 4
+
+.macro weight_16 add
+ dup v0.16B, w4
+1: subs w2, w2, #2
+ ld1 {v20.16B}, [x0], x1
+ umull v4.8H, v0.8B, v20.8B
+ umull2 v6.8H, v0.16B, v20.16B
+ ld1 {v28.16B}, [x0], x1
+ umull v24.8H, v0.8B, v28.8B
+ umull2 v26.8H, v0.16B, v28.16B
+ \add v4.8H, v16.8H, v4.8H
+ srshl v4.8H, v4.8H, v18.8H
+ \add v6.8H, v16.8H, v6.8H
+ srshl v6.8H, v6.8H, v18.8H
+ sqxtun v4.8B, v4.8H
+ sqxtun2 v4.16B, v6.8H
+ \add v24.8H, v16.8H, v24.8H
+ srshl v24.8H, v24.8H, v18.8H
+ \add v26.8H, v16.8H, v26.8H
+ srshl v26.8H, v26.8H, v18.8H
+ sqxtun v24.8B, v24.8H
+ sqxtun2 v24.16B, v26.8H
+ st1 {v4.16B}, [x5], x1
+ st1 {v24.16B}, [x5], x1
+ b.ne 1b
+ ret
+.endm
+
+.macro weight_8 add
+ dup v0.8B, w4
+1: subs w2, w2, #2
+ ld1 {v4.8B}, [x0], x1
+ umull v2.8H, v0.8B, v4.8B
+ ld1 {v6.8B}, [x0], x1
+ umull v20.8H, v0.8B, v6.8B
+ \add v2.8H, v16.8H, v2.8H
+ srshl v2.8H, v2.8H, v18.8H
+ sqxtun v2.8B, v2.8H
+ \add v20.8H, v16.8H, v20.8H
+ srshl v20.8H, v20.8H, v18.8H
+ sqxtun v4.8B, v20.8H
+ st1 {v2.8B}, [x5], x1
+ st1 {v4.8B}, [x5], x1
+ b.ne 1b
+ ret
+.endm
+
+.macro weight_4 add
+ dup v0.8B, w4
+1: subs w2, w2, #4
+ ld1 {v4.S}[0], [x0], x1
+ ld1 {v4.S}[1], [x0], x1
+ umull v2.8H, v0.8B, v4.8B
+ b.lt 2f
+ ld1 {v6.S}[0], [x0], x1
+ ld1 {v6.S}[1], [x0], x1
+ umull v20.8H, v0.8B, v6.8B
+ \add v2.8H, v16.8H, v2.8H
+ srshl v2.8H, v2.8H, v18.8H
+ sqxtun v2.8B, v2.8H
+ \add v20.8H, v16.8H, v20.8H
+ srshl v20.8H, v20.8h, v18.8H
+ sqxtun v4.8B, v20.8H
+ st1 {v2.S}[0], [x5], x1
+ st1 {v2.S}[1], [x5], x1
+ st1 {v4.S}[0], [x5], x1
+ st1 {v4.S}[1], [x5], x1
+ b.ne 1b
+ ret
+2: \add v2.8H, v16.8H, v2.8H
+ srshl v2.8H, v2.8H, v18.8H
+ sqxtun v2.8B, v2.8H
+ st1 {v2.S}[0], [x5], x1
+ st1 {v2.S}[1], [x5], x1
+ ret
+.endm
+
+.macro weight_func w
+function ff_weight_h264_pixels_\w\()_neon, export=1
+ sxtw x1, w1
+ cmp w3, #1
+ mov w6, #1
+ lsl w5, w5, w3
+ dup v16.8H, w5
+ mov x5, x0
+ b.le 20f
+ sub w6, w6, w3
+ dup v18.8H, w6
+ cmp w4, #0
+ b.lt 10f
+ weight_\w shadd
+10: neg w4, w4
+ weight_\w shsub
+20: neg w6, w3
+ dup v18.8H, w6
+ cmp w4, #0
+ b.lt 10f
+ weight_\w add
+10: neg w4, w4
+ weight_\w sub
+endfunc
+.endm
+
+ weight_func 16
+ weight_func 8
+ weight_func 4