aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/arm
diff options
context:
space:
mode:
authorRonald S. Bultje <rsbultje@gmail.com>2011-10-21 00:00:39 -0700
committerRonald S. Bultje <rsbultje@gmail.com>2011-10-21 01:00:45 -0700
commitc2d337429c7c87ee559efe54dbc0f84f2a25c3a4 (patch)
tree2aae10d3e5c36d3c3c45b9a8970999cc5c1429f6 /libavcodec/arm
parent229d263cc914b5396847f7249fdda2e6ded9ec1b (diff)
downloadffmpeg-c2d337429c7c87ee559efe54dbc0f84f2a25c3a4.tar.gz
H264: change weight/biweight functions to take a height argument.
Neon parts by Mans Rullgard <mans@mansr.com>.
Diffstat (limited to 'libavcodec/arm')
-rw-r--r--libavcodec/arm/h264dsp_init_arm.c77
-rw-r--r--libavcodec/arm/h264dsp_neon.S86
2 files changed, 46 insertions, 117 deletions
diff --git a/libavcodec/arm/h264dsp_init_arm.c b/libavcodec/arm/h264dsp_init_arm.c
index c1ca217add..1c331a495d 100644
--- a/libavcodec/arm/h264dsp_init_arm.c
+++ b/libavcodec/arm/h264dsp_init_arm.c
@@ -32,47 +32,22 @@ void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
int beta, int8_t *tc0);
-void ff_weight_h264_pixels_16x16_neon(uint8_t *ds, int stride, int log2_den,
- int weight, int offset);
-void ff_weight_h264_pixels_16x8_neon(uint8_t *ds, int stride, int log2_den,
- int weight, int offset);
-void ff_weight_h264_pixels_8x16_neon(uint8_t *ds, int stride, int log2_den,
- int weight, int offset);
-void ff_weight_h264_pixels_8x8_neon(uint8_t *ds, int stride, int log2_den,
- int weight, int offset);
-void ff_weight_h264_pixels_8x4_neon(uint8_t *ds, int stride, int log2_den,
- int weight, int offset);
-void ff_weight_h264_pixels_4x8_neon(uint8_t *ds, int stride, int log2_den,
- int weight, int offset);
-void ff_weight_h264_pixels_4x4_neon(uint8_t *ds, int stride, int log2_den,
- int weight, int offset);
-void ff_weight_h264_pixels_4x2_neon(uint8_t *ds, int stride, int log2_den,
- int weight, int offset);
+void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height,
+ int log2_den, int weight, int offset);
+void ff_weight_h264_pixels_8_neon(uint8_t *dst, int stride, int height,
+ int log2_den, int weight, int offset);
+void ff_weight_h264_pixels_4_neon(uint8_t *dst, int stride, int height,
+ int log2_den, int weight, int offset);
-void ff_biweight_h264_pixels_16x16_neon(uint8_t *dst, uint8_t *src, int stride,
- int log2_den, int weightd, int weights,
- int offset);
-void ff_biweight_h264_pixels_16x8_neon(uint8_t *dst, uint8_t *src, int stride,
- int log2_den, int weightd, int weights,
- int offset);
-void ff_biweight_h264_pixels_8x16_neon(uint8_t *dst, uint8_t *src, int stride,
- int log2_den, int weightd, int weights,
- int offset);
-void ff_biweight_h264_pixels_8x8_neon(uint8_t *dst, uint8_t *src, int stride,
- int log2_den, int weightd, int weights,
- int offset);
-void ff_biweight_h264_pixels_8x4_neon(uint8_t *dst, uint8_t *src, int stride,
- int log2_den, int weightd, int weights,
- int offset);
-void ff_biweight_h264_pixels_4x8_neon(uint8_t *dst, uint8_t *src, int stride,
- int log2_den, int weightd, int weights,
- int offset);
-void ff_biweight_h264_pixels_4x4_neon(uint8_t *dst, uint8_t *src, int stride,
- int log2_den, int weightd, int weights,
- int offset);
-void ff_biweight_h264_pixels_4x2_neon(uint8_t *dst, uint8_t *src, int stride,
- int log2_den, int weightd, int weights,
- int offset);
+void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, int stride,
+ int height, int log2_den, int weightd,
+ int weights, int offset);
+void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, int stride,
+ int height, int log2_den, int weightd,
+ int weights, int offset);
+void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, int stride,
+ int height, int log2_den, int weightd,
+ int weights, int offset);
void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride);
void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride);
@@ -100,23 +75,13 @@ static void ff_h264dsp_init_neon(H264DSPContext *c, const int bit_depth, const i
c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
- c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16x16_neon;
- c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_16x8_neon;
- c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_8x16_neon;
- c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels_8x8_neon;
- c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels_8x4_neon;
- c->weight_h264_pixels_tab[5] = ff_weight_h264_pixels_4x8_neon;
- c->weight_h264_pixels_tab[6] = ff_weight_h264_pixels_4x4_neon;
- c->weight_h264_pixels_tab[7] = ff_weight_h264_pixels_4x2_neon;
+ c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon;
+ c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon;
+ c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_4_neon;
- c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16x16_neon;
- c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_16x8_neon;
- c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_8x16_neon;
- c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels_8x8_neon;
- c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels_8x4_neon;
- c->biweight_h264_pixels_tab[5] = ff_biweight_h264_pixels_4x8_neon;
- c->biweight_h264_pixels_tab[6] = ff_biweight_h264_pixels_4x4_neon;
- c->biweight_h264_pixels_tab[7] = ff_biweight_h264_pixels_4x2_neon;
+ c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16_neon;
+ c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_8_neon;
+ c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_4_neon;
c->h264_idct_add = ff_h264_idct_add_neon;
c->h264_idct_dc_add = ff_h264_idct_dc_add_neon;
diff --git a/libavcodec/arm/h264dsp_neon.S b/libavcodec/arm/h264dsp_neon.S
index 0fa4a6b0a5..3d2c6746ae 100644
--- a/libavcodec/arm/h264dsp_neon.S
+++ b/libavcodec/arm/h264dsp_neon.S
@@ -1592,7 +1592,7 @@ endfunc
vdup.8 d1, r5
vmov q2, q8
vmov q3, q8
-1: subs ip, ip, #2
+1: subs r3, r3, #2
vld1.8 {d20-d21},[r0,:128], r2
\macd q2, d0, d20
pld [r0]
@@ -1632,7 +1632,7 @@ endfunc
vdup.8 d1, r5
vmov q1, q8
vmov q10, q8
-1: subs ip, ip, #2
+1: subs r3, r3, #2
vld1.8 {d4},[r0,:64], r2
\macd q1, d0, d4
pld [r0]
@@ -1662,7 +1662,7 @@ endfunc
vdup.8 d1, r5
vmov q1, q8
vmov q10, q8
-1: subs ip, ip, #4
+1: subs r3, r3, #4
vld1.32 {d4[0]},[r0,:32], r2
vld1.32 {d4[1]},[r0,:32], r2
\macd q1, d0, d4
@@ -1700,16 +1700,17 @@ endfunc
.endm
.macro biweight_func w
-function biweight_h264_pixels_\w\()_neon
+function ff_biweight_h264_pixels_\w\()_neon, export=1
push {r4-r6, lr}
- add r4, sp, #16
+ ldr r12, [sp, #16]
+ add r4, sp, #20
ldm r4, {r4-r6}
lsr lr, r4, #31
add r6, r6, #1
eors lr, lr, r5, lsr #30
orr r6, r6, #1
- vdup.16 q9, r3
- lsl r6, r6, r3
+ vdup.16 q9, r12
+ lsl r6, r6, r12
vmvn q9, q9
vdup.16 q8, r6
mov r6, r0
@@ -1730,34 +1731,15 @@ function biweight_h264_pixels_\w\()_neon
endfunc
.endm
- .macro biweight_entry w, h, b=1
-function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
- mov ip, #\h
-.if \b
- b biweight_h264_pixels_\w\()_neon
-.endif
-endfunc
- .endm
-
- biweight_entry 16, 8
- biweight_entry 16, 16, b=0
biweight_func 16
-
- biweight_entry 8, 16
- biweight_entry 8, 4
- biweight_entry 8, 8, b=0
biweight_func 8
-
- biweight_entry 4, 8
- biweight_entry 4, 2
- biweight_entry 4, 4, b=0
biweight_func 4
@ Weighted prediction
.macro weight_16 add
- vdup.8 d0, r3
-1: subs ip, ip, #2
+ vdup.8 d0, r12
+1: subs r2, r2, #2
vld1.8 {d20-d21},[r0,:128], r1
vmull.u8 q2, d0, d20
pld [r0]
@@ -1785,8 +1767,8 @@ endfunc
.endm
.macro weight_8 add
- vdup.8 d0, r3
-1: subs ip, ip, #2
+ vdup.8 d0, r12
+1: subs r2, r2, #2
vld1.8 {d4},[r0,:64], r1
vmull.u8 q1, d0, d4
pld [r0]
@@ -1806,10 +1788,10 @@ endfunc
.endm
.macro weight_4 add
- vdup.8 d0, r3
+ vdup.8 d0, r12
vmov q1, q8
vmov q10, q8
-1: subs ip, ip, #4
+1: subs r2, r2, #4
vld1.32 {d4[0]},[r0,:32], r1
vld1.32 {d4[1]},[r0,:32], r1
vmull.u8 q1, d0, d4
@@ -1842,50 +1824,32 @@ endfunc
.endm
.macro weight_func w
-function weight_h264_pixels_\w\()_neon
+function ff_weight_h264_pixels_\w\()_neon, export=1
push {r4, lr}
- ldr r4, [sp, #8]
- cmp r2, #1
- lsl r4, r4, r2
+ ldr r12, [sp, #8]
+ ldr r4, [sp, #12]
+ cmp r3, #1
+ lsl r4, r4, r3
vdup.16 q8, r4
mov r4, r0
ble 20f
- rsb lr, r2, #1
+ rsb lr, r3, #1
vdup.16 q9, lr
- cmp r3, #0
+ cmp r12, #0
blt 10f
weight_\w vhadd.s16
-10: rsb r3, r3, #0
+10: rsb r12, r12, #0
weight_\w vhsub.s16
-20: rsb lr, r2, #0
+20: rsb lr, r3, #0
vdup.16 q9, lr
- cmp r3, #0
+ cmp r12, #0
blt 10f
weight_\w vadd.s16
-10: rsb r3, r3, #0
+10: rsb r12, r12, #0
weight_\w vsub.s16
endfunc
.endm
- .macro weight_entry w, h, b=1
-function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
- mov ip, #\h
-.if \b
- b weight_h264_pixels_\w\()_neon
-.endif
-endfunc
- .endm
-
- weight_entry 16, 8
- weight_entry 16, 16, b=0
weight_func 16
-
- weight_entry 8, 16
- weight_entry 8, 4
- weight_entry 8, 8, b=0
weight_func 8
-
- weight_entry 4, 8
- weight_entry 4, 2
- weight_entry 4, 4, b=0
weight_func 4