diff options
author | Michael Niedermayer <michaelni@gmx.at> | 2011-12-03 02:08:55 +0100 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2011-12-03 03:00:30 +0100 |
commit | e4de71677f3adeac0f74b89ac8df5d417364df2c (patch) | |
tree | 4792dd8d85d24f0f4eaddabb65f6044727907daa /libavcodec/arm | |
parent | 12804348f5babf56a315fa01751eea1ffdddf98a (diff) | |
parent | d268b79e3436107c11ee8bcdf9f3645368bb3fcd (diff) | |
download | ffmpeg-e4de71677f3adeac0f74b89ac8df5d417364df2c.tar.gz |
Merge remote-tracking branch 'qatar/master'
* qatar/master:
aac_latm: reconfigure decoder on audio specific config changes
latmdec: fix audio specific config parsing
Add avcodec_decode_audio4().
avcodec: change number of plane pointers from 4 to 8 at next major bump.
Update developers documentation with coding conventions.
svq1dec: avoid undefined get_bits(0) call
ARM: h264dsp_neon cosmetics
ARM: make some NEON macros reusable
Do not memcpy raw video frames when using null muxer
fate: update asf seektest
vp8: flush buffers on size changes.
doc: improve general documentation for MacOSX
asf: use packet dts as approximation of pts
asf: do not call av_read_frame
rtsp: Initialize the media_type_mask in the rtp guessing demuxer
Cleaned up alacenc.c
Conflicts:
doc/APIchanges
doc/developer.texi
libavcodec/8svx.c
libavcodec/aacdec.c
libavcodec/ac3dec.c
libavcodec/avcodec.h
libavcodec/nellymoserdec.c
libavcodec/tta.c
libavcodec/utils.c
libavcodec/version.h
libavcodec/wmadec.c
libavformat/asfdec.c
tests/ref/seek/lavf_asf
Merged-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec/arm')
-rw-r--r-- | libavcodec/arm/h264dsp_neon.S | 667 | ||||
-rw-r--r-- | libavcodec/arm/neon.S | 59 | ||||
-rw-r--r-- | libavcodec/arm/vp8dsp_neon.S | 26 |
3 files changed, 378 insertions, 374 deletions
diff --git a/libavcodec/arm/h264dsp_neon.S b/libavcodec/arm/h264dsp_neon.S index adc21f9500..9f4da2cb7b 100644 --- a/libavcodec/arm/h264dsp_neon.S +++ b/libavcodec/arm/h264dsp_neon.S @@ -19,55 +19,16 @@ */ #include "asm.S" - - .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7 - vtrn.32 \r0, \r4 - vtrn.32 \r1, \r5 - vtrn.32 \r2, \r6 - vtrn.32 \r3, \r7 - vtrn.16 \r0, \r2 - vtrn.16 \r1, \r3 - vtrn.16 \r4, \r6 - vtrn.16 \r5, \r7 - vtrn.8 \r0, \r1 - vtrn.8 \r2, \r3 - vtrn.8 \r4, \r5 - vtrn.8 \r6, \r7 - .endm - - .macro transpose_4x4 r0 r1 r2 r3 - vtrn.16 \r0, \r2 - vtrn.16 \r1, \r3 - vtrn.8 \r0, \r1 - vtrn.8 \r2, \r3 - .endm - - .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7 - vswp \r0, \r4 - vswp \r1, \r5 - vswp \r2, \r6 - vswp \r3, \r7 - .endm - - .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7 - vtrn.32 \r0, \r2 - vtrn.32 \r1, \r3 - vtrn.32 \r4, \r6 - vtrn.32 \r5, \r7 - vtrn.16 \r0, \r1 - vtrn.16 \r2, \r3 - vtrn.16 \r4, \r5 - vtrn.16 \r6, \r7 - .endm +#include "neon.S" /* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */ - .macro h264_chroma_mc8 type +.macro h264_chroma_mc8 type function ff_\type\()_h264_chroma_mc8_neon, export=1 push {r4-r7, lr} ldrd r4, [sp, #20] -.ifc \type,avg + .ifc \type,avg mov lr, r0 -.endif + .endif pld [r1] pld [r1, r2] @@ -75,7 +36,7 @@ A muls r7, r4, r5 T mul r7, r4, r5 T cmp r7, #0 rsb r6, r7, r5, lsl #3 - rsb ip, r7, r4, lsl #3 + rsb r12, r7, r4, lsl #3 sub r4, r7, r4, lsl #3 sub r4, r4, r5, lsl #3 add r4, r4, #64 @@ -86,10 +47,10 @@ T cmp r7, #0 vdup.8 d0, r4 lsl r4, r2, #1 - vdup.8 d1, ip - vld1.64 {d4, d5}, [r1], r4 + vdup.8 d1, r12 + vld1.8 {d4, d5}, [r1], r4 vdup.8 d2, r6 - vld1.64 {d6, d7}, [r5], r4 + vld1.8 {d6, d7}, [r5], r4 vdup.8 d3, r7 vext.8 d5, d4, d5, #1 @@ -98,7 +59,7 @@ T cmp r7, #0 1: pld [r5] vmull.u8 q8, d4, d0 vmlal.u8 q8, d5, d1 - vld1.64 {d4, d5}, [r1], r4 + vld1.8 {d4, d5}, [r1], r4 vmlal.u8 q8, d6, d2 vext.8 d5, d4, d5, #1 vmlal.u8 q8, d7, d3 @@ -108,57 +69,57 @@ T cmp r7, #0 vmlal.u8 q9, d4, d2 vmlal.u8 q9, d5, d3 vrshrn.u16 d16, q8, #6 - vld1.64 {d6, d7}, [r5], r4 + vld1.8 {d6, d7}, [r5], r4 pld [r1] vrshrn.u16 d17, q9, #6 -.ifc \type,avg - vld1.64 {d20}, [lr,:64], r2 - vld1.64 {d21}, [lr,:64], r2 + .ifc \type,avg + vld1.8 {d20}, [lr,:64], r2 + vld1.8 {d21}, [lr,:64], r2 vrhadd.u8 q8, q8, q10 -.endif + .endif vext.8 d7, d6, d7, #1 - vst1.64 {d16}, [r0,:64], r2 - vst1.64 {d17}, [r0,:64], r2 + vst1.8 {d16}, [r0,:64], r2 + vst1.8 {d17}, [r0,:64], r2 bgt 1b pop {r4-r7, pc} 2: tst r6, r6 - add ip, ip, r6 + add r12, r12, r6 vdup.8 d0, r4 - vdup.8 d1, ip + vdup.8 d1, r12 beq 4f add r5, r1, r2 lsl r4, r2, #1 - vld1.64 {d4}, [r1], r4 - vld1.64 {d6}, [r5], r4 + vld1.8 {d4}, [r1], r4 + vld1.8 {d6}, [r5], r4 3: pld [r5] vmull.u8 q8, d4, d0 vmlal.u8 q8, d6, d1 - vld1.64 {d4}, [r1], r4 + vld1.8 {d4}, [r1], r4 vmull.u8 q9, d6, d0 vmlal.u8 q9, d4, d1 - vld1.64 {d6}, [r5], r4 + vld1.8 {d6}, [r5], r4 vrshrn.u16 d16, q8, #6 vrshrn.u16 d17, q9, #6 -.ifc \type,avg - vld1.64 {d20}, [lr,:64], r2 - vld1.64 {d21}, [lr,:64], r2 + .ifc \type,avg + vld1.8 {d20}, [lr,:64], r2 + vld1.8 {d21}, [lr,:64], r2 vrhadd.u8 q8, q8, q10 -.endif + .endif subs r3, r3, #2 pld [r1] - vst1.64 {d16}, [r0,:64], r2 - vst1.64 {d17}, [r0,:64], r2 + vst1.8 {d16}, [r0,:64], r2 + vst1.8 {d17}, [r0,:64], r2 bgt 3b pop {r4-r7, pc} -4: vld1.64 {d4, d5}, [r1], r2 - vld1.64 {d6, d7}, [r1], r2 +4: vld1.8 {d4, d5}, [r1], r2 + vld1.8 {d6, d7}, [r1], r2 vext.8 d5, d4, d5, #1 vext.8 d7, d6, d7, #1 @@ -166,36 +127,36 @@ T cmp r7, #0 subs r3, r3, #2 vmull.u8 q8, d4, d0 vmlal.u8 q8, d5, d1 - vld1.64 {d4, d5}, [r1], r2 + vld1.8 {d4, d5}, [r1], r2 vmull.u8 q9, d6, d0 vmlal.u8 q9, d7, d1 pld [r1] vext.8 d5, d4, d5, #1 vrshrn.u16 d16, q8, #6 vrshrn.u16 d17, q9, #6 -.ifc \type,avg - vld1.64 {d20}, [lr,:64], r2 - vld1.64 {d21}, [lr,:64], r2 + .ifc \type,avg + vld1.8 {d20}, [lr,:64], r2 + vld1.8 {d21}, [lr,:64], r2 vrhadd.u8 q8, q8, q10 -.endif - vld1.64 {d6, d7}, [r1], r2 + .endif + vld1.8 {d6, d7}, [r1], r2 vext.8 d7, d6, d7, #1 - vst1.64 {d16}, [r0,:64], r2 - vst1.64 {d17}, [r0,:64], r2 + vst1.8 {d16}, [r0,:64], r2 + vst1.8 {d17}, [r0,:64], r2 bgt 5b pop {r4-r7, pc} endfunc - .endm +.endm /* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */ - .macro h264_chroma_mc4 type +.macro h264_chroma_mc4 type function ff_\type\()_h264_chroma_mc4_neon, export=1 push {r4-r7, lr} ldrd r4, [sp, #20] -.ifc \type,avg + .ifc \type,avg mov lr, r0 -.endif + .endif pld [r1] pld [r1, r2] @@ -203,7 +164,7 @@ A muls r7, r4, r5 T mul r7, r4, r5 T cmp r7, #0 rsb r6, r7, r5, lsl #3 - rsb ip, r7, r4, lsl #3 + rsb r12, r7, r4, lsl #3 sub r4, r7, r4, lsl #3 sub r4, r4, r5, lsl #3 add r4, r4, #64 @@ -214,10 +175,10 @@ T cmp r7, #0 vdup.8 d0, r4 lsl r4, r2, #1 - vdup.8 d1, ip - vld1.64 {d4}, [r1], r4 + vdup.8 d1, r12 + vld1.8 {d4}, [r1], r4 vdup.8 d2, r6 - vld1.64 {d6}, [r5], r4 + vld1.8 {d6}, [r5], r4 vdup.8 d3, r7 vext.8 d5, d4, d5, #1 @@ -231,22 +192,22 @@ T cmp r7, #0 1: pld [r5] vmull.u8 q8, d4, d0 vmlal.u8 q8, d6, d2 - vld1.64 {d4}, [r1], r4 + vld1.8 {d4}, [r1], r4 vext.8 d5, d4, d5, #1 vtrn.32 d4, d5 vmull.u8 q9, d6, d0 vmlal.u8 q9, d4, d2 - vld1.64 {d6}, [r5], r4 + vld1.8 {d6}, [r5], r4 vadd.i16 d16, d16, d17 vadd.i16 d17, d18, d19 vrshrn.u16 d16, q8, #6 subs r3, r3, #2 pld [r1] -.ifc \type,avg + .ifc \type,avg vld1.32 {d20[0]}, [lr,:32], r2 vld1.32 {d20[1]}, [lr,:32], r2 vrhadd.u8 d16, d16, d20 -.endif + .endif vext.8 d7, d6, d7, #1 vtrn.32 d6, d7 vst1.32 {d16[0]}, [r0,:32], r2 @@ -256,9 +217,9 @@ T cmp r7, #0 pop {r4-r7, pc} 2: tst r6, r6 - add ip, ip, r6 + add r12, r12, r6 vdup.8 d0, r4 - vdup.8 d1, ip + vdup.8 d1, r12 vtrn.32 d0, d1 beq 4f @@ -277,11 +238,11 @@ T cmp r7, #0 vadd.i16 d16, d16, d17 vadd.i16 d17, d18, d19 vrshrn.u16 d16, q8, #6 -.ifc \type,avg + .ifc \type,avg vld1.32 {d20[0]}, [lr,:32], r2 vld1.32 {d20[1]}, [lr,:32], r2 vrhadd.u8 d16, d16, d20 -.endif + .endif subs r3, r3, #2 pld [r1] vst1.32 {d16[0]}, [r0,:32], r2 @@ -290,8 +251,8 @@ T cmp r7, #0 pop {r4-r7, pc} -4: vld1.64 {d4}, [r1], r2 - vld1.64 {d6}, [r1], r2 +4: vld1.8 {d4}, [r1], r2 + vld1.8 {d6}, [r1], r2 vext.8 d5, d4, d5, #1 vext.8 d7, d6, d7, #1 vtrn.32 d4, d5 @@ -300,19 +261,19 @@ T cmp r7, #0 5: vmull.u8 q8, d4, d0 vmull.u8 q9, d6, d0 subs r3, r3, #2 - vld1.64 {d4}, [r1], r2 + vld1.8 {d4}, [r1], r2 vext.8 d5, d4, d5, #1 vtrn.32 d4, d5 vadd.i16 d16, d16, d17 vadd.i16 d17, d18, d19 pld [r1] vrshrn.u16 d16, q8, #6 -.ifc \type,avg + .ifc \type,avg vld1.32 {d20[0]}, [lr,:32], r2 vld1.32 {d20[1]}, [lr,:32], r2 vrhadd.u8 d16, d16, d20 -.endif - vld1.64 {d6}, [r1], r2 + .endif + vld1.8 {d6}, [r1], r2 vext.8 d7, d6, d7, #1 vtrn.32 d6, d7 pld [r1] @@ -322,9 +283,9 @@ T cmp r7, #0 pop {r4-r7, pc} endfunc - .endm +.endm - .macro h264_chroma_mc2 type +.macro h264_chroma_mc2 type function ff_\type\()_h264_chroma_mc2_neon, export=1 push {r4-r6, lr} ldr r4, [sp, #16] @@ -354,29 +315,29 @@ function ff_\type\()_h264_chroma_mc2_neon, export=1 vtrn.16 q2, q3 vmull.u8 q8, d4, d0 vmlal.u8 q8, d5, d1 -.ifc \type,avg + .ifc \type,avg vld1.16 {d18[0]}, [r0,:16], r2 vld1.16 {d18[1]}, [r0,:16] sub r0, r0, r2 -.endif + .endif vtrn.32 d16, d17 vadd.i16 d16, d16, d17 vrshrn.u16 d16, q8, #6 -.ifc \type,avg + .ifc \type,avg vrhadd.u8 d16, d16, d18 -.endif + .endif vst1.16 {d16[0]}, [r0,:16], r2 vst1.16 {d16[1]}, [r0,:16], r2 subs r3, r3, #2 bgt 1b pop {r4-r6, pc} 2: -.ifc \type,put + .ifc \type,put ldrh_post r5, r1, r2 strh_post r5, r0, r2 ldrh_post r6, r1, r2 strh_post r6, r0, r2 -.else + .else vld1.16 {d16[0]}, [r1], r2 vld1.16 {d16[1]}, [r1], r2 vld1.16 {d18[0]}, [r0,:16], r2 @@ -385,7 +346,7 @@ function ff_\type\()_h264_chroma_mc2_neon, export=1 vrhadd.u8 d16, d16, d18 vst1.16 {d16[0]}, [r0,:16], r2 vst1.16 {d16[1]}, [r0,:16], r2 -.endif + .endif subs r3, r3, #2 bgt 2b pop {r4-r6, pc} @@ -401,22 +362,22 @@ endfunc /* H.264 loop filter */ - .macro h264_loop_filter_start - ldr ip, [sp] +.macro h264_loop_filter_start + ldr r12, [sp] tst r2, r2 - ldr ip, [ip] + ldr r12, [r12] it ne tstne r3, r3 - vmov.32 d24[0], ip - and ip, ip, ip, lsl #16 + vmov.32 d24[0], r12 + and r12, r12, r12, lsl #16 it eq bxeq lr - ands ip, ip, ip, lsl #8 + ands r12, r12, r12, lsl #8 it lt bxlt lr - .endm +.endm - .macro h264_loop_filter_luma +.macro h264_loop_filter_luma vdup.8 q11, r2 @ alpha vmovl.u8 q12, d24 vabd.u8 q6, q8, q0 @ abs(p0 - q0) @@ -482,29 +443,29 @@ endfunc vqmovun.s16 d17, q6 vqmovun.s16 d0, q11 vqmovun.s16 d1, q12 - .endm +.endm function ff_h264_v_loop_filter_luma_neon, export=1 h264_loop_filter_start - vld1.64 {d0, d1}, [r0,:128], r1 - vld1.64 {d2, d3}, [r0,:128], r1 - vld1.64 {d4, d5}, [r0,:128], r1 + vld1.8 {d0, d1}, [r0,:128], r1 + vld1.8 {d2, d3}, [r0,:128], r1 + vld1.8 {d4, d5}, [r0,:128], r1 sub r0, r0, r1, lsl #2 sub r0, r0, r1, lsl #1 - vld1.64 {d20,d21}, [r0,:128], r1 - vld1.64 {d18,d19}, [r0,:128], r1 - vld1.64 {d16,d17}, [r0,:128], r1 + vld1.8 {d20,d21}, [r0,:128], r1 + vld1.8 {d18,d19}, [r0,:128], r1 + vld1.8 {d16,d17}, [r0,:128], r1 vpush {d8-d15} h264_loop_filter_luma sub r0, r0, r1, lsl #1 - vst1.64 {d8, d9}, [r0,:128], r1 - vst1.64 {d16,d17}, [r0,:128], r1 - vst1.64 {d0, d1}, [r0,:128], r1 - vst1.64 {d10,d11}, [r0,:128] + vst1.8 {d8, d9}, [r0,:128], r1 + vst1.8 {d16,d17}, [r0,:128], r1 + vst1.8 {d0, d1}, [r0,:128], r1 + vst1.8 {d10,d11}, [r0,:128] vpop {d8-d15} bx lr @@ -514,22 +475,22 @@ function ff_h264_h_loop_filter_luma_neon, export=1 h264_loop_filter_start sub r0, r0, #4 - vld1.64 {d6}, [r0], r1 - vld1.64 {d20}, [r0], r1 - vld1.64 {d18}, [r0], r1 - vld1.64 {d16}, [r0], r1 - vld1.64 {d0}, [r0], r1 - vld1.64 {d2}, [r0], r1 - vld1.64 {d4}, [r0], r1 - vld1.64 {d26}, [r0], r1 - vld1.64 {d7}, [r0], r1 - vld1.64 {d21}, [r0], r1 - vld1.64 {d19}, [r0], r1 - vld1.64 {d17}, [r0], r1 - vld1.64 {d1}, [r0], r1 - vld1.64 {d3}, [r0], r1 - vld1.64 {d5}, [r0], r1 - vld1.64 {d27}, [r0], r1 + vld1.8 {d6}, [r0], r1 + vld1.8 {d20}, [r0], r1 + vld1.8 {d18}, [r0], r1 + vld1.8 {d16}, [r0], r1 + vld1.8 {d0}, [r0], r1 + vld1.8 {d2}, [r0], r1 + vld1.8 {d4}, [r0], r1 + vld1.8 {d26}, [r0], r1 + vld1.8 {d7}, [r0], r1 + vld1.8 {d21}, [r0], r1 + vld1.8 {d19}, [r0], r1 + vld1.8 {d17}, [r0], r1 + vld1.8 {d1}, [r0], r1 + vld1.8 {d3}, [r0], r1 + vld1.8 {d5}, [r0], r1 + vld1.8 {d27}, [r0], r1 transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13 @@ -562,7 +523,7 @@ function ff_h264_h_loop_filter_luma_neon, export=1 bx lr endfunc - .macro h264_loop_filter_chroma +.macro h264_loop_filter_chroma vdup.8 d22, r2 @ alpha vmovl.u8 q12, d24 vabd.u8 d26, d16, d0 @ abs(p0 - q0) @@ -591,22 +552,22 @@ endfunc vsubw.s8 q11, q11, d4 vqmovun.s16 d16, q14 vqmovun.s16 d0, q11 - .endm +.endm function ff_h264_v_loop_filter_chroma_neon, export=1 h264_loop_filter_start sub r0, r0, r1, lsl #1 - vld1.64 {d18}, [r0,:64], r1 - vld1.64 {d16}, [r0,:64], r1 - vld1.64 {d0}, [r0,:64], r1 - vld1.64 {d2}, [r0,:64] + vld1.8 {d18}, [r0,:64], r1 + vld1.8 {d16}, [r0,:64], r1 + vld1.8 {d0}, [r0,:64], r1 + vld1.8 {d2}, [r0,:64] h264_loop_filter_chroma sub r0, r0, r1, lsl #1 - vst1.64 {d16}, [r0,:64], r1 - vst1.64 {d0}, [r0,:64], r1 + vst1.8 {d16}, [r0,:64], r1 + vst1.8 {d0}, [r0,:64], r1 bx lr endfunc @@ -651,20 +612,20 @@ endfunc /* H.264 qpel MC */ - .macro lowpass_const r +.macro lowpass_const r movw \r, #5 movt \r, #20 vmov.32 d6[0], \r - .endm +.endm - .macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1 -.if \narrow +.macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1 + .if \narrow t0 .req q0 t1 .req q8 -.else + .else t0 .req \d0 t1 .req \d1 -.endif + .endif vext.8 d2, \r0, \r1, #2 vext.8 d3, \r0, \r1, #3 vaddl.u8 q1, d2, d3 @@ -685,20 +646,20 @@ endfunc vaddl.u8 t1, \r2, d31 vmla.i16 t1, q9, d6[1] vmls.i16 t1, q10, d6[0] -.if \narrow + .if \narrow vqrshrun.s16 \d0, t0, #5 vqrshrun.s16 \d1, t1, #5 -.endif + .endif .unreq t0 .unreq t1 - .endm +.endm - .macro lowpass_8_1 r0, r1, d0, narrow=1 -.if \narrow +.macro lowpass_8_1 r0, r1, d0, narrow=1 + .if \narrow t0 .req q0 -.else + .else t0 .req \d0 -.endif + .endif vext.8 d2, \r0, \r1, #2 vext.8 d3, \r0, \r1, #3 vaddl.u8 q1, d2, d3 @@ -709,13 +670,13 @@ endfunc vaddl.u8 t0, \r0, d30 vmla.i16 t0, q1, d6[1] vmls.i16 t0, q2, d6[0] -.if \narrow + .if \narrow vqrshrun.s16 \d0, t0, #5 -.endif + .endif .unreq t0 - .endm +.endm - .macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d +.macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d vext.16 q1, \r0, \r1, #2 vext.16 q0, \r0, \r1, #3 vaddl.s16 q9, d2, d0 @@ -750,59 +711,59 @@ endfunc vrshrn.s32 d19, q1, #10 vqmovun.s16 \d, q9 - .endm +.endm function put_h264_qpel16_h_lowpass_neon_packed mov r4, lr - mov ip, #16 + mov r12, #16 mov r3, #8 bl put_h264_qpel8_h_lowpass_neon sub r1, r1, r2, lsl #4 add r1, r1, #8 - mov ip, #16 + mov r12, #16 mov lr, r4 b put_h264_qpel8_h_lowpass_neon endfunc - .macro h264_qpel_h_lowpass type +.macro h264_qpel_h_lowpass type function \type\()_h264_qpel16_h_lowpass_neon push {lr} - mov ip, #16 + mov r12, #16 bl \type\()_h264_qpel8_h_lowpass_neon sub r0, r0, r3, lsl #4 sub r1, r1, r2, lsl #4 add r0, r0, #8 add r1, r1, #8 - mov ip, #16 + mov r12, #16 pop {lr} endfunc function \type\()_h264_qpel8_h_lowpass_neon -1: vld1.64 {d0, d1}, [r1], r2 - vld1.64 {d16,d17}, [r1], r2 - subs ip, ip, #2 +1: vld1.8 {d0, d1}, [r1], r2 + vld1.8 {d16,d17}, [r1], r2 + subs r12, r12, #2 lowpass_8 d0, d1, d16, d17, d0, d16 -.ifc \type,avg + .ifc \type,avg vld1.8 {d2}, [r0,:64], r3 vrhadd.u8 d0, d0, d2 vld1.8 {d3}, [r0,:64] vrhadd.u8 d16, d16, d3 sub r0, r0, r3 -.endif - vst1.64 {d0}, [r0,:64], r3 - vst1.64 {d16}, [r0,:64], r3 + .endif + vst1.8 {d0}, [r0,:64], r3 + vst1.8 {d16}, [r0,:64], r3 bne 1b bx lr endfunc - .endm +.endm h264_qpel_h_lowpass put h264_qpel_h_lowpass avg - .macro h264_qpel_h_lowpass_l2 type +.macro h264_qpel_h_lowpass_l2 type function \type\()_h264_qpel16_h_lowpass_l2_neon push {lr} - mov ip, #16 + mov r12, #16 bl \type\()_h264_qpel8_h_lowpass_l2_neon sub r0, r0, r2, lsl #4 sub r1, r1, r2, lsl #4 @@ -810,31 +771,31 @@ function \type\()_h264_qpel16_h_lowpass_l2_neon add r0, r0, #8 add r1, r1, #8 add r3, r3, #8 - mov ip, #16 + mov r12, #16 pop {lr} endfunc function \type\()_h264_qpel8_h_lowpass_l2_neon -1: vld1.64 {d0, d1}, [r1], r2 - vld1.64 {d16,d17}, [r1], r2 - vld1.64 {d28}, [r3], r2 - vld1.64 {d29}, [r3], r2 - subs ip, ip, #2 +1: vld1.8 {d0, d1}, [r1], r2 + vld1.8 {d16,d17}, [r1], r2 + vld1.8 {d28}, [r3], r2 + vld1.8 {d29}, [r3], r2 + subs r12, r12, #2 lowpass_8 d0, d1, d16, d17, d0, d1 vrhadd.u8 q0, q0, q14 -.ifc \type,avg + .ifc \type,avg vld1.8 {d2}, [r0,:64], r2 vrhadd.u8 d0, d0, d2 vld1.8 {d3}, [r0,:64] vrhadd.u8 d1, d1, d3 sub r0, r0, r2 -.endif - vst1.64 {d0}, [r0,:64], r2 - vst1.64 {d1}, [r0,:64], r2 + .endif + vst1.8 {d0}, [r0,:64], r2 + vst1.8 {d1}, [r0,:64], r2 bne 1b bx lr endfunc - .endm +.endm h264_qpel_h_lowpass_l2 put h264_qpel_h_lowpass_l2 avg @@ -854,7 +815,7 @@ function put_h264_qpel16_v_lowpass_neon_packed b put_h264_qpel8_v_lowpass_neon endfunc - .macro h264_qpel_v_lowpass type +.macro h264_qpel_v_lowpass type function \type\()_h264_qpel16_v_lowpass_neon mov r4, lr bl \type\()_h264_qpel8_v_lowpass_neon @@ -871,19 +832,19 @@ function \type\()_h264_qpel16_v_lowpass_neon endfunc function \type\()_h264_qpel8_v_lowpass_neon - vld1.64 {d8}, [r1], r3 - vld1.64 {d10}, [r1], r3 - vld1.64 {d12}, [r1], r3 - vld1.64 {d14}, [r1], r3 - vld1.64 {d22}, [r1], r3 - vld1.64 {d24}, [r1], r3 - vld1.64 {d26}, [r1], r3 - vld1.64 {d28}, [r1], r3 - vld1.64 {d9}, [r1], r3 - vld1.64 {d11}, [r1], r3 - vld1.64 {d13}, [r1], r3 - vld1.64 {d15}, [r1], r3 - vld1.64 {d23}, [r1] + vld1.8 {d8}, [r1], r3 + vld1.8 {d10}, [r1], r3 + vld1.8 {d12}, [r1], r3 + vld1.8 {d14}, [r1], r3 + vld1.8 {d22}, [r1], r3 + vld1.8 {d24}, [r1], r3 + vld1.8 {d26}, [r1], r3 + vld1.8 {d28}, [r1], r3 + vld1.8 {d9}, [r1], r3 + vld1.8 {d11}, [r1], r3 + vld1.8 {d13}, [r1], r3 + vld1.8 {d15}, [r1], r3 + vld1.8 {d23}, [r1] transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14 lowpass_8 d8, d9, d10, d11, d8, d10 @@ -892,7 +853,7 @@ function \type\()_h264_qpel8_v_lowpass_neon lowpass_8 d26, d27, d28, d29, d26, d28 transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28 -.ifc \type,avg + .ifc \type,avg vld1.8 {d9}, [r0,:64], r2 vrhadd.u8 d8, d8, d9 vld1.8 {d11}, [r0,:64], r2 @@ -910,34 +871,34 @@ function \type\()_h264_qpel8_v_lowpass_neon vld1.8 {d29}, [r0,:64], r2 vrhadd.u8 d28, d28, d29 sub r0, r0, r2, lsl #3 -.endif + .endif - vst1.64 {d8}, [r0,:64], r2 - vst1.64 {d10}, [r0,:64], r2 - vst1.64 {d12}, [r0,:64], r2 - vst1.64 {d14}, [r0,:64], r2 - vst1.64 {d22}, [r0,:64], r2 - vst1.64 {d24}, [r0,:64], r2 - vst1.64 {d26}, [r0,:64], r2 - vst1.64 {d28}, [r0,:64], r2 + vst1.8 {d8}, [r0,:64], r2 + vst1.8 {d10}, [r0,:64], r2 + vst1.8 {d12}, [r0,:64], r2 + vst1.8 {d14}, [r0,:64], r2 + vst1.8 {d22}, [r0,:64], r2 + vst1.8 {d24}, [r0,:64], r2 + vst1.8 {d26}, [r0,:64], r2 + vst1.8 {d28}, [r0,:64], r2 bx lr endfunc - .endm +.endm h264_qpel_v_lowpass put h264_qpel_v_lowpass avg - .macro h264_qpel_v_lowpass_l2 type +.macro h264_qpel_v_lowpass_l2 type function \type\()_h264_qpel16_v_lowpass_l2_neon mov r4, lr bl \type\()_h264_qpel8_v_lowpass_l2_neon sub r1, r1, r3, lsl #2 bl \type\()_h264_qpel8_v_lowpass_l2_neon sub r0, r0, r3, lsl #4 - sub ip, ip, r2, lsl #4 + sub r12, r12, r2, lsl #4 add r0, r0, #8 - add ip, ip, #8 + add r12, r12, #8 sub r1, r1, r3, lsl #4 sub r1, r1, r3, lsl #2 add r1, r1, #8 @@ -947,19 +908,19 @@ function \type\()_h264_qpel16_v_lowpass_l2_neon endfunc function \type\()_h264_qpel8_v_lowpass_l2_neon - vld1.64 {d8}, [r1], r3 - vld1.64 {d10}, [r1], r3 - vld1.64 {d12}, [r1], r3 - vld1.64 {d14}, [r1], r3 - vld1.64 {d22}, [r1], r3 - vld1.64 {d24}, [r1], r3 - vld1.64 {d26}, [r1], r3 - vld1.64 {d28}, [r1], r3 - vld1.64 {d9}, [r1], r3 - vld1.64 {d11}, [r1], r3 - vld1.64 {d13}, [r1], r3 - vld1.64 {d15}, [r1], r3 - vld1.64 {d23}, [r1] + vld1.8 {d8}, [r1], r3 + vld1.8 {d10}, [r1], r3 + vld1.8 {d12}, [r1], r3 + vld1.8 {d14}, [r1], r3 + vld1.8 {d22}, [r1], r3 + vld1.8 {d24}, [r1], r3 + vld1.8 {d26}, [r1], r3 + vld1.8 {d28}, [r1], r3 + vld1.8 {d9}, [r1], r3 + vld1.8 {d11}, [r1], r3 + vld1.8 {d13}, [r1], r3 + vld1.8 {d15}, [r1], r3 + vld1.8 {d23}, [r1] transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14 lowpass_8 d8, d9, d10, d11, d8, d9 @@ -968,20 +929,20 @@ function \type\()_h264_qpel8_v_lowpass_l2_neon lowpass_8 d26, d27, d28, d29, d26, d27 transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27 - vld1.64 {d0}, [ip], r2 - vld1.64 {d1}, [ip], r2 - vld1.64 {d2}, [ip], r2 - vld1.64 {d3}, [ip], r2 - vld1.64 {d4}, [ip], r2 + vld1.8 {d0}, [r12], r2 + vld1.8 {d1}, [r12], r2 + vld1.8 {d2}, [r12], r2 + vld1.8 {d3}, [r12], r2 + vld1.8 {d4}, [r12], r2 vrhadd.u8 q0, q0, q4 - vld1.64 {d5}, [ip], r2 + vld1.8 {d5}, [r12], r2 vrhadd.u8 q1, q1, q6 - vld1.64 {d10}, [ip], r2 + vld1.8 {d10}, [r12], r2 vrhadd.u8 q2, q2, q11 - vld1.64 {d11}, [ip], r2 + vld1.8 {d11}, [r12], r2 vrhadd.u8 q5, q5, q13 -.ifc \type,avg + .ifc \type,avg vld1.8 {d16}, [r0,:64], r3 vrhadd.u8 d0, d0, d16 vld1.8 {d17}, [r0,:64], r3 @@ -999,51 +960,51 @@ function \type\()_h264_qpel8_v_lowpass_l2_neon vld1.8 {d17}, [r0,:64], r3 vrhadd.u8 d11, d11, d17 sub r0, r0, r3, lsl #3 -.endif + .endif - vst1.64 {d0}, [r0,:64], r3 - vst1.64 {d1}, [r0,:64], r3 - vst1.64 {d2}, [r0,:64], r3 - vst1.64 {d3}, [r0,:64], r3 - vst1.64 {d4}, [r0,:64], r3 - vst1.64 {d5}, [r0,:64], r3 - vst1.64 {d10}, [r0,:64], r3 - vst1.64 {d11}, [r0,:64], r3 + vst1.8 {d0}, [r0,:64], r3 + vst1.8 {d1}, [r0,:64], r3 + vst1.8 {d2}, [r0,:64], r3 + vst1.8 {d3}, [r0,:64], r3 + vst1.8 {d4}, [r0,:64], r3 + vst1.8 {d5}, [r0,:64], r3 + vst1.8 {d10}, [r0,:64], r3 + vst1.8 {d11}, [r0,:64], r3 bx lr endfunc - .endm +.endm h264_qpel_v_lowpass_l2 put h264_qpel_v_lowpass_l2 avg function put_h264_qpel8_hv_lowpass_neon_top - lowpass_const ip - mov ip, #12 -1: vld1.64 {d0, d1}, [r1], r3 - vld1.64 {d16,d17}, [r1], r3 - subs ip, ip, #2 + lowpass_const r12 + mov r12, #12 +1: vld1.8 {d0, d1}, [r1], r3 + vld1.8 {d16,d17}, [r1], r3 + subs r12, r12, #2 lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0 - vst1.64 {d22-d25}, [r4,:128]! + vst1.8 {d22-d25}, [r4,:128]! bne 1b - vld1.64 {d0, d1}, [r1] + vld1.8 {d0, d1}, [r1] lowpass_8_1 d0, d1, q12, narrow=0 - mov ip, #-16 - add r4, r4, ip - vld1.64 {d30,d31}, [r4,:128], ip - vld1.64 {d20,d21}, [r4,:128], ip - vld1.64 {d18,d19}, [r4,:128], ip - vld1.64 {d16,d17}, [r4,:128], ip - vld1.64 {d14,d15}, [r4,:128], ip - vld1.64 {d12,d13}, [r4,:128], ip - vld1.64 {d10,d11}, [r4,:128], ip - vld1.64 {d8, d9}, [r4,:128], ip - vld1.64 {d6, d7}, [r4,:128], ip - vld1.64 {d4, d5}, [r4,:128], ip - vld1.64 {d2, d3}, [r4,:128], ip - vld1.64 {d0, d1}, [r4,:128] + mov r12, #-16 + add r4, r4, r12 + vld1.8 {d30,d31}, [r4,:128], r12 + vld1.8 {d20,d21}, [r4,:128], r12 + vld1.8 {d18,d19}, [r4,:128], r12 + vld1.8 {d16,d17}, [r4,:128], r12 + vld1.8 {d14,d15}, [r4,:128], r12 + vld1.8 {d12,d13}, [r4,:128], r12 + vld1.8 {d10,d11}, [r4,:128], r12 + vld1.8 {d8, d9}, [r4,:128], r12 + vld1.8 {d6, d7}, [r4,:128], r12 + vld1.8 {d4, d5}, [r4,:128], r12 + vld1.8 {d2, d3}, [r4,:128], r12 + vld1.8 {d0, d1}, [r4,:128] swap4 d1, d3, d5, d7, d8, d10, d12, d14 transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7 @@ -1051,31 +1012,31 @@ function put_h264_qpel8_hv_lowpass_neon_top swap4 d17, d19, d21, d31, d24, d26, d28, d22 transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11 - vst1.64 {d30,d31}, [r4,:128]! - vst1.64 {d6, d7}, [r4,:128]! - vst1.64 {d20,d21}, [r4,:128]! - vst1.64 {d4, d5}, [r4,:128]! - vst1.64 {d18,d19}, [r4,:128]! - vst1.64 {d2, d3}, [r4,:128]! - vst1.64 {d16,d17}, [r4,:128]! - vst1.64 {d0, d1}, [r4,:128] + vst1.8 {d30,d31}, [r4,:128]! + vst1.8 {d6, d7}, [r4,:128]! + vst1.8 {d20,d21}, [r4,:128]! + vst1.8 {d4, d5}, [r4,:128]! + vst1.8 {d18,d19}, [r4,:128]! + vst1.8 {d2, d3}, [r4,:128]! + vst1.8 {d16,d17}, [r4,:128]! + vst1.8 {d0, d1}, [r4,:128] lowpass_8.16 q4, q12, d8, d9, d24, d25, d8 lowpass_8.16 q5, q13, d10, d11, d26, d27, d9 lowpass_8.16 q6, q14, d12, d13, d28, d29, d10 lowpass_8.16 q7, q11, d14, d15, d22, d23, d11 - vld1.64 {d16,d17}, [r4,:128], ip - vld1.64 {d30,d31}, [r4,:128], ip + vld1.8 {d16,d17}, [r4,:128], r12 + vld1.8 {d30,d31}, [r4,:128], r12 lowpass_8.16 q8, q15, d16, d17, d30, d31, d12 - vld1.64 {d16,d17}, [r4,:128], ip - vld1.64 {d30,d31}, [r4,:128], ip + vld1.8 {d16,d17}, [r4,:128], r12 + vld1.8 {d30,d31}, [r4,:128], r12 lowpass_8.16 q8, q15, d16, d17, d30, d31, d13 - vld1.64 {d16,d17}, [r4,:128], ip - vld1.64 {d30,d31}, [r4,:128], ip + vld1.8 {d16,d17}, [r4,:128], r12 + vld1.8 {d30,d31}, [r4,:128], r12 lowpass_8.16 q8, q15, d16, d17, d30, d31, d14 - vld1.64 {d16,d17}, [r4,:128], ip - vld1.64 {d30,d31}, [r4,:128] + vld1.8 {d16,d17}, [r4,:128], r12 + vld1.8 {d30,d31}, [r4,:128] lowpass_8.16 q8, q15, d16, d17, d30, d31, d15 transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11 @@ -1083,11 +1044,11 @@ function put_h264_qpel8_hv_lowpass_neon_top bx lr endfunc - .macro h264_qpel8_hv_lowpass type +.macro h264_qpel8_hv_lowpass type function \type\()_h264_qpel8_hv_lowpass_neon mov r10, lr bl put_h264_qpel8_hv_lowpass_neon_top -.ifc \type,avg + .ifc \type,avg vld1.8 {d0}, [r0,:64], r2 vrhadd.u8 d12, d12, d0 vld1.8 {d1}, [r0,:64], r2 @@ -1105,39 +1066,39 @@ function \type\()_h264_qpel8_hv_lowpass_neon vld1.8 {d7}, [r0,:64], r2 vrhadd.u8 d11, d11, d7 sub r0, r0, r2, lsl #3 -.endif + .endif - vst1.64 {d12}, [r0,:64], r2 - vst1.64 {d13}, [r0,:64], r2 - vst1.64 {d14}, [r0,:64], r2 - vst1.64 {d15}, [r0,:64], r2 - vst1.64 {d8}, [r0,:64], r2 - vst1.64 {d9}, [r0,:64], r2 - vst1.64 {d10}, [r0,:64], r2 - vst1.64 {d11}, [r0,:64], r2 + vst1.8 {d12}, [r0,:64], r2 + vst1.8 {d13}, [r0,:64], r2 + vst1.8 {d14}, [r0,:64], r2 + vst1.8 {d15}, [r0,:64], r2 + vst1.8 {d8}, [r0,:64], r2 + vst1.8 {d9}, [r0,:64], r2 + vst1.8 {d10}, [r0,:64], r2 + vst1.8 {d11}, [r0,:64], r2 mov lr, r10 bx lr endfunc - .endm +.endm h264_qpel8_hv_lowpass put h264_qpel8_hv_lowpass avg - .macro h264_qpel8_hv_lowpass_l2 type +.macro h264_qpel8_hv_lowpass_l2 type function \type\()_h264_qpel8_hv_lowpass_l2_neon mov r10, lr bl put_h264_qpel8_hv_lowpass_neon_top - vld1.64 {d0, d1}, [r2,:128]! - vld1.64 {d2, d3}, [r2,:128]! + vld1.8 {d0, d1}, [r2,:128]! + vld1.8 {d2, d3}, [r2,:128]! vrhadd.u8 q0, q0, q6 - vld1.64 {d4, d5}, [r2,:128]! + vld1.8 {d4, d5}, [r2,:128]! vrhadd.u8 q1, q1, q7 - vld1.64 {d6, d7}, [r2,:128]! + vld1.8 {d6, d7}, [r2,:128]! vrhadd.u8 q2, q2, q4 vrhadd.u8 q3, q3, q5 -.ifc \type,avg + .ifc \type,avg vld1.8 {d16}, [r0,:64], r3 vrhadd.u8 d0, d0, d16 vld1.8 {d17}, [r0,:64], r3 @@ -1155,25 +1116,25 @@ function \type\()_h264_qpel8_hv_lowpass_l2_neon vld1.8 {d23}, [r0,:64], r3 vrhadd.u8 d7, d7, d23 sub r0, r0, r3, lsl #3 -.endif - vst1.64 {d0}, [r0,:64], r3 - vst1.64 {d1}, [r0,:64], r3 - vst1.64 {d2}, [r0,:64], r3 - vst1.64 {d3}, [r0,:64], r3 - vst1.64 {d4}, [r0,:64], r3 - vst1.64 {d5}, [r0,:64], r3 - vst1.64 {d6}, [r0,:64], r3 - vst1.64 {d7}, [r0,:64], r3 + .endif + vst1.8 {d0}, [r0,:64], r3 + vst1.8 {d1}, [r0,:64], r3 + vst1.8 {d2}, [r0,:64], r3 + vst1.8 {d3}, [r0,:64], r3 + vst1.8 {d4}, [r0,:64], r3 + vst1.8 {d5}, [r0,:64], r3 + vst1.8 {d6}, [r0,:64], r3 + vst1.8 {d7}, [r0,:64], r3 mov lr, r10 bx lr endfunc - .endm +.endm h264_qpel8_hv_lowpass_l2 put h264_qpel8_hv_lowpass_l2 avg - .macro h264_qpel16_hv type +.macro h264_qpel16_hv type function \type\()_h264_qpel16_hv_lowpass_neon mov r9, lr bl \type\()_h264_qpel8_hv_lowpass_neon @@ -1206,17 +1167,17 @@ function \type\()_h264_qpel16_hv_lowpass_l2_neon mov lr, r9 b \type\()_h264_qpel8_hv_lowpass_l2_neon endfunc - .endm +.endm h264_qpel16_hv put h264_qpel16_hv avg - .macro h264_qpel8 type +.macro h264_qpel8 type function ff_\type\()_h264_qpel8_mc10_neon, export=1 lowpass_const r3 mov r3, r1 sub r1, r1, #2 - mov ip, #8 + mov r12, #8 b \type\()_h264_qpel8_h_lowpass_l2_neon endfunc @@ -1224,7 +1185,7 @@ function ff_\type\()_h264_qpel8_mc20_neon, export=1 lowpass_const r3 sub r1, r1, #2 mov r3, r2 - mov ip, #8 + mov r12, #8 b \type\()_h264_qpel8_h_lowpass_neon endfunc @@ -1232,13 +1193,13 @@ function ff_\type\()_h264_qpel8_mc30_neon, export=1 lowpass_const r3 add r3, r1, #1 sub r1, r1, #2 - mov ip, #8 + mov r12, #8 b \type\()_h264_qpel8_h_lowpass_l2_neon endfunc function ff_\type\()_h264_qpel8_mc01_neon, export=1 push {lr} - mov ip, r1 + mov r12, r1 \type\()_h264_qpel8_mc01: lowpass_const r3 mov r3, r2 @@ -1261,12 +1222,12 @@ T mov sp, r0 mov r0, sp sub r1, r1, #2 mov r3, #8 - mov ip, #8 + mov r12, #8 vpush {d8-d15} bl put_h264_qpel8_h_lowpass_neon ldrd r0, [r11], #8 mov r3, r2 - add ip, sp, #64 + add r12, sp, #64 sub r1, r1, r2, lsl #1 mov r2, #8 bl \type\()_h264_qpel8_v_lowpass_l2_neon @@ -1287,7 +1248,7 @@ T mov sp, r0 sub r1, r1, #2 mov r3, #8 mov r0, sp - mov ip, #8 + mov r12, #8 vpush {d8-d15} bl put_h264_qpel8_h_lowpass_neon mov r4, r0 @@ -1372,7 +1333,7 @@ endfunc function ff_\type\()_h264_qpel8_mc03_neon, export=1 push {lr} - add ip, r1, r2 + add r12, r1, r2 b \type\()_h264_qpel8_mc01 endfunc @@ -1395,12 +1356,12 @@ function ff_\type\()_h264_qpel8_mc33_neon, export=1 sub r1, r1, #1 b \type\()_h264_qpel8_mc11 endfunc - .endm +.endm h264_qpel8 put h264_qpel8 avg - .macro h264_qpel16 type +.macro h264_qpel16 type function ff_\type\()_h264_qpel16_mc10_neon, export=1 lowpass_const r3 mov r3, r1 @@ -1424,7 +1385,7 @@ endfunc function ff_\type\()_h264_qpel16_mc01_neon, export=1 push {r4, lr} - mov ip, r1 + mov r12, r1 \type\()_h264_qpel16_mc01: lowpass_const r3 mov r3, r2 @@ -1451,7 +1412,7 @@ T mov sp, r0 bl put_h264_qpel16_h_lowpass_neon ldrd r0, [r11], #8 mov r3, r2 - add ip, sp, #64 + add r12, sp, #64 sub r1, r1, r2, lsl #1 mov r2, #16 bl \type\()_h264_qpel16_v_lowpass_l2_neon @@ -1554,7 +1515,7 @@ endfunc function ff_\type\()_h264_qpel16_mc03_neon, export=1 push {r4, lr} - add ip, r1, r2 + add r12, r1, r2 b \type\()_h264_qpel16_mc01 endfunc @@ -1577,14 +1538,14 @@ function ff_\type\()_h264_qpel16_mc33_neon, export=1 sub r1, r1, #1 b \type\()_h264_qpel16_mc11 endfunc - .endm +.endm h264_qpel16 put h264_qpel16 avg @ Biweighted prediction - .macro biweight_16 macs, macd +.macro biweight_16 macs, macd vdup.8 d0, r4 vdup.8 d1, r5 vmov q2, q8 @@ -1622,9 +1583,9 @@ endfunc vst1.8 {d24-d25},[r6,:128], r2 bne 1b pop {r4-r6, pc} - .endm +.endm - .macro biweight_8 macs, macd +.macro biweight_8 macs, macd vdup.8 d0, r4 vdup.8 d1, r5 vmov q1, q8 @@ -1652,9 +1613,9 @@ endfunc vst1.8 {d4},[r6,:64], r2 bne 1b pop {r4-r6, pc} - .endm +.endm - .macro biweight_4 macs, macd +.macro biweight_4 macs, macd vdup.8 d0, r4 vdup.8 d1, r5 vmov q1, q8 @@ -1694,9 +1655,9 @@ endfunc vst1.32 {d2[0]},[r6,:32], r2 vst1.32 {d2[1]},[r6,:32], r2 pop {r4-r6, pc} - .endm +.endm - .macro biweight_func w +.macro biweight_func w function ff_biweight_h264_pixels_\w\()_neon, export=1 push {r4-r6, lr} ldr r12, [sp, #16] @@ -1726,7 +1687,7 @@ function ff_biweight_h264_pixels_\w\()_neon, export=1 40: rsb r5, r5, #0 biweight_\w vmlsl.u8, vmlal.u8 endfunc - .endm +.endm biweight_func 16 biweight_func 8 @@ -1734,7 +1695,7 @@ endfunc @ Weighted prediction - .macro weight_16 add +.macro weight_16 add vdup.8 d0, r12 1: subs r2, r2, #2 vld1.8 {d20-d21},[r0,:128], r1 @@ -1761,9 +1722,9 @@ endfunc vst1.8 {d24-d25},[r4,:128], r1 bne 1b pop {r4, pc} - .endm +.endm - .macro weight_8 add +.macro weight_8 add vdup.8 d0, r12 1: subs r2, r2, #2 vld1.8 {d4},[r0,:64], r1 @@ -1782,9 +1743,9 @@ endfunc vst1.8 {d4},[r4,:64], r1 bne 1b pop {r4, pc} - .endm +.endm - .macro weight_4 add +.macro weight_4 add vdup.8 d0, r12 vmov q1, q8 vmov q10, q8 @@ -1818,9 +1779,9 @@ endfunc vst1.32 {d2[0]},[r4,:32], r1 vst1.32 {d2[1]},[r4,:32], r1 pop {r4, pc} - .endm +.endm - .macro weight_func w +.macro weight_func w function ff_weight_h264_pixels_\w\()_neon, export=1 push {r4, lr} ldr r12, [sp, #8] @@ -1845,7 +1806,7 @@ function ff_weight_h264_pixels_\w\()_neon, export=1 10: rsb r12, r12, #0 weight_\w vsub.s16 endfunc - .endm +.endm weight_func 16 weight_func 8 diff --git a/libavcodec/arm/neon.S b/libavcodec/arm/neon.S new file mode 100644 index 0000000000..716a607af7 --- /dev/null +++ b/libavcodec/arm/neon.S @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +.macro transpose_8x8 r0, r1, r2, r3, r4, r5, r6, r7 + vtrn.32 \r0, \r4 + vtrn.32 \r1, \r5 + vtrn.32 \r2, \r6 + vtrn.32 \r3, \r7 + vtrn.16 \r0, \r2 + vtrn.16 \r1, \r3 + vtrn.16 \r4, \r6 + vtrn.16 \r5, \r7 + vtrn.8 \r0, \r1 + vtrn.8 \r2, \r3 + vtrn.8 \r4, \r5 + vtrn.8 \r6, \r7 +.endm + +.macro transpose_4x4 r0, r1, r2, r3 + vtrn.16 \r0, \r2 + vtrn.16 \r1, \r3 + vtrn.8 \r0, \r1 + vtrn.8 \r2, \r3 +.endm + +.macro swap4 r0, r1, r2, r3, r4, r5, r6, r7 + vswp \r0, \r4 + vswp \r1, \r5 + vswp \r2, \r6 + vswp \r3, \r7 +.endm + +.macro transpose16_4x4 r0, r1, r2, r3, r4, r5, r6, r7 + vtrn.32 \r0, \r2 + vtrn.32 \r1, \r3 + vtrn.32 \r4, \r6 + vtrn.32 \r5, \r7 + vtrn.16 \r0, \r1 + vtrn.16 \r2, \r3 + vtrn.16 \r4, \r5 + vtrn.16 \r6, \r7 +.endm diff --git a/libavcodec/arm/vp8dsp_neon.S b/libavcodec/arm/vp8dsp_neon.S index 1fb3753aab..b4ab1c7032 100644 --- a/libavcodec/arm/vp8dsp_neon.S +++ b/libavcodec/arm/vp8dsp_neon.S @@ -22,6 +22,7 @@ */ #include "asm.S" +#include "neon.S" function ff_vp8_luma_dc_wht_neon, export=1 vld1.16 {q0-q1}, [r1,:128] @@ -442,23 +443,6 @@ endfunc .endif .endm -.macro transpose8x16matrix - vtrn.32 q0, q4 - vtrn.32 q1, q5 - vtrn.32 q2, q6 - vtrn.32 q3, q7 - - vtrn.16 q0, q2 - vtrn.16 q1, q3 - vtrn.16 q4, q6 - vtrn.16 q5, q7 - - vtrn.8 q0, q1 - vtrn.8 q2, q3 - vtrn.8 q4, q5 - vtrn.8 q6, q7 -.endm - .macro vp8_v_loop_filter16 name, inner=0, simple=0 function ff_vp8_v_loop_filter16\name\()_neon, export=1 vpush {q4-q7} @@ -593,7 +577,7 @@ function ff_vp8_h_loop_filter16\name\()_neon, export=1 vld1.8 {d13}, [r0], r1 vld1.8 {d15}, [r0], r1 - transpose8x16matrix + transpose_8x8 q0, q1, q2, q3, q4, q5, q6, q7 vdup.8 q14, r2 @ flim_E .if !\simple @@ -604,7 +588,7 @@ function ff_vp8_h_loop_filter16\name\()_neon, export=1 sub r0, r0, r1, lsl #4 @ backup 16 rows - transpose8x16matrix + transpose_8x8 q0, q1, q2, q3, q4, q5, q6, q7 @ Store pixels: vst1.8 {d0}, [r0], r1 @@ -658,7 +642,7 @@ function ff_vp8_h_loop_filter8uv\name\()_neon, export=1 vld1.8 {d14}, [r0], r2 vld1.8 {d15}, [r1], r2 - transpose8x16matrix + transpose_8x8 q0, q1, q2, q3, q4, q5, q6, q7 vdup.8 q14, r3 @ flim_E vdup.8 q15, r12 @ flim_I @@ -669,7 +653,7 @@ function ff_vp8_h_loop_filter8uv\name\()_neon, export=1 sub r0, r0, r2, lsl #3 @ backup u 8 rows sub r1, r1, r2, lsl #3 @ backup v 8 rows - transpose8x16matrix + transpose_8x8 q0, q1, q2, q3, q4, q5, q6, q7 @ Store pixels: vst1.8 {d0}, [r0], r2 |