aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/arm
diff options
context:
space:
mode:
authorJustin Ruggles <justin.ruggles@gmail.com>2011-01-30 15:06:46 +0000
committerMans Rullgard <mans@mansr.com>2011-02-02 02:44:53 +0000
commitc73d99e672329c8f2df290736ffc474c360ac4ae (patch)
tree59e330229ee0746b5c466da278430e682fc0371b /libavcodec/arm
parent770c410fbb8e1b87ce8ad7f3d7eddaa55e2b8295 (diff)
downloadffmpeg-c73d99e672329c8f2df290736ffc474c360ac4ae.tar.gz
Separate format conversion DSP functions from DSPContext.
This will be beneficial for use with the audio conversion API without requiring it to depend on all of dsputil. Signed-off-by: Mans Rullgard <mans@mansr.com>
Diffstat (limited to 'libavcodec/arm')
-rw-r--r--libavcodec/arm/Makefile5
-rw-r--r--libavcodec/arm/dsputil_init_neon.c10
-rw-r--r--libavcodec/arm/dsputil_init_vfp.c4
-rw-r--r--libavcodec/arm/dsputil_neon.S365
-rw-r--r--libavcodec/arm/dsputil_vfp.S55
-rw-r--r--libavcodec/arm/fmtconvert_init_arm.c48
-rw-r--r--libavcodec/arm/fmtconvert_neon.S391
-rw-r--r--libavcodec/arm/fmtconvert_vfp.S77
8 files changed, 521 insertions, 434 deletions
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index 4c30e0ab9f..014456ee32 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -9,6 +9,7 @@ OBJS-$(CONFIG_H264PRED) += arm/h264pred_init_arm.o
OBJS += arm/dsputil_init_arm.o \
arm/dsputil_arm.o \
arm/fft_init_arm.o \
+ arm/fmtconvert_init_arm.o \
arm/jrevdct_arm.o \
arm/mpegvideo_arm.o \
arm/simple_idct_arm.o \
@@ -22,8 +23,11 @@ OBJS-$(HAVE_ARMV6) += arm/dsputil_init_armv6.o \
arm/dsputil_armv6.o \
arm/simple_idct_armv6.o \
+VFP-OBJS-$(HAVE_ARMV6) += arm/fmtconvert_vfp.o \
+
OBJS-$(HAVE_ARMVFP) += arm/dsputil_vfp.o \
arm/dsputil_init_vfp.o \
+ $(VFP-OBJS-yes)
OBJS-$(HAVE_IWMMXT) += arm/dsputil_iwmmxt.o \
arm/mpegvideo_iwmmxt.o \
@@ -52,6 +56,7 @@ NEON-OBJS-$(CONFIG_VP6_DECODER) += arm/vp56dsp_neon.o \
OBJS-$(HAVE_NEON) += arm/dsputil_init_neon.o \
arm/dsputil_neon.o \
+ arm/fmtconvert_neon.o \
arm/int_neon.o \
arm/mpegvideo_neon.o \
arm/simple_idct_neon.o \
diff --git a/libavcodec/arm/dsputil_init_neon.c b/libavcodec/arm/dsputil_init_neon.c
index 67982048f9..76ae632273 100644
--- a/libavcodec/arm/dsputil_init_neon.c
+++ b/libavcodec/arm/dsputil_init_neon.c
@@ -153,8 +153,6 @@ void ff_sv_fmul_scalar_4_neon(float *dst, const float **vp, float mul,
int len);
void ff_butterflies_float_neon(float *v1, float *v2, int len);
float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len);
-void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src,
- float mul, int len);
void ff_vector_fmul_reverse_neon(float *dst, const float *src0,
const float *src1, int len);
void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1,
@@ -162,8 +160,6 @@ void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1,
void ff_vector_clipf_neon(float *dst, const float *src, float min, float max,
int len);
-void ff_float_to_int16_neon(int16_t *, const float *, long);
-void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
void ff_vorbis_inverse_coupling_neon(float *mag, float *ang, int blocksize);
@@ -308,7 +304,6 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
c->vector_fmul_scalar = ff_vector_fmul_scalar_neon;
c->butterflies_float = ff_butterflies_float_neon;
c->scalarproduct_float = ff_scalarproduct_float_neon;
- c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon;
c->vector_fmul_reverse = ff_vector_fmul_reverse_neon;
c->vector_fmul_add = ff_vector_fmul_add_neon;
c->vector_clipf = ff_vector_clipf_neon;
@@ -319,11 +314,6 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
c->sv_fmul_scalar[0] = ff_sv_fmul_scalar_2_neon;
c->sv_fmul_scalar[1] = ff_sv_fmul_scalar_4_neon;
- if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
- c->float_to_int16 = ff_float_to_int16_neon;
- c->float_to_int16_interleave = ff_float_to_int16_interleave_neon;
- }
-
if (CONFIG_VORBIS_DECODER)
c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_neon;
diff --git a/libavcodec/arm/dsputil_init_vfp.c b/libavcodec/arm/dsputil_init_vfp.c
index 76ef6b4171..bd52315934 100644
--- a/libavcodec/arm/dsputil_init_vfp.c
+++ b/libavcodec/arm/dsputil_init_vfp.c
@@ -25,13 +25,9 @@ void ff_vector_fmul_vfp(float *dst, const float *src0,
const float *src1, int len);
void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
const float *src1, int len);
-void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len);
void ff_dsputil_init_vfp(DSPContext* c, AVCodecContext *avctx)
{
c->vector_fmul = ff_vector_fmul_vfp;
c->vector_fmul_reverse = ff_vector_fmul_reverse_vfp;
-#if HAVE_ARMV6
- c->float_to_int16 = ff_float_to_int16_vfp;
-#endif
}
diff --git a/libavcodec/arm/dsputil_neon.S b/libavcodec/arm/dsputil_neon.S
index 8329f6cc57..05a911502b 100644
--- a/libavcodec/arm/dsputil_neon.S
+++ b/libavcodec/arm/dsputil_neon.S
@@ -400,343 +400,6 @@ function ff_add_pixels_clamped_neon, export=1
bx lr
endfunc
-function ff_float_to_int16_neon, export=1
- subs r2, r2, #8
- vld1.64 {d0-d1}, [r1,:128]!
- vcvt.s32.f32 q8, q0, #16
- vld1.64 {d2-d3}, [r1,:128]!
- vcvt.s32.f32 q9, q1, #16
- beq 3f
- bics ip, r2, #15
- beq 2f
-1: subs ip, ip, #16
- vshrn.s32 d4, q8, #16
- vld1.64 {d0-d1}, [r1,:128]!
- vcvt.s32.f32 q0, q0, #16
- vshrn.s32 d5, q9, #16
- vld1.64 {d2-d3}, [r1,:128]!
- vcvt.s32.f32 q1, q1, #16
- vshrn.s32 d6, q0, #16
- vst1.64 {d4-d5}, [r0,:128]!
- vshrn.s32 d7, q1, #16
- vld1.64 {d16-d17},[r1,:128]!
- vcvt.s32.f32 q8, q8, #16
- vld1.64 {d18-d19},[r1,:128]!
- vcvt.s32.f32 q9, q9, #16
- vst1.64 {d6-d7}, [r0,:128]!
- bne 1b
- ands r2, r2, #15
- beq 3f
-2: vld1.64 {d0-d1}, [r1,:128]!
- vshrn.s32 d4, q8, #16
- vcvt.s32.f32 q0, q0, #16
- vld1.64 {d2-d3}, [r1,:128]!
- vshrn.s32 d5, q9, #16
- vcvt.s32.f32 q1, q1, #16
- vshrn.s32 d6, q0, #16
- vst1.64 {d4-d5}, [r0,:128]!
- vshrn.s32 d7, q1, #16
- vst1.64 {d6-d7}, [r0,:128]!
- bx lr
-3: vshrn.s32 d4, q8, #16
- vshrn.s32 d5, q9, #16
- vst1.64 {d4-d5}, [r0,:128]!
- bx lr
-endfunc
-
-function ff_float_to_int16_interleave_neon, export=1
- cmp r3, #2
- ldrlt r1, [r1]
- blt ff_float_to_int16_neon
- bne 4f
-
- ldr r3, [r1]
- ldr r1, [r1, #4]
-
- subs r2, r2, #8
- vld1.64 {d0-d1}, [r3,:128]!
- vcvt.s32.f32 q8, q0, #16
- vld1.64 {d2-d3}, [r3,:128]!
- vcvt.s32.f32 q9, q1, #16
- vld1.64 {d20-d21},[r1,:128]!
- vcvt.s32.f32 q10, q10, #16
- vld1.64 {d22-d23},[r1,:128]!
- vcvt.s32.f32 q11, q11, #16
- beq 3f
- bics ip, r2, #15
- beq 2f
-1: subs ip, ip, #16
- vld1.64 {d0-d1}, [r3,:128]!
- vcvt.s32.f32 q0, q0, #16
- vsri.32 q10, q8, #16
- vld1.64 {d2-d3}, [r3,:128]!
- vcvt.s32.f32 q1, q1, #16
- vld1.64 {d24-d25},[r1,:128]!
- vcvt.s32.f32 q12, q12, #16
- vld1.64 {d26-d27},[r1,:128]!
- vsri.32 q11, q9, #16
- vst1.64 {d20-d21},[r0,:128]!
- vcvt.s32.f32 q13, q13, #16
- vst1.64 {d22-d23},[r0,:128]!
- vsri.32 q12, q0, #16
- vld1.64 {d16-d17},[r3,:128]!
- vsri.32 q13, q1, #16
- vst1.64 {d24-d25},[r0,:128]!
- vcvt.s32.f32 q8, q8, #16
- vld1.64 {d18-d19},[r3,:128]!
- vcvt.s32.f32 q9, q9, #16
- vld1.64 {d20-d21},[r1,:128]!
- vcvt.s32.f32 q10, q10, #16
- vld1.64 {d22-d23},[r1,:128]!
- vcvt.s32.f32 q11, q11, #16
- vst1.64 {d26-d27},[r0,:128]!
- bne 1b
- ands r2, r2, #15
- beq 3f
-2: vsri.32 q10, q8, #16
- vld1.64 {d0-d1}, [r3,:128]!
- vcvt.s32.f32 q0, q0, #16
- vld1.64 {d2-d3}, [r3,:128]!
- vcvt.s32.f32 q1, q1, #16
- vld1.64 {d24-d25},[r1,:128]!
- vcvt.s32.f32 q12, q12, #16
- vsri.32 q11, q9, #16
- vld1.64 {d26-d27},[r1,:128]!
- vcvt.s32.f32 q13, q13, #16
- vst1.64 {d20-d21},[r0,:128]!
- vsri.32 q12, q0, #16
- vst1.64 {d22-d23},[r0,:128]!
- vsri.32 q13, q1, #16
- vst1.64 {d24-d27},[r0,:128]!
- bx lr
-3: vsri.32 q10, q8, #16
- vsri.32 q11, q9, #16
- vst1.64 {d20-d23},[r0,:128]!
- bx lr
-
-4: push {r4-r8,lr}
- cmp r3, #4
- lsl ip, r3, #1
- blt 4f
-
- @ 4 channels
-5: ldmia r1!, {r4-r7}
- mov lr, r2
- mov r8, r0
- vld1.64 {d16-d17},[r4,:128]!
- vcvt.s32.f32 q8, q8, #16
- vld1.64 {d18-d19},[r5,:128]!
- vcvt.s32.f32 q9, q9, #16
- vld1.64 {d20-d21},[r6,:128]!
- vcvt.s32.f32 q10, q10, #16
- vld1.64 {d22-d23},[r7,:128]!
- vcvt.s32.f32 q11, q11, #16
-6: subs lr, lr, #8
- vld1.64 {d0-d1}, [r4,:128]!
- vcvt.s32.f32 q0, q0, #16
- vsri.32 q9, q8, #16
- vld1.64 {d2-d3}, [r5,:128]!
- vcvt.s32.f32 q1, q1, #16
- vsri.32 q11, q10, #16
- vld1.64 {d4-d5}, [r6,:128]!
- vcvt.s32.f32 q2, q2, #16
- vzip.32 d18, d22
- vld1.64 {d6-d7}, [r7,:128]!
- vcvt.s32.f32 q3, q3, #16
- vzip.32 d19, d23
- vst1.64 {d18}, [r8], ip
- vsri.32 q1, q0, #16
- vst1.64 {d22}, [r8], ip
- vsri.32 q3, q2, #16
- vst1.64 {d19}, [r8], ip
- vzip.32 d2, d6
- vst1.64 {d23}, [r8], ip
- vzip.32 d3, d7
- beq 7f
- vld1.64 {d16-d17},[r4,:128]!
- vcvt.s32.f32 q8, q8, #16
- vst1.64 {d2}, [r8], ip
- vld1.64 {d18-d19},[r5,:128]!
- vcvt.s32.f32 q9, q9, #16
- vst1.64 {d6}, [r8], ip
- vld1.64 {d20-d21},[r6,:128]!
- vcvt.s32.f32 q10, q10, #16
- vst1.64 {d3}, [r8], ip
- vld1.64 {d22-d23},[r7,:128]!
- vcvt.s32.f32 q11, q11, #16
- vst1.64 {d7}, [r8], ip
- b 6b
-7: vst1.64 {d2}, [r8], ip
- vst1.64 {d6}, [r8], ip
- vst1.64 {d3}, [r8], ip
- vst1.64 {d7}, [r8], ip
- subs r3, r3, #4
- popeq {r4-r8,pc}
- cmp r3, #4
- add r0, r0, #8
- bge 5b
-
- @ 2 channels
-4: cmp r3, #2
- blt 4f
- ldmia r1!, {r4-r5}
- mov lr, r2
- mov r8, r0
- tst lr, #8
- vld1.64 {d16-d17},[r4,:128]!
- vcvt.s32.f32 q8, q8, #16
- vld1.64 {d18-d19},[r5,:128]!
- vcvt.s32.f32 q9, q9, #16
- vld1.64 {d20-d21},[r4,:128]!
- vcvt.s32.f32 q10, q10, #16
- vld1.64 {d22-d23},[r5,:128]!
- vcvt.s32.f32 q11, q11, #16
- beq 6f
- subs lr, lr, #8
- beq 7f
- vsri.32 d18, d16, #16
- vsri.32 d19, d17, #16
- vld1.64 {d16-d17},[r4,:128]!
- vcvt.s32.f32 q8, q8, #16
- vst1.32 {d18[0]}, [r8], ip
- vsri.32 d22, d20, #16
- vst1.32 {d18[1]}, [r8], ip
- vsri.32 d23, d21, #16
- vst1.32 {d19[0]}, [r8], ip
- vst1.32 {d19[1]}, [r8], ip
- vld1.64 {d18-d19},[r5,:128]!
- vcvt.s32.f32 q9, q9, #16
- vst1.32 {d22[0]}, [r8], ip
- vst1.32 {d22[1]}, [r8], ip
- vld1.64 {d20-d21},[r4,:128]!
- vcvt.s32.f32 q10, q10, #16
- vst1.32 {d23[0]}, [r8], ip
- vst1.32 {d23[1]}, [r8], ip
- vld1.64 {d22-d23},[r5,:128]!
- vcvt.s32.f32 q11, q11, #16
-6: subs lr, lr, #16
- vld1.64 {d0-d1}, [r4,:128]!
- vcvt.s32.f32 q0, q0, #16
- vsri.32 d18, d16, #16
- vld1.64 {d2-d3}, [r5,:128]!
- vcvt.s32.f32 q1, q1, #16
- vsri.32 d19, d17, #16
- vld1.64 {d4-d5}, [r4,:128]!
- vcvt.s32.f32 q2, q2, #16
- vld1.64 {d6-d7}, [r5,:128]!
- vcvt.s32.f32 q3, q3, #16
- vst1.32 {d18[0]}, [r8], ip
- vsri.32 d22, d20, #16
- vst1.32 {d18[1]}, [r8], ip
- vsri.32 d23, d21, #16
- vst1.32 {d19[0]}, [r8], ip
- vsri.32 d2, d0, #16
- vst1.32 {d19[1]}, [r8], ip
- vsri.32 d3, d1, #16
- vst1.32 {d22[0]}, [r8], ip
- vsri.32 d6, d4, #16
- vst1.32 {d22[1]}, [r8], ip
- vsri.32 d7, d5, #16
- vst1.32 {d23[0]}, [r8], ip
- vst1.32 {d23[1]}, [r8], ip
- beq 6f
- vld1.64 {d16-d17},[r4,:128]!
- vcvt.s32.f32 q8, q8, #16
- vst1.32 {d2[0]}, [r8], ip
- vst1.32 {d2[1]}, [r8], ip
- vld1.64 {d18-d19},[r5,:128]!
- vcvt.s32.f32 q9, q9, #16
- vst1.32 {d3[0]}, [r8], ip
- vst1.32 {d3[1]}, [r8], ip
- vld1.64 {d20-d21},[r4,:128]!
- vcvt.s32.f32 q10, q10, #16
- vst1.32 {d6[0]}, [r8], ip
- vst1.32 {d6[1]}, [r8], ip
- vld1.64 {d22-d23},[r5,:128]!
- vcvt.s32.f32 q11, q11, #16
- vst1.32 {d7[0]}, [r8], ip
- vst1.32 {d7[1]}, [r8], ip
- bgt 6b
-6: vst1.32 {d2[0]}, [r8], ip
- vst1.32 {d2[1]}, [r8], ip
- vst1.32 {d3[0]}, [r8], ip
- vst1.32 {d3[1]}, [r8], ip
- vst1.32 {d6[0]}, [r8], ip
- vst1.32 {d6[1]}, [r8], ip
- vst1.32 {d7[0]}, [r8], ip
- vst1.32 {d7[1]}, [r8], ip
- b 8f
-7: vsri.32 d18, d16, #16
- vsri.32 d19, d17, #16
- vst1.32 {d18[0]}, [r8], ip
- vsri.32 d22, d20, #16
- vst1.32 {d18[1]}, [r8], ip
- vsri.32 d23, d21, #16
- vst1.32 {d19[0]}, [r8], ip
- vst1.32 {d19[1]}, [r8], ip
- vst1.32 {d22[0]}, [r8], ip
- vst1.32 {d22[1]}, [r8], ip
- vst1.32 {d23[0]}, [r8], ip
- vst1.32 {d23[1]}, [r8], ip
-8: subs r3, r3, #2
- add r0, r0, #4
- popeq {r4-r8,pc}
-
- @ 1 channel
-4: ldr r4, [r1],#4
- tst r2, #8
- mov lr, r2
- mov r5, r0
- vld1.64 {d0-d1}, [r4,:128]!
- vcvt.s32.f32 q0, q0, #16
- vld1.64 {d2-d3}, [r4,:128]!
- vcvt.s32.f32 q1, q1, #16
- bne 8f
-6: subs lr, lr, #16
- vld1.64 {d4-d5}, [r4,:128]!
- vcvt.s32.f32 q2, q2, #16
- vld1.64 {d6-d7}, [r4,:128]!
- vcvt.s32.f32 q3, q3, #16
- vst1.16 {d0[1]}, [r5,:16], ip
- vst1.16 {d0[3]}, [r5,:16], ip
- vst1.16 {d1[1]}, [r5,:16], ip
- vst1.16 {d1[3]}, [r5,:16], ip
- vst1.16 {d2[1]}, [r5,:16], ip
- vst1.16 {d2[3]}, [r5,:16], ip
- vst1.16 {d3[1]}, [r5,:16], ip
- vst1.16 {d3[3]}, [r5,:16], ip
- beq 7f
- vld1.64 {d0-d1}, [r4,:128]!
- vcvt.s32.f32 q0, q0, #16
- vld1.64 {d2-d3}, [r4,:128]!
- vcvt.s32.f32 q1, q1, #16
-7: vst1.16 {d4[1]}, [r5,:16], ip
- vst1.16 {d4[3]}, [r5,:16], ip
- vst1.16 {d5[1]}, [r5,:16], ip
- vst1.16 {d5[3]}, [r5,:16], ip
- vst1.16 {d6[1]}, [r5,:16], ip
- vst1.16 {d6[3]}, [r5,:16], ip
- vst1.16 {d7[1]}, [r5,:16], ip
- vst1.16 {d7[3]}, [r5,:16], ip
- bgt 6b
- pop {r4-r8,pc}
-8: subs lr, lr, #8
- vst1.16 {d0[1]}, [r5,:16], ip
- vst1.16 {d0[3]}, [r5,:16], ip
- vst1.16 {d1[1]}, [r5,:16], ip
- vst1.16 {d1[3]}, [r5,:16], ip
- vst1.16 {d2[1]}, [r5,:16], ip
- vst1.16 {d2[3]}, [r5,:16], ip
- vst1.16 {d3[1]}, [r5,:16], ip
- vst1.16 {d3[3]}, [r5,:16], ip
- popeq {r4-r8,pc}
- vld1.64 {d0-d1}, [r4,:128]!
- vcvt.s32.f32 q0, q0, #16
- vld1.64 {d2-d3}, [r4,:128]!
- vcvt.s32.f32 q1, q1, #16
- b 6b
-endfunc
-
function ff_vector_fmul_neon, export=1
subs r3, r3, #8
vld1.64 {d0-d3}, [r1,:128]!
@@ -1050,34 +713,6 @@ NOVFP vmov.32 r0, d0[0]
bx lr
endfunc
-function ff_int32_to_float_fmul_scalar_neon, export=1
-VFP vdup.32 q0, d0[0]
-VFP len .req r2
-NOVFP vdup.32 q0, r2
-NOVFP len .req r3
-
- vld1.32 {q1},[r1,:128]!
- vcvt.f32.s32 q3, q1
- vld1.32 {q2},[r1,:128]!
- vcvt.f32.s32 q8, q2
-1: subs len, len, #8
- pld [r1, #16]
- vmul.f32 q9, q3, q0
- vmul.f32 q10, q8, q0
- beq 2f
- vld1.32 {q1},[r1,:128]!
- vcvt.f32.s32 q3, q1
- vld1.32 {q2},[r1,:128]!
- vcvt.f32.s32 q8, q2
- vst1.32 {q9}, [r0,:128]!
- vst1.32 {q10},[r0,:128]!
- b 1b
-2: vst1.32 {q9}, [r0,:128]!
- vst1.32 {q10},[r0,:128]!
- bx lr
- .unreq len
-endfunc
-
function ff_vector_fmul_reverse_neon, export=1
add r2, r2, r3, lsl #2
sub r2, r2, #32
diff --git a/libavcodec/arm/dsputil_vfp.S b/libavcodec/arm/dsputil_vfp.S
index a65b69e20a..197d500819 100644
--- a/libavcodec/arm/dsputil_vfp.S
+++ b/libavcodec/arm/dsputil_vfp.S
@@ -131,58 +131,3 @@ function ff_vector_fmul_reverse_vfp, export=1
vpop {d8-d15}
bx lr
endfunc
-
-#if HAVE_ARMV6
-/**
- * ARM VFP optimized float to int16 conversion.
- * Assume that len is a positive number and is multiple of 8, destination
- * buffer is at least 4 bytes aligned (8 bytes alignment is better for
- * performance), little endian byte sex
- */
-@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len)
-function ff_float_to_int16_vfp, export=1
- push {r4-r8,lr}
- vpush {d8-d11}
- vldmia r1!, {s16-s23}
- vcvt.s32.f32 s0, s16
- vcvt.s32.f32 s1, s17
- vcvt.s32.f32 s2, s18
- vcvt.s32.f32 s3, s19
- vcvt.s32.f32 s4, s20
- vcvt.s32.f32 s5, s21
- vcvt.s32.f32 s6, s22
- vcvt.s32.f32 s7, s23
-1:
- subs r2, r2, #8
- vmov r3, r4, s0, s1
- vmov r5, r6, s2, s3
- vmov r7, r8, s4, s5
- vmov ip, lr, s6, s7
- vldmiagt r1!, {s16-s23}
- ssat r4, #16, r4
- ssat r3, #16, r3
- ssat r6, #16, r6
- ssat r5, #16, r5
- pkhbt r3, r3, r4, lsl #16
- pkhbt r4, r5, r6, lsl #16
- vcvtgt.s32.f32 s0, s16
- vcvtgt.s32.f32 s1, s17
- vcvtgt.s32.f32 s2, s18
- vcvtgt.s32.f32 s3, s19
- vcvtgt.s32.f32 s4, s20
- vcvtgt.s32.f32 s5, s21
- vcvtgt.s32.f32 s6, s22
- vcvtgt.s32.f32 s7, s23
- ssat r8, #16, r8
- ssat r7, #16, r7
- ssat lr, #16, lr
- ssat ip, #16, ip
- pkhbt r5, r7, r8, lsl #16
- pkhbt r6, ip, lr, lsl #16
- stmia r0!, {r3-r6}
- bgt 1b
-
- vpop {d8-d11}
- pop {r4-r8,pc}
-endfunc
-#endif
diff --git a/libavcodec/arm/fmtconvert_init_arm.c b/libavcodec/arm/fmtconvert_init_arm.c
new file mode 100644
index 0000000000..4b6e3939f5
--- /dev/null
+++ b/libavcodec/arm/fmtconvert_init_arm.c
@@ -0,0 +1,48 @@
+/*
+ * ARM optimized Format Conversion Utils
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavcodec/avcodec.h"
+#include "libavcodec/fmtconvert.h"
+
+void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src,
+ float mul, int len);
+
+void ff_float_to_int16_neon(int16_t *dst, const float *src, long len);
+void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
+
+void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len);
+
+void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx)
+{
+ if (HAVE_ARMVFP && HAVE_ARMV6) {
+ c->float_to_int16 = ff_float_to_int16_vfp;
+ }
+
+ if (HAVE_NEON) {
+ c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon;
+
+ if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
+ c->float_to_int16 = ff_float_to_int16_neon;
+ c->float_to_int16_interleave = ff_float_to_int16_interleave_neon;
+ }
+ }
+}
diff --git a/libavcodec/arm/fmtconvert_neon.S b/libavcodec/arm/fmtconvert_neon.S
new file mode 100644
index 0000000000..359e57e40b
--- /dev/null
+++ b/libavcodec/arm/fmtconvert_neon.S
@@ -0,0 +1,391 @@
+/*
+ * ARM NEON optimised Format Conversion Utils
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "asm.S"
+
+ preserve8
+ .text
+
+function ff_float_to_int16_neon, export=1
+ subs r2, r2, #8
+ vld1.64 {d0-d1}, [r1,:128]!
+ vcvt.s32.f32 q8, q0, #16
+ vld1.64 {d2-d3}, [r1,:128]!
+ vcvt.s32.f32 q9, q1, #16
+ beq 3f
+ bics ip, r2, #15
+ beq 2f
+1: subs ip, ip, #16
+ vshrn.s32 d4, q8, #16
+ vld1.64 {d0-d1}, [r1,:128]!
+ vcvt.s32.f32 q0, q0, #16
+ vshrn.s32 d5, q9, #16
+ vld1.64 {d2-d3}, [r1,:128]!
+ vcvt.s32.f32 q1, q1, #16
+ vshrn.s32 d6, q0, #16
+ vst1.64 {d4-d5}, [r0,:128]!
+ vshrn.s32 d7, q1, #16
+ vld1.64 {d16-d17},[r1,:128]!
+ vcvt.s32.f32 q8, q8, #16
+ vld1.64 {d18-d19},[r1,:128]!
+ vcvt.s32.f32 q9, q9, #16
+ vst1.64 {d6-d7}, [r0,:128]!
+ bne 1b
+ ands r2, r2, #15
+ beq 3f
+2: vld1.64 {d0-d1}, [r1,:128]!
+ vshrn.s32 d4, q8, #16
+ vcvt.s32.f32 q0, q0, #16
+ vld1.64 {d2-d3}, [r1,:128]!
+ vshrn.s32 d5, q9, #16
+ vcvt.s32.f32 q1, q1, #16
+ vshrn.s32 d6, q0, #16
+ vst1.64 {d4-d5}, [r0,:128]!
+ vshrn.s32 d7, q1, #16
+ vst1.64 {d6-d7}, [r0,:128]!
+ bx lr
+3: vshrn.s32 d4, q8, #16
+ vshrn.s32 d5, q9, #16
+ vst1.64 {d4-d5}, [r0,:128]!
+ bx lr
+endfunc
+
+function ff_float_to_int16_interleave_neon, export=1
+ cmp r3, #2
+ ldrlt r1, [r1]
+ blt ff_float_to_int16_neon
+ bne 4f
+
+ ldr r3, [r1]
+ ldr r1, [r1, #4]
+
+ subs r2, r2, #8
+ vld1.64 {d0-d1}, [r3,:128]!
+ vcvt.s32.f32 q8, q0, #16
+ vld1.64 {d2-d3}, [r3,:128]!
+ vcvt.s32.f32 q9, q1, #16
+ vld1.64 {d20-d21},[r1,:128]!
+ vcvt.s32.f32 q10, q10, #16
+ vld1.64 {d22-d23},[r1,:128]!
+ vcvt.s32.f32 q11, q11, #16
+ beq 3f
+ bics ip, r2, #15
+ beq 2f
+1: subs ip, ip, #16
+ vld1.64 {d0-d1}, [r3,:128]!
+ vcvt.s32.f32 q0, q0, #16
+ vsri.32 q10, q8, #16
+ vld1.64 {d2-d3}, [r3,:128]!
+ vcvt.s32.f32 q1, q1, #16
+ vld1.64 {d24-d25},[r1,:128]!
+ vcvt.s32.f32 q12, q12, #16
+ vld1.64 {d26-d27},[r1,:128]!
+ vsri.32 q11, q9, #16
+ vst1.64 {d20-d21},[r0,:128]!
+ vcvt.s32.f32 q13, q13, #16
+ vst1.64 {d22-d23},[r0,:128]!
+ vsri.32 q12, q0, #16
+ vld1.64 {d16-d17},[r3,:128]!
+ vsri.32 q13, q1, #16
+ vst1.64 {d24-d25},[r0,:128]!
+ vcvt.s32.f32 q8, q8, #16
+ vld1.64 {d18-d19},[r3,:128]!
+ vcvt.s32.f32 q9, q9, #16
+ vld1.64 {d20-d21},[r1,:128]!
+ vcvt.s32.f32 q10, q10, #16
+ vld1.64 {d22-d23},[r1,:128]!
+ vcvt.s32.f32 q11, q11, #16
+ vst1.64 {d26-d27},[r0,:128]!
+ bne 1b
+ ands r2, r2, #15
+ beq 3f
+2: vsri.32 q10, q8, #16
+ vld1.64 {d0-d1}, [r3,:128]!
+ vcvt.s32.f32 q0, q0, #16
+ vld1.64 {d2-d3}, [r3,:128]!
+ vcvt.s32.f32 q1, q1, #16
+ vld1.64 {d24-d25},[r1,:128]!
+ vcvt.s32.f32 q12, q12, #16
+ vsri.32 q11, q9, #16
+ vld1.64 {d26-d27},[r1,:128]!
+ vcvt.s32.f32 q13, q13, #16
+ vst1.64 {d20-d21},[r0,:128]!
+ vsri.32 q12, q0, #16
+ vst1.64 {d22-d23},[r0,:128]!
+ vsri.32 q13, q1, #16
+ vst1.64 {d24-d27},[r0,:128]!
+ bx lr
+3: vsri.32 q10, q8, #16
+ vsri.32 q11, q9, #16
+ vst1.64 {d20-d23},[r0,:128]!
+ bx lr
+
+4: push {r4-r8,lr}
+ cmp r3, #4
+ lsl ip, r3, #1
+ blt 4f
+
+ @ 4 channels
+5: ldmia r1!, {r4-r7}
+ mov lr, r2
+ mov r8, r0
+ vld1.64 {d16-d17},[r4,:128]!
+ vcvt.s32.f32 q8, q8, #16
+ vld1.64 {d18-d19},[r5,:128]!
+ vcvt.s32.f32 q9, q9, #16
+ vld1.64 {d20-d21},[r6,:128]!
+ vcvt.s32.f32 q10, q10, #16
+ vld1.64 {d22-d23},[r7,:128]!
+ vcvt.s32.f32 q11, q11, #16
+6: subs lr, lr, #8
+ vld1.64 {d0-d1}, [r4,:128]!
+ vcvt.s32.f32 q0, q0, #16
+ vsri.32 q9, q8, #16
+ vld1.64 {d2-d3}, [r5,:128]!
+ vcvt.s32.f32 q1, q1, #16
+ vsri.32 q11, q10, #16
+ vld1.64 {d4-d5}, [r6,:128]!
+ vcvt.s32.f32 q2, q2, #16
+ vzip.32 d18, d22
+ vld1.64 {d6-d7}, [r7,:128]!
+ vcvt.s32.f32 q3, q3, #16
+ vzip.32 d19, d23
+ vst1.64 {d18}, [r8], ip
+ vsri.32 q1, q0, #16
+ vst1.64 {d22}, [r8], ip
+ vsri.32 q3, q2, #16
+ vst1.64 {d19}, [r8], ip
+ vzip.32 d2, d6
+ vst1.64 {d23}, [r8], ip
+ vzip.32 d3, d7
+ beq 7f
+ vld1.64 {d16-d17},[r4,:128]!
+ vcvt.s32.f32 q8, q8, #16
+ vst1.64 {d2}, [r8], ip
+ vld1.64 {d18-d19},[r5,:128]!
+ vcvt.s32.f32 q9, q9, #16
+ vst1.64 {d6}, [r8], ip
+ vld1.64 {d20-d21},[r6,:128]!
+ vcvt.s32.f32 q10, q10, #16
+ vst1.64 {d3}, [r8], ip
+ vld1.64 {d22-d23},[r7,:128]!
+ vcvt.s32.f32 q11, q11, #16
+ vst1.64 {d7}, [r8], ip
+ b 6b
+7: vst1.64 {d2}, [r8], ip
+ vst1.64 {d6}, [r8], ip
+ vst1.64 {d3}, [r8], ip
+ vst1.64 {d7}, [r8], ip
+ subs r3, r3, #4
+ popeq {r4-r8,pc}
+ cmp r3, #4
+ add r0, r0, #8
+ bge 5b
+
+ @ 2 channels
+4: cmp r3, #2
+ blt 4f
+ ldmia r1!, {r4-r5}
+ mov lr, r2
+ mov r8, r0
+ tst lr, #8
+ vld1.64 {d16-d17},[r4,:128]!
+ vcvt.s32.f32 q8, q8, #16
+ vld1.64 {d18-d19},[r5,:128]!
+ vcvt.s32.f32 q9, q9, #16
+ vld1.64 {d20-d21},[r4,:128]!
+ vcvt.s32.f32 q10, q10, #16
+ vld1.64 {d22-d23},[r5,:128]!
+ vcvt.s32.f32 q11, q11, #16
+ beq 6f
+ subs lr, lr, #8
+ beq 7f
+ vsri.32 d18, d16, #16
+ vsri.32 d19, d17, #16
+ vld1.64 {d16-d17},[r4,:128]!
+ vcvt.s32.f32 q8, q8, #16
+ vst1.32 {d18[0]}, [r8], ip
+ vsri.32 d22, d20, #16
+ vst1.32 {d18[1]}, [r8], ip
+ vsri.32 d23, d21, #16
+ vst1.32 {d19[0]}, [r8], ip
+ vst1.32 {d19[1]}, [r8], ip
+ vld1.64 {d18-d19},[r5,:128]!
+ vcvt.s32.f32 q9, q9, #16
+ vst1.32 {d22[0]}, [r8], ip
+ vst1.32 {d22[1]}, [r8], ip
+ vld1.64 {d20-d21},[r4,:128]!
+ vcvt.s32.f32 q10, q10, #16
+ vst1.32 {d23[0]}, [r8], ip
+ vst1.32 {d23[1]}, [r8], ip
+ vld1.64 {d22-d23},[r5,:128]!
+ vcvt.s32.f32 q11, q11, #16
+6: subs lr, lr, #16
+ vld1.64 {d0-d1}, [r4,:128]!
+ vcvt.s32.f32 q0, q0, #16
+ vsri.32 d18, d16, #16
+ vld1.64 {d2-d3}, [r5,:128]!
+ vcvt.s32.f32 q1, q1, #16
+ vsri.32 d19, d17, #16
+ vld1.64 {d4-d5}, [r4,:128]!
+ vcvt.s32.f32 q2, q2, #16
+ vld1.64 {d6-d7}, [r5,:128]!
+ vcvt.s32.f32 q3, q3, #16
+ vst1.32 {d18[0]}, [r8], ip
+ vsri.32 d22, d20, #16
+ vst1.32 {d18[1]}, [r8], ip
+ vsri.32 d23, d21, #16
+ vst1.32 {d19[0]}, [r8], ip
+ vsri.32 d2, d0, #16
+ vst1.32 {d19[1]}, [r8], ip
+ vsri.32 d3, d1, #16
+ vst1.32 {d22[0]}, [r8], ip
+ vsri.32 d6, d4, #16
+ vst1.32 {d22[1]}, [r8], ip
+ vsri.32 d7, d5, #16
+ vst1.32 {d23[0]}, [r8], ip
+ vst1.32 {d23[1]}, [r8], ip
+ beq 6f
+ vld1.64 {d16-d17},[r4,:128]!
+ vcvt.s32.f32 q8, q8, #16
+ vst1.32 {d2[0]}, [r8], ip
+ vst1.32 {d2[1]}, [r8], ip
+ vld1.64 {d18-d19},[r5,:128]!
+ vcvt.s32.f32 q9, q9, #16
+ vst1.32 {d3[0]}, [r8], ip
+ vst1.32 {d3[1]}, [r8], ip
+ vld1.64 {d20-d21},[r4,:128]!
+ vcvt.s32.f32 q10, q10, #16
+ vst1.32 {d6[0]}, [r8], ip
+ vst1.32 {d6[1]}, [r8], ip
+ vld1.64 {d22-d23},[r5,:128]!
+ vcvt.s32.f32 q11, q11, #16
+ vst1.32 {d7[0]}, [r8], ip
+ vst1.32 {d7[1]}, [r8], ip
+ bgt 6b
+6: vst1.32 {d2[0]}, [r8], ip
+ vst1.32 {d2[1]}, [r8], ip
+ vst1.32 {d3[0]}, [r8], ip
+ vst1.32 {d3[1]}, [r8], ip
+ vst1.32 {d6[0]}, [r8], ip
+ vst1.32 {d6[1]}, [r8], ip
+ vst1.32 {d7[0]}, [r8], ip
+ vst1.32 {d7[1]}, [r8], ip
+ b 8f
+7: vsri.32 d18, d16, #16
+ vsri.32 d19, d17, #16
+ vst1.32 {d18[0]}, [r8], ip
+ vsri.32 d22, d20, #16
+ vst1.32 {d18[1]}, [r8], ip
+ vsri.32 d23, d21, #16
+ vst1.32 {d19[0]}, [r8], ip
+ vst1.32 {d19[1]}, [r8], ip
+ vst1.32 {d22[0]}, [r8], ip
+ vst1.32 {d22[1]}, [r8], ip
+ vst1.32 {d23[0]}, [r8], ip
+ vst1.32 {d23[1]}, [r8], ip
+8: subs r3, r3, #2
+ add r0, r0, #4
+ popeq {r4-r8,pc}
+
+ @ 1 channel
+4: ldr r4, [r1],#4
+ tst r2, #8
+ mov lr, r2
+ mov r5, r0
+ vld1.64 {d0-d1}, [r4,:128]!
+ vcvt.s32.f32 q0, q0, #16
+ vld1.64 {d2-d3}, [r4,:128]!
+ vcvt.s32.f32 q1, q1, #16
+ bne 8f
+6: subs lr, lr, #16
+ vld1.64 {d4-d5}, [r4,:128]!
+ vcvt.s32.f32 q2, q2, #16
+ vld1.64 {d6-d7}, [r4,:128]!
+ vcvt.s32.f32 q3, q3, #16
+ vst1.16 {d0[1]}, [r5,:16], ip
+ vst1.16 {d0[3]}, [r5,:16], ip
+ vst1.16 {d1[1]}, [r5,:16], ip
+ vst1.16 {d1[3]}, [r5,:16], ip
+ vst1.16 {d2[1]}, [r5,:16], ip
+ vst1.16 {d2[3]}, [r5,:16], ip
+ vst1.16 {d3[1]}, [r5,:16], ip
+ vst1.16 {d3[3]}, [r5,:16], ip
+ beq 7f
+ vld1.64 {d0-d1}, [r4,:128]!
+ vcvt.s32.f32 q0, q0, #16
+ vld1.64 {d2-d3}, [r4,:128]!
+ vcvt.s32.f32 q1, q1, #16
+7: vst1.16 {d4[1]}, [r5,:16], ip
+ vst1.16 {d4[3]}, [r5,:16], ip
+ vst1.16 {d5[1]}, [r5,:16], ip
+ vst1.16 {d5[3]}, [r5,:16], ip
+ vst1.16 {d6[1]}, [r5,:16], ip
+ vst1.16 {d6[3]}, [r5,:16], ip
+ vst1.16 {d7[1]}, [r5,:16], ip
+ vst1.16 {d7[3]}, [r5,:16], ip
+ bgt 6b
+ pop {r4-r8,pc}
+8: subs lr, lr, #8
+ vst1.16 {d0[1]}, [r5,:16], ip
+ vst1.16 {d0[3]}, [r5,:16], ip
+ vst1.16 {d1[1]}, [r5,:16], ip
+ vst1.16 {d1[3]}, [r5,:16], ip
+ vst1.16 {d2[1]}, [r5,:16], ip
+ vst1.16 {d2[3]}, [r5,:16], ip
+ vst1.16 {d3[1]}, [r5,:16], ip
+ vst1.16 {d3[3]}, [r5,:16], ip
+ popeq {r4-r8,pc}
+ vld1.64 {d0-d1}, [r4,:128]!
+ vcvt.s32.f32 q0, q0, #16
+ vld1.64 {d2-d3}, [r4,:128]!
+ vcvt.s32.f32 q1, q1, #16
+ b 6b
+endfunc
+
+function ff_int32_to_float_fmul_scalar_neon, export=1
+VFP vdup.32 q0, d0[0]
+VFP len .req r2
+NOVFP vdup.32 q0, r2
+NOVFP len .req r3
+
+ vld1.32 {q1},[r1,:128]!
+ vcvt.f32.s32 q3, q1
+ vld1.32 {q2},[r1,:128]!
+ vcvt.f32.s32 q8, q2
+1: subs len, len, #8
+ pld [r1, #16]
+ vmul.f32 q9, q3, q0
+ vmul.f32 q10, q8, q0
+ beq 2f
+ vld1.32 {q1},[r1,:128]!
+ vcvt.f32.s32 q3, q1
+ vld1.32 {q2},[r1,:128]!
+ vcvt.f32.s32 q8, q2
+ vst1.32 {q9}, [r0,:128]!
+ vst1.32 {q10},[r0,:128]!
+ b 1b
+2: vst1.32 {q9}, [r0,:128]!
+ vst1.32 {q10},[r0,:128]!
+ bx lr
+ .unreq len
+endfunc
diff --git a/libavcodec/arm/fmtconvert_vfp.S b/libavcodec/arm/fmtconvert_vfp.S
new file mode 100644
index 0000000000..1d19e7758b
--- /dev/null
+++ b/libavcodec/arm/fmtconvert_vfp.S
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "asm.S"
+
+ .syntax unified
+
+/**
+ * ARM VFP optimized float to int16 conversion.
+ * Assume that len is a positive number and is multiple of 8, destination
+ * buffer is at least 4 bytes aligned (8 bytes alignment is better for
+ * performance), little endian byte sex
+ */
+@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len)
+function ff_float_to_int16_vfp, export=1
+ push {r4-r8,lr}
+ vpush {d8-d11}
+ vldmia r1!, {s16-s23}
+ vcvt.s32.f32 s0, s16
+ vcvt.s32.f32 s1, s17
+ vcvt.s32.f32 s2, s18
+ vcvt.s32.f32 s3, s19
+ vcvt.s32.f32 s4, s20
+ vcvt.s32.f32 s5, s21
+ vcvt.s32.f32 s6, s22
+ vcvt.s32.f32 s7, s23
+1:
+ subs r2, r2, #8
+ vmov r3, r4, s0, s1
+ vmov r5, r6, s2, s3
+ vmov r7, r8, s4, s5
+ vmov ip, lr, s6, s7
+ vldmiagt r1!, {s16-s23}
+ ssat r4, #16, r4
+ ssat r3, #16, r3
+ ssat r6, #16, r6
+ ssat r5, #16, r5
+ pkhbt r3, r3, r4, lsl #16
+ pkhbt r4, r5, r6, lsl #16
+ vcvtgt.s32.f32 s0, s16
+ vcvtgt.s32.f32 s1, s17
+ vcvtgt.s32.f32 s2, s18
+ vcvtgt.s32.f32 s3, s19
+ vcvtgt.s32.f32 s4, s20
+ vcvtgt.s32.f32 s5, s21
+ vcvtgt.s32.f32 s6, s22
+ vcvtgt.s32.f32 s7, s23
+ ssat r8, #16, r8
+ ssat r7, #16, r7
+ ssat lr, #16, lr
+ ssat ip, #16, ip
+ pkhbt r5, r7, r8, lsl #16
+ pkhbt r6, ip, lr, lsl #16
+ stmia r0!, {r3-r6}
+ bgt 1b
+
+ vpop {d8-d11}
+ pop {r4-r8,pc}
+endfunc