aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Måns Rullgård <mans@mansr.com> 2008-12-25 23:13:43 +0000
committer Måns Rullgård <mans@mansr.com> 2008-12-25 23:13:43 +0000
commit 760badc1dfb4e5f4af96398f8ae54977f5c0e4fe (patch)
tree 27a305a4eaf065adf191f55f9d637229948c3e00
parent 337e3fd990aab76bfcdceb84cd3f62f3e31fa22f (diff)
download ffmpeg-760badc1dfb4e5f4af96398f8ae54977f5c0e4fe.tar.gz
ARM: add new h264 idct functions
Originally committed as revision 16312 to svn://svn.ffmpeg.org/ffmpeg/trunk
-rw-r--r-- libavcodec/arm/dsputil_neon.c | 12
-rw-r--r-- libavcodec/arm/h264idct_neon.S | 93
2 files changed, 105 insertions, 0 deletions
diff --git a/libavcodec/arm/dsputil_neon.c b/libavcodec/arm/dsputil_neon.c
index 5204c50e37..fd8c73e72a 100644
--- a/libavcodec/arm/dsputil_neon.c
+++ b/libavcodec/arm/dsputil_neon.c
@@ -94,6 +94,15 @@ void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride);
void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride);
+void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset,
+ DCTELEM *block, int stride,
+ const uint8_t nnzc[6*8]);
+void ff_h264_idct_add16intra_neon(uint8_t *dst, const int *block_offset,
+ DCTELEM *block, int stride,
+ const uint8_t nnzc[6*8]);
+void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset,
+ DCTELEM *block, int stride,
+ const uint8_t nnzc[6*8]);
void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
{
@@ -166,4 +175,7 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
c->h264_idct_add = ff_h264_idct_add_neon;
c->h264_idct_dc_add = ff_h264_idct_dc_add_neon;
+ c->h264_idct_add16 = ff_h264_idct_add16_neon;
+ c->h264_idct_add16intra = ff_h264_idct_add16intra_neon;
+ c->h264_idct_add8 = ff_h264_idct_add8_neon;
}
diff --git a/libavcodec/arm/h264idct_neon.S b/libavcodec/arm/h264idct_neon.S
index b7ef2f4519..7f1c8eb8d0 100644
--- a/libavcodec/arm/h264idct_neon.S
+++ b/libavcodec/arm/h264idct_neon.S
@@ -20,6 +20,7 @@
#include "asm.S"
+ preserve8
.fpu neon
.text
@@ -94,3 +95,95 @@ function ff_h264_idct_dc_add_neon, export=1
vst1.32 {d1[1]}, [r0,:32], r2
bx lr
.endfunc
+
+function ff_h264_idct_add16_neon, export=1 /* in: r0 = dst, r1 = block_offset, r2 = block, r3 = stride, [sp] = nnzc; walks 16 blocks and calls one of two per-block routines for each non-empty one */
+ push {r4-r8,lr} /* six words saved, so the stack argument now sits at sp+24 */
+ mov r4, r0 /* r4 = dst base pointer */
+ mov r5, r1 /* r5 = cursor into block_offset[] */
+ mov r1, r2 /* r1 = current block; stays in the callee's second argument register */
+ mov r2, r3 /* r2 = stride; stays in the callee's third argument register */
+ ldr r6, [sp, #24] /* r6 = nnzc (fifth argument, read from the stack) */
+ movw r7, #:lower16:scan8 /* r7 = absolute address of scan8, low half */
+ movt r7, #:upper16:scan8 /* r7 = absolute address of scan8, high half */
+ mov ip, #16 /* ip = number of blocks still to visit */
+1: ldrb r8, [r7], #1 /* r8 = next scan8 entry, cursor post-incremented */
+ ldr r0, [r5], #4 /* r0 = next block_offset entry, cursor post-incremented */
+ ldrb r8, [r6, r8] /* r8 = nnzc[scan8 entry] */
+ subs r8, r8, #1 /* r8 = count - 1; flags: lt when the count was 0, eq when it was 1 */
+ blt 2f /* count was 0: skip this block */
+ ldrsh lr, [r1] /* lr = first 16-bit coefficient of the block; the load leaves the subs flags alone */
+ add r0, r0, r4 /* r0 = dst + block offset (first callee argument) */
+ movne lr, #0 /* count was not 1: discard the coefficient so the eq path below is taken */
+ cmp lr, #0 /* ne only when count was 1 and the first coefficient is non-zero */
+ adrne lr, ff_h264_idct_dc_add_neon /* count 1 with a non-zero first coefficient */
+ adreq lr, ff_h264_idct_add_neon /* every other non-empty block */
+ blx lr /* NOTE(review): the loop needs r1, r2 and ip to survive this call; the callee bodies are not in view here, so confirm */
+2: subs ip, ip, #1 /* one block done; sets the flags tested by bne below */
+ add r1, r1, #32 /* step to the next block, 32 bytes on; add does not disturb the flags */
+ bne 1b /* loop until all 16 blocks are visited */
+ pop {r4-r8,pc} /* restore saved registers and return */
+ .endfunc
+
+function ff_h264_idct_add16intra_neon, export=1 /* in: r0 = dst, r1 = block_offset, r2 = block, r3 = stride, [sp] = nnzc; walks 16 blocks, skipping only those with a zero count and a zero first coefficient */
+ push {r4-r8,lr} /* six words saved, so the stack argument now sits at sp+24 */
+ mov r4, r0 /* r4 = dst base pointer */
+ mov r5, r1 /* r5 = cursor into block_offset[] */
+ mov r1, r2 /* r1 = current block; stays in the callee's second argument register */
+ mov r2, r3 /* r2 = stride; stays in the callee's third argument register */
+ ldr r6, [sp, #24] /* r6 = nnzc (fifth argument, read from the stack) */
+ movw r7, #:lower16:scan8 /* r7 = absolute address of scan8, low half */
+ movt r7, #:upper16:scan8 /* r7 = absolute address of scan8, high half */
+ mov ip, #16 /* ip = number of blocks still to visit */
+1: ldrb r8, [r7], #1 /* r8 = next scan8 entry, cursor post-incremented */
+ ldr r0, [r5], #4 /* r0 = next block_offset entry, cursor post-incremented */
+ ldrb r8, [r6, r8] /* r8 = nnzc[scan8 entry] */
+ add r0, r0, r4 /* r0 = dst + block offset (first callee argument) */
+ cmp r8, #0 /* ne when the count is non-zero */
+ ldrsh r8, [r1] /* r8 = first 16-bit coefficient of the block; the load leaves the flags alone */
+ adrne lr, ff_h264_idct_add_neon /* non-zero count: full routine */
+ adreq lr, ff_h264_idct_dc_add_neon /* zero count: candidate for the dc routine */
+ cmpeq r8, #0 /* zero count only: ne now means the first coefficient is non-zero */
+ blxne lr /* call unless both the count and the first coefficient are zero; NOTE(review): needs r1, r2 and ip to survive the call, callee bodies not in view, confirm */
+ subs ip, ip, #1 /* one block done; sets the flags tested by bne below */
+ add r1, r1, #32 /* step to the next block, 32 bytes on; add does not disturb the flags */
+ bne 1b /* loop until all 16 blocks are visited */
+ pop {r4-r8,pc} /* restore saved registers and return */
+ .endfunc
+
+function ff_h264_idct_add8_neon, export=1 /* in: r0 = dest (array of two pointers), r1 = block_offset, r2 = block, r3 = stride, [sp] = nnzc; handles blocks 16-23: the first four go to dest[0], the last four to dest[1] */
+ push {r4-r10,lr} /* eight words saved, so the stack argument now sits at sp+32; r10 is not used below, it only keeps the word count even (the file declares preserve8) */
+ ldm r0, {r4,r9} /* r4 = dest[0], r9 = dest[1] */
+ add r5, r1, #16*4 /* r5 = cursor starting at block_offset[16] */
+ add r1, r2, #16*32 /* r1 = block advanced past 16 blocks of 32 bytes each */
+ mov r2, r3 /* r2 = stride; stays in the callee's third argument register */
+ ldr r6, [sp, #32] /* r6 = nnzc (fifth argument, read from the stack) */
+ movw r7, #:lower16:scan8+16 /* r7 = absolute address of scan8[16], low half */
+ movt r7, #:upper16:scan8+16 /* r7 = absolute address of scan8[16], high half */
+ mov ip, #8 /* ip = number of blocks still to visit, counts 8 down to 1 */
+1: ldrb r8, [r7], #1 /* r8 = next scan8 entry, cursor post-incremented */
+ ldr r0, [r5], #4 /* r0 = next block_offset entry, cursor post-incremented */
+ ldrb r8, [r6, r8] /* r8 = nnzc[scan8 entry] */
+ cmp ip, #4 /* choose the destination by counter range; ip runs 8..1, so bit 2 of ip cannot be used (it is clear for 8, 3, 2, 1 and set for 7, 6, 5, 4) */
+ addhi r0, r0, r4 /* ip = 8..5, the first four blocks: r0 = dest[0] + block offset */
+ addls r0, r0, r9 /* ip = 4..1, the last four blocks: r0 = dest[1] + block offset */
+ cmp r8, #0 /* ne when the count is non-zero */
+ ldrsh r8, [r1] /* r8 = first 16-bit coefficient of the block; the load leaves the flags alone */
+ adrne lr, ff_h264_idct_add_neon /* non-zero count: full routine */
+ adreq lr, ff_h264_idct_dc_add_neon /* zero count: candidate for the dc routine */
+ cmpeq r8, #0 /* zero count only: ne now means the first coefficient is non-zero */
+ blxne lr /* call unless both the count and the first coefficient are zero; NOTE(review): needs r1, r2 and ip to survive the call, callee bodies not in view, confirm */
+ subs ip, ip, #1 /* one block done; sets the flags tested by bne below */
+ add r1, r1, #32 /* step to the next block, 32 bytes on; add does not disturb the flags */
+ bne 1b /* loop until all 8 blocks are visited */
+ pop {r4-r10,pc} /* restore saved registers and return */
+ .endfunc
+
+ .section .rodata /* read-only data: lookup table used by the three functions above */
+scan8: .byte 4+1*8, 5+1*8, 4+2*8, 5+2*8 /* entries 0-15 are walked by ff_h264_idct_add16_neon and ff_h264_idct_add16intra_neon; each value is used as an index into nnzc[] */
+ .byte 6+1*8, 7+1*8, 6+2*8, 7+2*8 /* every value is written as x + y*8; presumably column x, row y of an 8-wide grid, confirm against the nnzc layout */
+ .byte 4+3*8, 5+3*8, 4+4*8, 5+4*8
+ .byte 6+3*8, 7+3*8, 6+4*8, 7+4*8
+ .byte 1+1*8, 2+1*8 /* entries 16-23 are walked by ff_h264_idct_add8_neon through scan8+16 */
+ .byte 1+2*8, 2+2*8
+ .byte 1+4*8, 2+4*8
+ .byte 1+5*8, 2+5*8 /* largest value in the table is 2+5*8 = 42, inside the declared nnzc[6*8] */