diff options
author | Måns Rullgård <mans@mansr.com> | 2010-07-03 18:36:10 +0000 |
---|---|---|
committer | Måns Rullgård <mans@mansr.com> | 2010-07-03 18:36:10 +0000 |
commit | a075902f3d98ed33cd5ee0e746978d89999195a3 (patch) | |
tree | c91576b90ec185db6532e369669301f13a19355b | |
parent | f054aaf731e5c0cad4967addb59360ecb24b1d47 (diff) | |
download | ffmpeg-a075902f3d98ed33cd5ee0e746978d89999195a3.tar.gz |
PPC: add _interleave versions of fft{4,8,16}_altivec
This removes the need for a post-swizzle with the small FFTs.
Originally committed as revision 24025 to svn://svn.ffmpeg.org/ffmpeg/trunk
-rw-r--r-- | libavcodec/ppc/fft_altivec.c | 15 | ||||
-rw-r--r-- | libavcodec/ppc/fft_altivec_s.S | 60 |
2 files changed, 52 insertions, 23 deletions
diff --git a/libavcodec/ppc/fft_altivec.c b/libavcodec/ppc/fft_altivec.c index b83e047a71..642735477e 100644 --- a/libavcodec/ppc/fft_altivec.c +++ b/libavcodec/ppc/fft_altivec.c @@ -38,19 +38,6 @@ extern void *ff_fft_dispatch_altivec[2][15]; #if HAVE_GNU_AS -// Convert from simd order to C order. -static void swizzle(vec_f *z, int n) -{ - int i; - n >>= 1; - for (i = 0; i < n; i += 2) { - vec_f re = z[i]; - vec_f im = z[i+1]; - z[i] = vec_mergeh(re, im); - z[i+1] = vec_mergel(re, im); - } -} - static av_always_inline void fft_dispatch(FFTContext *s, FFTComplex *z, int do_swizzle) { register vec_f v14 __asm__("v14") = {0,0,0,0}; @@ -84,8 +71,6 @@ static av_always_inline void fft_dispatch(FFTContext *s, FFTComplex *z, int do_s : "lr","ctr","r0","r4","r5","r6","r7","r8","r9","r10","r11", "v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11","v12","v13" ); - if (do_swizzle && s->nbits <= 4) - swizzle((vec_f*)z, 1<<s->nbits); } static void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z) diff --git a/libavcodec/ppc/fft_altivec_s.S b/libavcodec/ppc/fft_altivec_s.S index ed7b045e40..4c265c554e 100644 --- a/libavcodec/ppc/fft_altivec_s.S +++ b/libavcodec/ppc/fft_altivec_s.S @@ -143,28 +143,53 @@ vaddfp \d0,\s0,\s1 .endm -fft4_altivec: +.macro zip d0,d1,s0,s1 + vmrghw \d0,\s0,\s1 + vmrglw \d1,\s0,\s1 +.endm + +.macro def_fft4 interleave +fft4\interleave\()_altivec: lvx v0, 0,r3 lvx v1,r9,r3 FFT4 v0,v1,v2,v3 +.ifnb \interleave + zip v0,v1,v2,v3 + stvx v0, 0,r3 + stvx v1,r9,r3 +.else stvx v2, 0,r3 stvx v3,r9,r3 +.endif blr +.endm -fft8_altivec: +.macro def_fft8 interleave +fft8\interleave\()_altivec: addi r4,r3,32 lvx v0, 0,r3 lvx v1,r9,r3 lvx v2, 0,r4 lvx v3,r9,r4 FFT8 v0,v1,v2,v3,v4,v5,v6,v7,v8 +.ifnb \interleave + zip v4,v5,v0,v1 + zip v6,v7,v2,v3 + stvx v4, 0,r3 + stvx v5,r9,r3 + stvx v6, 0,r4 + stvx v7,r9,r4 +.else stvx v0, 0,r3 stvx v1,r9,r3 stvx v2, 0,r4 stvx v3,r9,r4 +.endif blr +.endm -fft16_altivec: +.macro def_fft16 interleave 
+fft16\interleave\()_altivec: addi r5,r3,64 addi r6,r3,96 addi r4,r3,32 @@ -190,17 +215,33 @@ fft16_altivec: BF v11,v13,v9,v11 BF v0,v4,v0,v10 BF v3,v7,v3,v12 + BF v1,v5,v1,v11 + BF v2,v6,v2,v13 +.ifnb \interleave + zip v8, v9,v0,v1 + zip v10,v11,v2,v3 + zip v12,v13,v4,v5 + zip v14,v15,v6,v7 + stvx v8, 0,r3 + stvx v9,r9,r3 + stvx v10, 0,r4 + stvx v11,r9,r4 + stvx v12, 0,r5 + stvx v13,r9,r5 + stvx v14, 0,r6 + stvx v15,r9,r6 +.else stvx v0, 0,r3 stvx v4, 0,r5 stvx v3,r9,r4 stvx v7,r9,r6 - BF v1,v5,v1,v11 - BF v2,v6,v2,v13 stvx v1,r9,r3 stvx v5,r9,r5 stvx v2, 0,r4 stvx v6, 0,r6 +.endif blr +.endm // void pass(float *z, float *wre, int n) .macro PASS interleave, suffix @@ -297,6 +338,9 @@ fft\n\suffix\()_altivec: .macro DECL_FFTS interleave, suffix .text + def_fft4 \suffix + def_fft8 \suffix + def_fft16 \suffix PASS \interleave, \suffix DECL_FFT \suffix, 5, 32, 16, 8 DECL_FFT \suffix, 6, 64, 32, 16 @@ -314,9 +358,9 @@ fft\n\suffix\()_altivec: .rodata .global EXTERN_ASM\()ff_fft_dispatch\suffix\()_altivec EXTERN_ASM\()ff_fft_dispatch\suffix\()_altivec: - PTR fft4_altivec - PTR fft8_altivec - PTR fft16_altivec + PTR fft4\suffix\()_altivec + PTR fft8\suffix\()_altivec + PTR fft16\suffix\()_altivec PTR fft32\suffix\()_altivec PTR fft64\suffix\()_altivec PTR fft128\suffix\()_altivec |