aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMåns Rullgård <mans@mansr.com>2010-07-03 18:36:10 +0000
committerMåns Rullgård <mans@mansr.com>2010-07-03 18:36:10 +0000
commita075902f3d98ed33cd5ee0e746978d89999195a3 (patch)
treec91576b90ec185db6532e369669301f13a19355b
parentf054aaf731e5c0cad4967addb59360ecb24b1d47 (diff)
downloadffmpeg-a075902f3d98ed33cd5ee0e746978d89999195a3.tar.gz
PPC: add _interleave versions of fft{4,6,16}_altivec
This removes the need for a post-swizzle with the small FFTs. Originally committed as revision 24025 to svn://svn.ffmpeg.org/ffmpeg/trunk
-rw-r--r--	libavcodec/ppc/fft_altivec.c	| 15
-rw-r--r--	libavcodec/ppc/fft_altivec_s.S	| 60
2 files changed, 52 insertions(+), 23 deletions(-)
diff --git a/libavcodec/ppc/fft_altivec.c b/libavcodec/ppc/fft_altivec.c
index b83e047a71..642735477e 100644
--- a/libavcodec/ppc/fft_altivec.c
+++ b/libavcodec/ppc/fft_altivec.c
@@ -38,19 +38,6 @@
extern void *ff_fft_dispatch_altivec[2][15];
#if HAVE_GNU_AS
-// Convert from simd order to C order.
-static void swizzle(vec_f *z, int n)
-{
- int i;
- n >>= 1;
- for (i = 0; i < n; i += 2) {
- vec_f re = z[i];
- vec_f im = z[i+1];
- z[i] = vec_mergeh(re, im);
- z[i+1] = vec_mergel(re, im);
- }
-}
-
static av_always_inline void fft_dispatch(FFTContext *s, FFTComplex *z, int do_swizzle)
{
register vec_f v14 __asm__("v14") = {0,0,0,0};
@@ -84,8 +71,6 @@ static av_always_inline void fft_dispatch(FFTContext *s, FFTComplex *z, int do_s
: "lr","ctr","r0","r4","r5","r6","r7","r8","r9","r10","r11",
"v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11","v12","v13"
);
- if (do_swizzle && s->nbits <= 4)
- swizzle((vec_f*)z, 1<<s->nbits);
}
static void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z)
diff --git a/libavcodec/ppc/fft_altivec_s.S b/libavcodec/ppc/fft_altivec_s.S
index ed7b045e40..4c265c554e 100644
--- a/libavcodec/ppc/fft_altivec_s.S
+++ b/libavcodec/ppc/fft_altivec_s.S
@@ -143,28 +143,53 @@
vaddfp \d0,\s0,\s1
.endm
-fft4_altivec:
+.macro zip d0,d1,s0,s1
+ vmrghw \d0,\s0,\s1
+ vmrglw \d1,\s0,\s1
+.endm
+
+.macro def_fft4 interleave
+fft4\interleave\()_altivec:
lvx v0, 0,r3
lvx v1,r9,r3
FFT4 v0,v1,v2,v3
+.ifnb \interleave
+ zip v0,v1,v2,v3
+ stvx v0, 0,r3
+ stvx v1,r9,r3
+.else
stvx v2, 0,r3
stvx v3,r9,r3
+.endif
blr
+.endm
-fft8_altivec:
+.macro def_fft8 interleave
+fft8\interleave\()_altivec:
addi r4,r3,32
lvx v0, 0,r3
lvx v1,r9,r3
lvx v2, 0,r4
lvx v3,r9,r4
FFT8 v0,v1,v2,v3,v4,v5,v6,v7,v8
+.ifnb \interleave
+ zip v4,v5,v0,v1
+ zip v6,v7,v2,v3
+ stvx v4, 0,r3
+ stvx v5,r9,r3
+ stvx v6, 0,r4
+ stvx v7,r9,r4
+.else
stvx v0, 0,r3
stvx v1,r9,r3
stvx v2, 0,r4
stvx v3,r9,r4
+.endif
blr
+.endm
-fft16_altivec:
+.macro def_fft16 interleave
+fft16\interleave\()_altivec:
addi r5,r3,64
addi r6,r3,96
addi r4,r3,32
@@ -190,17 +215,33 @@ fft16_altivec:
BF v11,v13,v9,v11
BF v0,v4,v0,v10
BF v3,v7,v3,v12
+ BF v1,v5,v1,v11
+ BF v2,v6,v2,v13
+.ifnb \interleave
+ zip v8, v9,v0,v1
+ zip v10,v11,v2,v3
+ zip v12,v13,v4,v5
+ zip v14,v15,v6,v7
+ stvx v8, 0,r3
+ stvx v9,r9,r3
+ stvx v10, 0,r4
+ stvx v11,r9,r4
+ stvx v12, 0,r5
+ stvx v13,r9,r5
+ stvx v14, 0,r6
+ stvx v15,r9,r6
+.else
stvx v0, 0,r3
stvx v4, 0,r5
stvx v3,r9,r4
stvx v7,r9,r6
- BF v1,v5,v1,v11
- BF v2,v6,v2,v13
stvx v1,r9,r3
stvx v5,r9,r5
stvx v2, 0,r4
stvx v6, 0,r6
+.endif
blr
+.endm
// void pass(float *z, float *wre, int n)
.macro PASS interleave, suffix
@@ -297,6 +338,9 @@ fft\n\suffix\()_altivec:
.macro DECL_FFTS interleave, suffix
.text
+ def_fft4 \suffix
+ def_fft8 \suffix
+ def_fft16 \suffix
PASS \interleave, \suffix
DECL_FFT \suffix, 5, 32, 16, 8
DECL_FFT \suffix, 6, 64, 32, 16
@@ -314,9 +358,9 @@ fft\n\suffix\()_altivec:
.rodata
.global EXTERN_ASM\()ff_fft_dispatch\suffix\()_altivec
EXTERN_ASM\()ff_fft_dispatch\suffix\()_altivec:
- PTR fft4_altivec
- PTR fft8_altivec
- PTR fft16_altivec
+ PTR fft4\suffix\()_altivec
+ PTR fft8\suffix\()_altivec
+ PTR fft16\suffix\()_altivec
PTR fft32\suffix\()_altivec
PTR fft64\suffix\()_altivec
PTR fft128\suffix\()_altivec