about | summary | refs | log | tree | commit | diff | stats
path: root/libavutil/tx_template.c
diff options
context:
space:
mode:
author: Lynne <dev@lynne.ee>, 2022-11-17 20:10:45 +0100
committer: Lynne <dev@lynne.ee>, 2022-11-24 15:58:30 +0100
commit: 68cabf875015610decda7e564dc5697f6c21f707 (patch)
tree: 48583892cf2514a2c69418e60c1781e5b1112c9f /libavutil/tx_template.c
parent: d4e39cae2e250a6fb9ed3a3a5a93694f4d445165 (diff)
download: ffmpeg-68cabf875015610decda7e564dc5697f6c21f707.tar.gz
lavu/tx: add fft_inplace_small transforms
This is much faster than the loop.
Diffstat (limited to 'libavutil/tx_template.c')
-rw-r--r-- libavutil/tx_template.c | 34
1 files changed, 31 insertions, 3 deletions
diff --git a/libavutil/tx_template.c b/libavutil/tx_template.c
index 5274133ec4..747731a06d 100644
--- a/libavutil/tx_template.c
+++ b/libavutil/tx_template.c
@@ -754,20 +754,34 @@ static av_cold int TX_NAME(ff_tx_fft_init)(AVTXContext *s,
return 0;
}
+static av_cold int TX_NAME(ff_tx_fft_inplace_small_init)(AVTXContext *s, /* Init for the fft_inplace_small codelet: allocates the s->tmp scratch buffer, then defers to ff_tx_fft_init. */
+ const FFTXCodelet *cd,
+ uint64_t flags,
+ FFTXCodeletOptions *opts,
+ int len, int inv,
+ const void *scale)
+{
+ if (!(s->tmp = av_malloc(len*sizeof(*s->tmp)))) /* scratch buffer of len elements, each sizeof(*s->tmp) bytes */
+ return AVERROR(ENOMEM); /* allocation failed */
+ flags &= ~AV_TX_INPLACE; /* only the local copy handed to ff_tx_fft_init is cleared; NOTE(review): ff_tx_fft still tests s->flags for AV_TX_INPLACE - confirm where s->flags is set */
+ return TX_NAME(ff_tx_fft_init)(s, cd, flags, opts, len, inv, scale); /* result is returned as-is; NOTE(review): s->tmp is not freed here on failure - presumably the caller's cleanup does it, verify */
+}
+
static void TX_NAME(ff_tx_fft)(AVTXContext *s, void *_dst,
void *_src, ptrdiff_t stride)
{
TXComplex *src = _src;
- TXComplex *dst = _dst;
+ TXComplex *dst1 = s->flags & AV_TX_INPLACE ? s->tmp : _dst; /* destination of the permuted copy: the s->tmp scratch buffer when AV_TX_INPLACE is set in s->flags, otherwise the caller's output */
+ TXComplex *dst2 = _dst; /* the sub-transform always writes its result to the caller's output */
int *map = s->sub[0].map;
int len = s->len;
/* Compilers can't vectorize this anyway without assuming AVX2, which they
* generally don't, at least without -march=native -mtune=native */
for (int i = 0; i < len; i++)
- dst[i] = src[map[i]];
+ dst1[i] = src[map[i]]; /* gather the input through the first sub-transform's map */
- s->fn[0](&s->sub[0], dst, dst, stride);
+ s->fn[0](&s->sub[0], dst2, dst1, stride); /* sub-transform reads dst1 and writes dst2; both are _dst unless AV_TX_INPLACE is set */
}
static void TX_NAME(ff_tx_fft_inplace)(AVTXContext *s, void *_dst,
@@ -807,6 +821,19 @@ static const FFTXCodelet TX_NAME(ff_tx_fft_def) = {
.prio = FF_TX_PRIO_BASE,
};
+static const FFTXCodelet TX_NAME(ff_tx_fft_inplace_small_def) = { /* Codelet descriptor for "fft_inplace_small": pairs the ff_tx_fft function with the scratch-buffer-allocating init. */
+ .name = TX_NAME_STR("fft_inplace_small"),
+ .function = TX_NAME(ff_tx_fft), /* same transform function that ff_tx_fft_def is built around */
+ .type = TX_TYPE(FFT),
+ .flags = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | AV_TX_INPLACE, /* carries both the out-of-place and the in-place flag */
+ .factors[0] = TX_FACTOR_ANY,
+ .min_len = 2,
+ .max_len = 65536, /* upper length bound; it also bounds the len-element s->tmp allocation made by the init */
+ .init = TX_NAME(ff_tx_fft_inplace_small_init), /* allocates s->tmp before calling ff_tx_fft_init */
+ .cpu_flags = FF_TX_CPU_FLAGS_ALL,
+ .prio = FF_TX_PRIO_BASE - 256, /* 256 below the FF_TX_PRIO_BASE given to ff_tx_fft_def; NOTE(review): presumably ranks this codelet lower - confirm against the selection code */
+};
+
static const FFTXCodelet TX_NAME(ff_tx_fft_inplace_def) = {
.name = TX_NAME_STR("fft_inplace"),
.function = TX_NAME(ff_tx_fft_inplace),
@@ -1638,6 +1665,7 @@ const FFTXCodelet * const TX_NAME(ff_tx_codelet_list)[] = {
/* Standalone transforms */
&TX_NAME(ff_tx_fft_def),
&TX_NAME(ff_tx_fft_inplace_def),
+ &TX_NAME(ff_tx_fft_inplace_small_def),
&TX_NAME(ff_tx_fft_pfa_3xM_def),
&TX_NAME(ff_tx_fft_pfa_5xM_def),
&TX_NAME(ff_tx_fft_pfa_7xM_def),