/* libavutil/x86/tx_float_init.c */

/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#define TX_FLOAT
#include "libavutil/tx_priv.h"
#include "libavutil/attributes.h"
#include "libavutil/x86/cpu.h"

#include "config.h"

/*
 * DECL_INIT_FN(basis, interleave) emits a static init callback named
 * ff_tx_fft_sr_codelet_init_b<basis>_i<interleave>_x86.
 *
 * The generated function:
 *   - takes the lookup-inversion setting from opts->invert_lookup, or uses 1
 *     when no options are supplied (opts == NULL);
 *   - calls ff_tx_init_tabs_float(len);
 *   - returns whatever ff_tx_gen_split_radix_parity_revtab() returns for the
 *     given basis/interleave pair.
 *
 * The cd, flags, inv and scale parameters are accepted but not used here.
 */
#define DECL_INIT_FN(basis, interleave)                                        \
static av_cold int                                                             \
    ff_tx_fft_sr_codelet_init_b ##basis## _i ##interleave## _x86               \
    (AVTXContext *s, const FFTXCodelet *cd, uint64_t flags,                    \
     FFTXCodeletOptions *opts, int len, int inv, const void *scale)            \
{                                                                              \
    int invert = 1;                                                            \
                                                                               \
    if (opts)                                                                  \
        invert = opts->invert_lookup;                                          \
                                                                               \
    ff_tx_init_tabs_float(len);                                                \
                                                                               \
    return ff_tx_gen_split_radix_parity_revtab(s, invert,                      \
                                               basis, interleave);             \
}

/*
 * DECL_SR_CD_DEF builds the .init field by pasting the init_fn argument into
 * ff_tx_fft_sr_codelet_init_<init_fn>_x86, so codelets declared with b0_i0
 * end up with a NULL init callback.
 */
#define ff_tx_fft_sr_codelet_init_b0_i0_x86 NULL
/* Instantiate the init callbacks for basis 8 with interleave 0 and 2. */
DECL_INIT_FN(8, 0)
DECL_INIT_FN(8, 2)

/*
 * DECL_SR_CD_DEF(fn_name, len, init_fn, fn_prio, cpu, fn_flags)
 *
 * Declares the prototype of the transform function ff_tx_<fn_name> (its
 * definition is not in this file) and defines a static codelet descriptor
 * ff_tx_<fn_name>_def that refers to it.
 *
 *   fn_name  - suffix used for the function, the descriptor and, stringized,
 *              the descriptor's .name
 *   len      - transform length; used for both .min_len and .max_len, so the
 *              codelet handles exactly one length
 *   init_fn  - pasted into ff_tx_fft_sr_codelet_init_<init_fn>_x86 to form
 *              the .init callback
 *   fn_prio  - value stored in .prio
 *   cpu      - pasted into AV_CPU_FLAG_<cpu> to form .cpu_flags
 *   fn_flags - extra flags OR-ed with FF_TX_OUT_OF_PLACE | FF_TX_ALIGNED
 *
 * Value parameters are parenthesized so that an argument containing an
 * operator of lower precedence than '|' (e.g. a conditional expression)
 * cannot regroup the surrounding expression.
 */
#define DECL_SR_CD_DEF(fn_name, len, init_fn, fn_prio, cpu, fn_flags)          \
void ff_tx_ ##fn_name(AVTXContext *s, void *out, void *in, ptrdiff_t stride);  \
static const FFTXCodelet ff_tx_ ##fn_name## _def = {                           \
    .name       = #fn_name,                                                    \
    .function   = ff_tx_ ##fn_name,                                            \
    .type       = TX_TYPE(FFT),                                                \
    .flags      = FF_TX_OUT_OF_PLACE | FF_TX_ALIGNED | (fn_flags),             \
    .factors[0] = 2,                                                           \
    .min_len    = (len),                                                       \
    .max_len    = (len),                                                       \
    .init       = ff_tx_fft_sr_codelet_init_ ##init_fn## _x86,                 \
    .cpu_flags  = AV_CPU_FLAG_ ##cpu,                                          \
    .prio       = (fn_prio),                                                   \
};

/*
 * Fixed-length codelets for lengths 2, 4, 8 and 16.
 * Columns: function suffix, length, init callback suffix, priority,
 * required CPU flag, extra flags.
 * Length 4 is split into a forward-only and an inverse-only variant; the
 * length 2 and 4 entries use b0_i0, i.e. no init callback.
 */
DECL_SR_CD_DEF(fft2_float_sse3,      2, b0_i0, 128, SSE3, AV_TX_INPLACE)
DECL_SR_CD_DEF(fft4_fwd_float_sse2,  4, b0_i0, 128, SSE2, AV_TX_INPLACE | FF_TX_FORWARD_ONLY)
DECL_SR_CD_DEF(fft4_inv_float_sse2,  4, b0_i0, 128, SSE2, AV_TX_INPLACE | FF_TX_INVERSE_ONLY)
DECL_SR_CD_DEF(fft8_float_sse3,      8, b8_i0, 128, SSE3, AV_TX_INPLACE)
DECL_SR_CD_DEF(fft8_float_avx,       8, b8_i0, 256, AVX,  AV_TX_INPLACE)
DECL_SR_CD_DEF(fft16_float_avx,     16, b8_i2, 256, AVX,  AV_TX_INPLACE)
DECL_SR_CD_DEF(fft16_float_fma3,    16, b8_i2, 288, FMA3, AV_TX_INPLACE)

#if ARCH_X86_64
/* Length-32 codelets; only declared when building for x86-64. */
DECL_SR_CD_DEF(fft32_float_avx,     32, b8_i2, 256, AVX,  AV_TX_INPLACE)
DECL_SR_CD_DEF(fft32_float_fma3,    32, b8_i2, 288, FMA3, AV_TX_INPLACE)

/*
 * Variable-length FFT codelet requiring AV_CPU_FLAG_AVX.
 * Unlike the fixed-size codelets it spans a range of lengths, from
 * 64 (1 << 6) up to 131072 (1 << 17), and does not advertise AV_TX_INPLACE.
 * The transform function itself is only declared here.
 */
void ff_tx_fft_sr_float_avx(AVTXContext *s, void *out, void *in, ptrdiff_t stride);
const FFTXCodelet ff_tx_fft_sr_float_avx_def = {
    /* Identity and entry points */
    .name       = "fft_sr_float_avx",
    .type       = TX_TYPE(FFT),
    .function   = ff_tx_fft_sr_float_avx,
    .init       = ff_tx_fft_sr_codelet_init_b8_i2_x86,

    /* Selection criteria */
    .cpu_flags  = AV_CPU_FLAG_AVX,
    .prio       = 256,
    .flags      = FF_TX_OUT_OF_PLACE | FF_TX_ALIGNED,

    /* Supported sizes */
    .factors[0] = 2,
    .min_len    = 1 << 6,
    .max_len    = 1 << 17,
};

#if HAVE_AVX2_EXTERNAL
/*
 * Variable-length FFT codelet requiring AV_CPU_FLAG_AVX2; only built when
 * HAVE_AVX2_EXTERNAL is set. It covers the same length range as the AVX
 * variant, 64 (1 << 6) to 131072 (1 << 17), with a higher priority value
 * (288 versus 256).
 */
void ff_tx_fft_sr_float_avx2(AVTXContext *s, void *out, void *in, ptrdiff_t stride);
const FFTXCodelet ff_tx_fft_sr_float_avx2_def = {
    /* Identity and entry points */
    .name       = "fft_sr_float_avx2",
    .type       = TX_TYPE(FFT),
    .function   = ff_tx_fft_sr_float_avx2,
    .init       = ff_tx_fft_sr_codelet_init_b8_i2_x86,

    /* Selection criteria */
    .cpu_flags  = AV_CPU_FLAG_AVX2,
    .prio       = 288,
    .flags      = FF_TX_OUT_OF_PLACE | FF_TX_ALIGNED,

    /* Supported sizes */
    .factors[0] = 2,
    .min_len    = 1 << 6,
    .max_len    = 1 << 17,
};
#endif
#endif

/*
 * NULL-terminated table of pointers to every x86 float codelet descriptor
 * defined in this file. The preprocessor guards mirror the ones around the
 * corresponding definitions, so an entry is listed only when its descriptor
 * exists in the build.
 */
const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = {
    /* Split-Radix codelets */
    &ff_tx_fft2_float_sse3_def,
    &ff_tx_fft4_fwd_float_sse2_def,
    &ff_tx_fft4_inv_float_sse2_def,
    &ff_tx_fft8_float_sse3_def,
    &ff_tx_fft8_float_avx_def,
    &ff_tx_fft16_float_avx_def,
    &ff_tx_fft16_float_fma3_def,

#if ARCH_X86_64
    &ff_tx_fft32_float_avx_def,
    &ff_tx_fft32_float_fma3_def,

    /* Standalone transforms */
    &ff_tx_fft_sr_float_avx_def,
#if HAVE_AVX2_EXTERNAL
    &ff_tx_fft_sr_float_avx2_def,
#endif
#endif

    /* List terminator */
    NULL,
};