aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/clickhouse/src/Storages/ReadInOrderOptimizer.cpp
blob: e0ef967d4910111d0ab15bf8a0c1303736e301e3 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
#include <Storages/ReadInOrderOptimizer.h>

#include <Interpreters/ExpressionActions.h>
#include <Interpreters/ExpressionAnalyzer.h>
#include <Interpreters/TreeRewriter.h>
#include <Interpreters/replaceAliasColumnsInQuery.h>
#include <Functions/IFunction.h>
#include <Functions/FunctionFactory.h>
#include <Interpreters/TableJoin.h>
#include <Interpreters/Context.h>
#include <Parsers/ASTSelectQuery.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTIdentifier.h>

namespace DB
{

namespace ErrorCodes
{
    extern const int LOGICAL_ERROR;
}

namespace
{

/// Finds expression like x = 'y' or f(x) = 'y',
/// where `x` is identifier, 'y' is literal and `f` is injective functions.
ASTPtr getFixedPoint(const ASTPtr & ast, const ContextPtr & context)
{
    const auto * func = ast->as<ASTFunction>();
    if (!func || func->name != "equals")
        return nullptr;

    if (!func->arguments || func->arguments->children.size() != 2)
        return nullptr;

    const auto & lhs = func->arguments->children[0];
    const auto & rhs = func->arguments->children[1];

    if (!lhs->as<ASTLiteral>() && !rhs->as<ASTLiteral>())
        return nullptr;

    /// Case of two literals doesn't make sense.
    if (lhs->as<ASTLiteral>() && rhs->as<ASTLiteral>())
        return nullptr;

    /// If indetifier is wrapped into injective functions, remove them.
    auto argument = lhs->as<ASTLiteral>() ? rhs : lhs;
    while (const auto * arg_func = argument->as<ASTFunction>())
    {
        if (!arg_func->arguments || arg_func->arguments->children.size() != 1)
            return nullptr;

        auto func_resolver = FunctionFactory::instance().tryGet(arg_func->name, context);
        if (!func_resolver || !func_resolver->isInjective({}))
            return nullptr;

        argument = arg_func->arguments->children[0];
    }

    return argument->as<ASTIdentifier>() ? argument : nullptr;
}

NameSet getFixedSortingColumns(
    const ASTSelectQuery & query, const Names & sorting_key_columns, const ContextPtr & context)
{
    ASTPtr condition;
    if (query.where() && query.prewhere())
        condition = makeASTFunction("and", query.where(), query.prewhere());
    else if (query.where())
        condition = query.where();
    else if (query.prewhere())
        condition = query.prewhere();

    if (!condition)
        return {};

    /// Convert condition to CNF for more convenient analysis.
    auto cnf = TreeCNFConverter::tryConvertToCNF(condition);
    if (!cnf)
        return {};

    NameSet fixed_points;
    NameSet sorting_key_columns_set(sorting_key_columns.begin(), sorting_key_columns.end());

    /// If we met expression like 'column = x', where 'x' is literal,
    /// in clause of size 1 in CNF, then we can guarantee
    /// that in all filtered rows 'column' will be equal to 'x'.
    cnf->iterateGroups([&](const auto & group)
    {
        if (group.size() == 1 && !group.begin()->negative)
        {
            auto fixed_point = getFixedPoint(group.begin()->ast, context);
            if (fixed_point)
            {
                auto column_name = fixed_point->getColumnName();
                if (sorting_key_columns_set.contains(column_name))
                    fixed_points.insert(column_name);
            }
        }
    });

    return fixed_points;
}

struct MatchResult
{
    /// One of {-1, 0, 1} - direction of the match. 0 means - doesn't match.
    int direction = 0;
    /// If true then current key must be the last in the matched prefix of sort description.
    bool is_last_key = false;
};

/// Optimize in case of exact match with order key element
/// or in some simple cases when order key element is wrapped into monotonic function.
MatchResult matchSortDescriptionAndKey(
    const ExpressionActions::Actions & actions,
    const SortColumnDescription & sort_column,
    const String & sorting_key_column)
{
    /// If required order depend on collation, it cannot be matched with primary key order.
    /// Because primary keys cannot have collations.
    if (sort_column.collator)
        return {};

    MatchResult result{sort_column.direction, false};

    /// For the path: order by (sort_column, ...)
    if (sort_column.column_name == sorting_key_column)
        return result;

    /// For the path: order by (function(sort_column), ...)
    /// Allow only one simple monotonic functions with one argument
    /// Why not allow multi monotonic functions?
    bool found_function = false;

    for (const auto & action : actions)
    {
        if (action.node->type != ActionsDAG::ActionType::FUNCTION)
            continue;

        if (found_function)
            return {};

        found_function = true;
        if (action.node->children.size() != 1 || action.node->children.at(0)->result_name != sorting_key_column)
            return {};

        const auto & func = *action.node->function_base;
        if (!func.hasInformationAboutMonotonicity())
            return {};

        auto monotonicity = func.getMonotonicityForRange(*func.getArgumentTypes().at(0), {}, {});
        if (!monotonicity.is_monotonic)
            return {};

        /// If function is not strict monotonic, it can break order
        /// if it's not last in the prefix of sort description.
        /// E.g. if we have ORDER BY (d, u) -- ('2020-01-01', 1), ('2020-01-02', 0), ('2020-01-03', 1)
        /// ORDER BY (toStartOfMonth(d), u) -- ('2020-01-01', 1), ('2020-01-01', 0), ('2020-01-01', 1)
        if (!monotonicity.is_strict)
            result.is_last_key = true;

        if (!monotonicity.is_positive)
            result.direction *= -1;
    }

    if (!found_function)
        return {};

    return result;
}

}

ReadInOrderOptimizer::ReadInOrderOptimizer(
    const ASTSelectQuery & query_,
    const ManyExpressionActions & elements_actions_,
    const SortDescription & required_sort_description_,
    const TreeRewriterResultPtr & syntax_result)
    : elements_actions(elements_actions_)
    , required_sort_description(required_sort_description_)
    , query(query_)
{
    if (elements_actions.size() != required_sort_description.size())
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Sizes of sort description and actions are mismatched");

    /// Do not analyze joined columns.
    /// They may have aliases and come to description as is.
    /// We can mismatch them with order key columns at stage of fetching columns.
    forbidden_columns = syntax_result->getArrayJoinSourceNameSet();

    // array join result columns cannot be used in alias expansion.
    array_join_result_to_source = syntax_result->array_join_result_to_source;
}

InputOrderInfoPtr ReadInOrderOptimizer::getInputOrderImpl(
    const StorageMetadataPtr & metadata_snapshot,
    const SortDescription & description,
    const ManyExpressionActions & actions,
    const ContextPtr & context,
    UInt64 limit) const
{
    const Names & sorting_key_columns = metadata_snapshot->getSortingKeyColumns();
    int read_direction = description.at(0).direction;

    auto fixed_sorting_columns = getFixedSortingColumns(query, sorting_key_columns, context);

    SortDescription sort_description_for_merging;
    sort_description_for_merging.reserve(description.size());

    size_t desc_pos = 0;
    size_t key_pos = 0;

    while (desc_pos < description.size() && key_pos < sorting_key_columns.size())
    {
        if (forbidden_columns.contains(description[desc_pos].column_name))
            break;

        auto match = matchSortDescriptionAndKey(actions[desc_pos]->getActions(), description[desc_pos], sorting_key_columns[key_pos]);
        bool is_matched = match.direction && (desc_pos == 0 || match.direction == read_direction);

        if (!is_matched)
        {
            /// If one of the sorting columns is constant after filtering,
            /// skip it, because it won't affect order anymore.
            if (fixed_sorting_columns.contains(sorting_key_columns[key_pos]))
            {
                ++key_pos;
                continue;
            }

            break;
        }

        if (desc_pos == 0)
            read_direction = match.direction;

        sort_description_for_merging.push_back(description[desc_pos]);

        ++desc_pos;
        ++key_pos;

        if (match.is_last_key)
            break;
    }

    if (sort_description_for_merging.empty())
        return {};

    return std::make_shared<InputOrderInfo>(std::move(sort_description_for_merging), key_pos, read_direction, limit);
}

InputOrderInfoPtr ReadInOrderOptimizer::getInputOrder(
    const StorageMetadataPtr & metadata_snapshot, ContextPtr context, UInt64 limit) const
{
    if (!metadata_snapshot->hasSortingKey())
        return {};

    auto aliased_columns = metadata_snapshot->getColumns().getAliases();

    /// Replace alias column with proper expressions.
    /// Currently we only support alias column without any function wrapper,
    /// i.e.: `order by aliased_column` can have this optimization, but `order by function(aliased_column)` can not.
    /// This suits most cases.
    if (context->getSettingsRef().optimize_respect_aliases && !aliased_columns.empty())
    {
        SortDescription aliases_sort_description = required_sort_description;
        ManyExpressionActions aliases_actions = elements_actions;

        for (size_t i = 0; i < required_sort_description.size(); ++i)
        {
            if (!aliased_columns.contains(required_sort_description[i].column_name))
                continue;

            auto column_expr = metadata_snapshot->getColumns().get(required_sort_description[i].column_name).default_desc.expression->clone();
            replaceAliasColumnsInQuery(column_expr, metadata_snapshot->getColumns(), array_join_result_to_source, context);

            auto syntax_analyzer_result = TreeRewriter(context).analyze(column_expr, metadata_snapshot->getColumns().getAll());
            auto expression_analyzer = ExpressionAnalyzer(column_expr, syntax_analyzer_result, context);

            aliases_sort_description[i].column_name = column_expr->getColumnName();
            aliases_actions[i] = expression_analyzer.getActions(true);
        }

        return getInputOrderImpl(metadata_snapshot, aliases_sort_description, aliases_actions, context, limit);
    }

    return getInputOrderImpl(metadata_snapshot, required_sort_description, elements_actions, context, limit);
}

}