aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/clickhouse/src/Functions/randomPrintableASCII.cpp
blob: 33c4c1405fdb44be998c754a5d2506a1ad8096e3 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#include <Functions/IFunction.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <Columns/ColumnString.h>
#include <DataTypes/DataTypeString.h>
#include <pcg_random.hpp>
#include <Common/randomSeed.h>


namespace DB
{

/// Error codes used by this file. They are only declared here (extern); the definitions live elsewhere.
namespace ErrorCodes
{
    /// Thrown by getReturnTypeImpl when the function gets zero or more than two arguments.
    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
    /// Thrown by getReturnTypeImpl when the first argument is not of a numeric type.
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
    /// Thrown by executeImpl when a requested string length exceeds 1 << 30.
    extern const int TOO_LARGE_STRING_SIZE;
}

namespace
{

/** Generate random string of specified length with printable ASCII characters, almost uniformly distributed.
  * The first argument is the length. An optional second argument is accepted but ignored: it only serves to prevent
  * common subexpression elimination, so that several calls in one query can return different values.
  */
class FunctionRandomPrintableASCII : public IFunction
{
public:
    static constexpr auto name = "randomPrintableASCII";

    /// Factory entry point; the context argument is not used.
    static FunctionPtr create(ContextPtr)
    {
        return std::make_shared<FunctionRandomPrintableASCII>();
    }

    String getName() const override { return name; }

    /// The number of arguments is validated by hand in getReturnTypeImpl,
    /// so the function is declared variadic with a nominal argument count of zero.
    size_t getNumberOfArguments() const override { return 0; }
    bool isVariadic() const override { return true; }

    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; }

    /// The result is random, so it is reported as non-deterministic, also within a single query.
    bool isDeterministic() const override { return false; }
    bool isDeterministicInScopeOfQuery() const override { return false; }

    /** Validates the argument list and returns the result type (always String).
      * Accepts one or two arguments; the first one must be numeric.
      * Throws NUMBER_OF_ARGUMENTS_DOESNT_MATCH or ILLEGAL_TYPE_OF_ARGUMENT otherwise.
      */
    DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
    {
        const size_t arguments_count = arguments.size();

        if (arguments_count == 0)
            throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
                "Function {} requires at least one argument: the size of resulting string", getName());

        if (arguments_count > 2)
            throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
                "Function {} requires at most two arguments: the size of resulting string and optional disambiguation tag", getName());

        if (!isNumber(*arguments[0]))
            throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "First argument of function {} must have numeric type", getName());

        return std::make_shared<DataTypeString>();
    }

    /** Builds a string column with one random printable ASCII string per input row.
      * The length of each string is read from the first argument for that row;
      * a length above 1 << 30 throws TOO_LARGE_STRING_SIZE.
      * Each string is followed by a zero byte, which is included in the stored offsets.
      */
    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
    {
        auto result = ColumnString::create();
        ColumnString::Chars & chars = result->getChars();
        ColumnString::Offsets & offsets = result->getOffsets();
        offsets.resize(input_rows_count);

        pcg64_fast generator(randomSeed());

        const IColumn & lengths = *arguments[0].column;

        IColumn::Offset row_begin = 0;
        for (size_t row = 0; row < input_rows_count; ++row)
        {
            const size_t requested_length = lengths.getUInt(row);
            if (requested_length > (1 << 30))
                throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "Too large string size in function {}", getName());

            /// Position of the terminating zero byte; the next row starts right after it.
            const IColumn::Offset payload_end = row_begin + requested_length;
            const IColumn::Offset row_end = payload_end + 1;

            chars.resize(row_end);
            offsets[row] = row_end;

            /// Take the raw pointer only after the resize above.
            /// Writing through it avoids the assert on array indexing after end.
            auto * out = chars.data();

            /// Four characters are produced per step, so the last step may write up to three bytes past payload_end.
            /// We have padding in column buffers that we can overwrite.
            for (size_t cursor = row_begin; cursor < payload_end; cursor += 4)
            {
                const UInt64 bits = generator();

                /// Split the 64 random bits into four independent 16-bit values.
                const UInt16 part0 = bits;
                const UInt16 part1 = bits >> 16;
                const UInt16 part2 = bits >> 32;
                const UInt16 part3 = bits >> 48;

                /// Printable characters are from range [32; 126], that is 95 distinct values.
                /// (x * 95) >> 16 maps a 16-bit x into [0; 94] without a modulo operation, see
                /// https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/

                out[cursor + 0] = 32 + ((part0 * 95) >> 16);
                out[cursor + 1] = 32 + ((part1 * 95) >> 16);
                out[cursor + 2] = 32 + ((part2 * 95) >> 16);
                out[cursor + 3] = 32 + ((part3 * 95) >> 16);

                /// NOTE gcc failed to vectorize this code (aliasing of char?)
                /// TODO Implement SIMD optimizations from Danila Kutenin.
            }

            /// Terminating zero byte; it also overwrites any random byte spilled there by the last step.
            chars[payload_end] = 0;

            row_begin = row_end;
        }

        return result;
    }
};

}

/// Registers FunctionRandomPrintableASCII in the function factory.
/// NOTE(review): `factory` is presumably introduced by the REGISTER_FUNCTION macro, which is defined outside this file — confirm.
REGISTER_FUNCTION(RandomPrintableASCII)
{
    factory.registerFunction<FunctionRandomPrintableASCII>();
}

}