aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/clickhouse/src/Processors/Formats/Impl/RegexpRowInputFormat.h
blob: 7417d48d8c18f1603679346ea257a9a7e23bf9d5 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#pragma once

#include <re2_st/re2.h>
#include <string>
#include <vector>
#include <Core/Block.h>
#include <IO/PeekableReadBuffer.h>
#include <Processors/Formats/IRowInputFormat.h>
#include <Processors/Formats/ISchemaReader.h>
#include <Formats/FormatSettings.h>
#include <Formats/FormatFactory.h>
#include <Formats/ParsedTemplateFormatString.h>
#include <Formats/SchemaInferenceUtils.h>


namespace DB
{

class ReadBuffer;

/// Class for extracting row fields from data by regexp.
/// It is constructed from FormatSettings and keeps the fields captured by the most recent
/// parseRow() call available through getField().
class RegexpFieldExtractor
{
public:
    explicit RegexpFieldExtractor(const FormatSettings & format_settings);

    /// Return true if row was successfully parsed and row fields were extracted.
    bool parseRow(PeekableReadBuffer & buf);

    /// Return the matched field stored at position `index`.
    /// No bounds check is performed: `index` must be less than getMatchedFieldsSize().
    /// Const-qualified, like the other accessors, because it only reads the extractor's state.
    /// NOTE(review): the returned view presumably refers to memory owned by the buffer passed
    /// to parseRow() - confirm its lifetime against the implementation.
    std::string_view getField(size_t index) const { return matched_fields[index]; }

    /// Number of entries currently held in the matched fields vector.
    size_t getMatchedFieldsSize() const { return matched_fields.size(); }

    /// Number of capturing groups in the compiled regexp.
    size_t getNumberOfGroups() const { return regexp.NumberOfCapturingGroups(); }

private:
    // Members are initialized in declaration order; keep this order in sync with the constructor.
    const re2_st::RE2 regexp;
    // The vector of fields extracted from line using regexp.
    std::vector<std::string_view> matched_fields;
    // These two vectors are needed to use RE2::FullMatchN (function for extracting fields).
    std::vector<re2_st::RE2::Arg> re2_arguments;
    std::vector<re2_st::RE2::Arg *> re2_arguments_ptrs;
    // NOTE(review): presumably mirrors the format_regexp_skip_unmatched setting - confirm in the constructor.
    bool skip_unmatched;
};

/// Regexp input format.
/// This format applies the regular expression from the format_regexp setting to every line of the file
/// (lines must be separated by a newline character ('\n') or a DOS-style newline ("\r\n")).
/// Every matched subpattern is parsed with the method of the corresponding data type
/// (according to the format_regexp_escaping_rule setting). If the regexp does not match a line,
/// the line is silently skipped when format_regexp_skip_unmatched is 1; when it is 0, an exception is thrown.

class RegexpRowInputFormat final : public IRowInputFormat
{
public:
    /// Construct the format over the input buffer `in_`, with the column header `header_`,
    /// the row input parameters `params_` and the format settings `format_settings_`.
    RegexpRowInputFormat(ReadBuffer & in_, const Block & header_, Params params_, const FormatSettings & format_settings_);

    /// Name of this format implementation.
    String getName() const override { return "RegexpRowInputFormat"; }
    /// Overrides of the base class hooks; both are defined out of line.
    void resetParser() override;
    void setReadBuffer(ReadBuffer & in_) override;

private:
    /// Private overload that receives an already created PeekableReadBuffer and takes ownership of it.
    /// NOTE(review): presumably the public constructor delegates here - confirm in the .cpp.
    RegexpRowInputFormat(std::unique_ptr<PeekableReadBuffer> buf_, const Block & header_, Params params_, const FormatSettings & format_settings_);

    using EscapingRule = FormatSettings::EscapingRule;

    /// Row-reading entry point required by IRowInputFormat; defined out of line.
    bool readRow(MutableColumns & columns, RowReadExtension & ext) override;

    /// Helpers for filling `columns`: one for the single field at position `index`,
    /// one for the whole set of matched fields. Both are defined out of line.
    bool readField(size_t index, MutableColumns & columns);
    void readFieldsFromMatch(MutableColumns & columns, RowReadExtension & ext);

    // Members are initialized in declaration order; keep this order in sync with the constructors.
    /// Owned peekable buffer.
    std::unique_ptr<PeekableReadBuffer> buf;
    /// Copy of the format settings (stored by value, not by reference).
    const FormatSettings format_settings;
    /// NOTE(review): presumably taken from the format_regexp_escaping_rule setting - confirm in the constructor.
    const EscapingRule escaping_rule;
    /// Extracts the fields of each row by regexp.
    RegexpFieldExtractor field_extractor;
};

/// Schema reader for the Regexp format, built on IRowSchemaReader.
/// It owns its own RegexpFieldExtractor and PeekableReadBuffer; the logic is defined out of line.
class RegexpSchemaReader : public IRowSchemaReader
{
public:
    RegexpSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings);

private:
    /// IRowSchemaReader hook: returns the data types for one row, or an empty optional.
    /// NOTE(review): the empty optional presumably signals that there are no more rows - confirm in the .cpp.
    std::optional<DataTypes> readRowAndGetDataTypes() override;

    /// IRowSchemaReader hook that may adjust `type` and `new_type` in place (both are passed by reference).
    void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type) override;


    using EscapingRule = FormatSettings::EscapingRule;
    // Members are initialized in declaration order; keep this order in sync with the constructor.
    RegexpFieldExtractor field_extractor;
    PeekableReadBuffer buf;
    /// NOTE(review): presumably state used when inferring JSON types - confirm against its usages.
    JSONInferenceInfo json_inference_info;
};

}