aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/robots_txt/robots_txt_parser.cpp
blob: 8e2fe6073d880520a95eed20e60fabe1e92c6fa3 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#include "robots_txt_parser.h"
#include <util/generic/string.h>
#include <util/stream/output.h>

// Constructs a parser over inputStream. The InputStream member is initialized
// from the argument, the line counter starts at 0 and no carriage return is
// recorded as pending.
TRobotsTxtParser::TRobotsTxtParser(IInputStream& inputStream)
    : InputStream(inputStream)
    , LineNumber(0)
    , IsLastSymbolCR(false)
{
}

// Returns the current value of LineNumber. ReadLine() increments it once for
// every line it hands back, so after a successful read this is the 1-based
// number of that line.
int TRobotsTxtParser::GetLineNumber() {
    return LineNumber;
}

const char* TRobotsTxtParser::ReadLine() {
    Line = "";
    char c;

    if (IsLastSymbolCR) {
        if (!InputStream.ReadChar(c))
            return nullptr;
        if (c != '\n')
            Line.append(c);
    }

    bool hasMoreSymbols;
    while (hasMoreSymbols = InputStream.ReadChar(c)) {
        if (c == '\r') {
            IsLastSymbolCR = true;
            break;
        } else {
            IsLastSymbolCR = false;
            if (c == '\n')
                break;
            Line.append(c);
        }
    }
    if (!hasMoreSymbols && Line.empty())
        return nullptr;

    // BOM UTF-8: EF BB BF
    if (0 == LineNumber && Line.size() >= 3 && Line[0] == '\xEF' && Line[1] == '\xBB' && Line[2] == '\xBF')
        Line = Line.substr(3, Line.size() - 3);

    ++LineNumber;
    int i = Line.find('#');
    if (i == 0)
        Line = "";
    else if (i > 0)
        Line = Line.substr(0, i);
    return Line.data();
}

// Returns true when the NUL-terminated string s consists solely of whitespace
// characters (as classified by isspace), which includes the empty string.
bool TRobotsTxtParser::IsBlankLine(const char* s) {
    for (const char* p = s; *p; ++p) {
        // isspace() is only defined for values representable as unsigned char
        // (or EOF); bytes >= 0x80 are negative when char is signed, so convert
        // before the call.
        if (!isspace(static_cast<unsigned char>(*p)))
            return false;
    }
    return true;
}

// Strips leading and trailing whitespace from the NUL-terminated string s in
// place. Returns a pointer to the first non-whitespace character inside the
// original buffer; trailing whitespace is cut off by writing a terminating
// NUL behind the last character that is kept.
char* TRobotsTxtParser::Trim(char* s) {
    // isspace() requires a value representable as unsigned char, hence the
    // casts: plain char may be negative for bytes >= 0x80.
    while (isspace(static_cast<unsigned char>(*s)))
        ++s;

    // Work with a length rather than a "last character" pointer so that an
    // empty remainder never forms a pointer in front of s.
    size_t length = strlen(s);
    while (length > 0 && isspace(static_cast<unsigned char>(s[length - 1])))
        --length;
    s[length] = 0;
    return s;
}

// A line is treated as a robots.txt "field: value" line when the
// NUL-terminated string s contains at least one ':' character.
inline bool TRobotsTxtParser::IsRobotsLine(const char* s) {
    const char* const colon = strchr(s, ':');
    return colon != nullptr;
}

// Advances through the input until Line holds a line containing ':'.
// Returns true as soon as such a line is current (without reading further if
// it already is), and false when ReadLine() reports the end of the input first.
bool TRobotsTxtParser::HasRecord() {
    for (;;) {
        if (IsRobotsLine(Line.data())) {
            return true;
        }
        if (!ReadLine()) {
            return false;
        }
    }
}

// Returns a TRobotsTxtRulesRecord constructed from this parser; the record
// reads its field/value pairs through the parser it was given.
TRobotsTxtRulesRecord TRobotsTxtParser::NextRecord() {
    return TRobotsTxtRulesRecord(*this);
}

// Constructs a record bound to parser: the Parser member is initialized from
// the argument and is what NextPair() reads lines through.
TRobotsTxtRulesRecord::TRobotsTxtRulesRecord(TRobotsTxtParser& parser)
    : Parser(parser)
{
}

// Extracts the next "field: value" pair from the parser.
//
// field          receives the lower-cased, trimmed text before the first ':'.
// value          receives the trimmed text after the first ':'.
// handleErrors   when true, the numbers of non-blank lines that contain no ':'
//                are appended to nonRobotsLines.
// nonRobotsLines collects those line numbers (only touched if handleErrors).
// wasBlank       optional; set to false on entry and to true if a blank line
//                was skipped while looking for the pair.
//
// Returns false when the input ends before another pair is found, true
// otherwise.
bool TRobotsTxtRulesRecord::NextPair(TString& field, TString& value, bool handleErrors, TVector<int>& nonRobotsLines, bool* wasBlank) {
    if (wasBlank)
        *wasBlank = false;

    // Keep reading until the parser's current line contains a ':'.
    while (!Parser.IsRobotsLine(Parser.Line.data())) {
        if (!Parser.ReadLine())
            return false;

        if (Parser.IsBlankLine(Parser.Line.data())) {
            if (wasBlank)
                *wasBlank = true;
        } else if (handleErrors && !Parser.IsRobotsLine(Parser.Line.data())) {
            nonRobotsLines.push_back(Parser.GetLineNumber());
        }
    }

    // Split the line in place at the first ':'. Overwriting the colon with NUL
    // also means the current line no longer passes IsRobotsLine(), so the next
    // call moves on to a fresh line.
    char* const lineStart = Parser.Line.begin();
    char* const colon = strchr(lineStart, ':');
    *colon = 0;
    char* const rawValue = colon + 1;

    field = TRobotsTxtParser::Trim(strlwr(lineStart));
    value = TRobotsTxtParser::Trim(rawValue);
    return true;
}