blob: 8e2fe6073d880520a95eed20e60fabe1e92c6fa3 (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
|
#include "robots_txt_parser.h"
#include <util/generic/string.h>
#include <util/stream/output.h>
TRobotsTxtParser::TRobotsTxtParser(IInputStream& inputStream)
: InputStream(inputStream)
, LineNumber(0)
, IsLastSymbolCR(false)
{
}
int TRobotsTxtParser::GetLineNumber() {
return LineNumber;
}
const char* TRobotsTxtParser::ReadLine() {
Line = "";
char c;
if (IsLastSymbolCR) {
if (!InputStream.ReadChar(c))
return nullptr;
if (c != '\n')
Line.append(c);
}
bool hasMoreSymbols;
while (hasMoreSymbols = InputStream.ReadChar(c)) {
if (c == '\r') {
IsLastSymbolCR = true;
break;
} else {
IsLastSymbolCR = false;
if (c == '\n')
break;
Line.append(c);
}
}
if (!hasMoreSymbols && Line.empty())
return nullptr;
// BOM UTF-8: EF BB BF
if (0 == LineNumber && Line.size() >= 3 && Line[0] == '\xEF' && Line[1] == '\xBB' && Line[2] == '\xBF')
Line = Line.substr(3, Line.size() - 3);
++LineNumber;
int i = Line.find('#');
if (i == 0)
Line = "";
else if (i > 0)
Line = Line.substr(0, i);
return Line.data();
}
// Returns true when every character of the NUL-terminated string s is
// whitespace according to isspace(); an empty string therefore counts as blank.
bool TRobotsTxtParser::IsBlankLine(const char* s) {
    for (const char* p = s; *p; ++p) {
        // isspace() is only defined for values representable as unsigned char
        // (or EOF); a plain char may be negative for bytes >= 0x80.
        if (!isspace(static_cast<unsigned char>(*p)))
            return false;
    }
    return true;
}
// Trims whitespace (as classified by isspace()) from both ends of the
// NUL-terminated string s, in place.
//
// Leading whitespace is skipped by advancing the returned pointer; trailing
// whitespace is cut off by writing a NUL terminator into the buffer. The
// result points into the caller's buffer, which must be writable.
char* TRobotsTxtParser::Trim(char* s) {
    // isspace() requires a value representable as unsigned char, hence the casts.
    while (isspace(static_cast<unsigned char>(*s)))
        ++s;

    // `end` is one past the last kept character. Working with a past-the-end
    // pointer avoids ever forming `s - 1` when the remaining string is empty.
    char* end = s + strlen(s);
    while (end > s && isspace(static_cast<unsigned char>(end[-1])))
        --end;
    *end = 0;

    return s;
}
// Returns true when the NUL-terminated string s contains a ':' character.
inline bool TRobotsTxtParser::IsRobotsLine(const char* s) {
    const char* const colon = strchr(s, ':');
    return colon != nullptr;
}
// Reports whether a line containing ':' is available.
//
// The currently buffered line is examined first; if it does not qualify, lines
// are read until one does. Returns false if the input ends before such a line
// is found.
bool TRobotsTxtParser::HasRecord() {
    for (;;) {
        if (IsRobotsLine(Line.data()))
            return true;
        if (ReadLine() == nullptr)
            return false;
    }
}
// Creates a record object that reads its field/value pairs through this parser.
TRobotsTxtRulesRecord TRobotsTxtParser::NextRecord() {
    return TRobotsTxtRulesRecord{*this};
}
TRobotsTxtRulesRecord::TRobotsTxtRulesRecord(TRobotsTxtParser& parser)
: Parser(parser)
{
}
// Extracts the next "field: value" pair from the parser.
//
// field          - receives the text before the first ':', passed through
//                  strlwr() and trimmed.
// value          - receives the text after the first ':', trimmed.
// handleErrors   - when true, the line number of every non-blank line that has
//                  no ':' is appended to nonRobotsLines.
// nonRobotsLines - output list used as described above.
// wasBlank       - optional; set to false on entry and to true if at least one
//                  blank line was skipped while looking for the pair.
//
// Returns true when a pair was produced and false when the input ran out.
bool TRobotsTxtRulesRecord::NextPair(TString& field, TString& value, bool handleErrors, TVector<int>& nonRobotsLines, bool* wasBlank) {
    if (wasBlank != nullptr) {
        *wasBlank = false;
    }

    // Keep reading until the buffered line contains a ':'.
    for (;;) {
        if (Parser.IsRobotsLine(Parser.Line.data()))
            break;

        if (Parser.ReadLine() == nullptr)
            return false;

        const char* const current = Parser.Line.data();
        if (Parser.IsBlankLine(current)) {
            if (wasBlank != nullptr) {
                *wasBlank = true;
            }
            continue;
        }

        const bool malformed = !Parser.IsRobotsLine(current);
        if (handleErrors && malformed)
            nonRobotsLines.push_back(Parser.GetLineNumber());
    }

    // Split the buffered line in place: the ':' is overwritten with NUL, so the
    // buffer, read as a C string, now ends before the separator.
    char* const lineStart = Parser.Line.begin();
    char* const separator = strchr(lineStart, ':');
    *separator = 0;
    char* const rawValue = separator + 1;

    field = TRobotsTxtParser::Trim(strlwr(lineStart));
    value = TRobotsTxtParser::Trim(rawValue);
    return true;
}
|