diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/cppparser | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/cppparser')
-rw-r--r-- | library/cpp/cppparser/README.md | 3 | ||||
-rw-r--r-- | library/cpp/cppparser/parser.cpp | 739 | ||||
-rw-r--r-- | library/cpp/cppparser/parser.h | 99 | ||||
-rw-r--r-- | library/cpp/cppparser/ya.make | 9 |
4 files changed, 850 insertions, 0 deletions
diff --git a/library/cpp/cppparser/README.md b/library/cpp/cppparser/README.md new file mode 100644 index 0000000000..a498ef2a0f --- /dev/null +++ b/library/cpp/cppparser/README.md @@ -0,0 +1,3 @@ +A simple parser of C++ codes (only lexical analysis, no semantic checking) + +It is similar to a sax-parser by its interface. diff --git a/library/cpp/cppparser/parser.cpp b/library/cpp/cppparser/parser.cpp new file mode 100644 index 0000000000..3bd968b459 --- /dev/null +++ b/library/cpp/cppparser/parser.cpp @@ -0,0 +1,739 @@ +#include <util/generic/hash.h> +#include <util/string/cast.h> +#include <util/generic/hash_set.h> +#include <util/generic/yexception.h> + +#include "parser.h" + +//#define DEBUG_ME 1 + +TCppSaxParser::TText::TText() + : Offset(0) +{ +} + +TCppSaxParser::TText::TText(ui64 offset) + : Offset(offset) +{ +} + +TCppSaxParser::TText::TText(const TString& data, ui64 offset) + : Data(data) + , Offset(offset) +{ +} + +TCppSaxParser::TText::~TText() = default; + +void TCppSaxParser::TText::Reset() noexcept { + Offset += Data.length(); + Data.clear(); +} + +TCppSaxParser::TWorker::TWorker() noexcept = default; + +TCppSaxParser::TWorker::~TWorker() = default; + +class TCppSaxParser::TImpl { + enum EState { + Code, + CommentBegin, + String, + Character, + OneLineComment, + MultiLineComment, + MultiLineCommentEnd, + Preprocessor + }; + +public: + typedef TCppSaxParser::TText TText; + typedef TCppSaxParser::TWorker TWorker; + + inline TImpl(TWorker* worker) + : State_(Code) + , Worker_(worker) + , SkipNext_(false) + , Line_(0) + , Column_(0) + { + Worker_->DoStart(); + } + + inline ~TImpl() = default; + + inline void Write(const void* data, size_t len) { + ProcessInput((const char*)data, len); + } + + inline void Finish() { + if (!Text_.Data.empty()) { + switch (State_) { + case Code: + Worker_->DoCode(Text_); + + break; + + case Preprocessor: + Worker_->DoPreprocessor(Text_); + + break; + + case OneLineComment: + Worker_->DoOneLineComment(Text_); + + break; + + default: + ThrowError(); + } + } + + Worker_->DoEnd(); + } + +private: + inline void ProcessInput(const char* data, size_t len) { + EState savedState = Code; + while (len) { + const char ch = *data; + + if (ch == '\n') { + ++Line_; + Column_ = 0; + } else { + ++Column_; + } + +#if DEBUG_ME + Cerr << "char: " << ch << Endl; + Cerr << "state before: " << (unsigned int)State_ << Endl; +#endif + + retry: + switch (State_) { + case Code: { + savedState = Code; + switch (ch) { + case '/': + State_ = CommentBegin; + + break; + + case '"': + Action(ch); + State_ = String; + + break; + + case '\'': + Action(ch); + State_ = Character; + + break; + + case '#': + Action(ch); + State_ = Preprocessor; + + break; + + default: + Text_.Data += ch; + + break; + } + + break; + } + + case CommentBegin: { + switch (ch) { + case '/': + State_ = savedState; + savedState = Code; + Action("//"); + State_ = OneLineComment; + + break; + + case '*': + State_ = savedState; + Action("/*"); + State_ = MultiLineComment; + + break; + + default: + Text_.Data += '/'; + State_ = savedState; + + goto retry; + } + + break; + } + + case OneLineComment: { + switch (ch) { + case '\n': + Action(ch); + State_ = Code; + + break; + + default: + Text_.Data += ch; + + break; + } + + break; + } + + case MultiLineComment: { + switch (ch) { + case '*': + Text_.Data += ch; + State_ = MultiLineCommentEnd; + + break; + + case '\n': + Text_.Data += ch; + savedState = Code; + + break; + default: + Text_.Data += ch; + + break; + } + + break; + } + + case MultiLineCommentEnd: { + switch (ch) { + case '/': + Text_.Data += ch; + Action(); + State_ = savedState; + + break; + + default: + State_ = MultiLineComment; + + goto retry; + } + + break; + } + + case String: { + switch (ch) { + case '"': + Text_.Data += ch; + + if (SkipNext_) { + SkipNext_ = false; + } else { + if (savedState == Code) { + Action(); + } + State_ = savedState; + } + + break; + + case '\\': + Text_.Data += ch; + SkipNext_ = !SkipNext_; + + break; + + default: + Text_.Data += ch; + SkipNext_ = false; + + break; + } + + break; + } + + case Character: { + switch (ch) { + case '\'': + Text_.Data += ch; + + if (SkipNext_) { + SkipNext_ = false; + } else { + if (savedState == Code) { + Action(); + } + State_ = savedState; + } + + break; + + case '\\': + Text_.Data += ch; + SkipNext_ = !SkipNext_; + + break; + + default: + Text_.Data += ch; + SkipNext_ = false; + + break; + } + + break; + } + + case Preprocessor: { + savedState = Preprocessor; + switch (ch) { + case '/': + State_ = CommentBegin; + + break; + + case '\'': + Text_.Data += ch; + State_ = Character; + + break; + + case '"': + Text_.Data += ch; + State_ = String; + + break; + case '\n': + Text_.Data += ch; + + if (SkipNext_) { + SkipNext_ = false; + } else { + Action(); + savedState = Code; + State_ = Code; + } + + break; + + case '\\': + Text_.Data += ch; + SkipNext_ = true; + + break; + + default: + Text_.Data += ch; + SkipNext_ = false; + + break; + } + + break; + } + + default: + ThrowError(); + } + +#if DEBUG_ME + Cerr << "state after: " << (unsigned int)State_ << Endl; +#endif + + ++data; + --len; + } + } + + inline void Action(char ch) { + Action(); + Text_.Data += ch; + } + + inline void Action(const char* st) { + Action(); + Text_.Data += st; + } + + inline void Action() { + switch (State_) { + case Code: + Worker_->DoCode(Text_); + + break; + + case OneLineComment: + Worker_->DoOneLineComment(Text_); + + break; + + case MultiLineCommentEnd: + Worker_->DoMultiLineComment(Text_); + + break; + + case Preprocessor: + Worker_->DoPreprocessor(Text_); + + break; + + case String: + Worker_->DoString(Text_); + + break; + + case Character: + Worker_->DoCharacter(Text_); + + break; + + default: + ThrowError(); + } + + Text_.Reset(); + } + + inline void ThrowError() const { + ythrow yexception() << "can not parse source(line = " << (unsigned)Line_ + 1 << ", column = " << (unsigned)Column_ + 1 << ")"; + } + +private: + EState State_; + TWorker* Worker_; + TText Text_; + bool SkipNext_; + ui64 Line_; + ui64 Column_; +}; + +TCppSaxParser::TCppSaxParser(TWorker* worker) + : Impl_(new TImpl(worker)) +{ +} + +TCppSaxParser::~TCppSaxParser() = default; + +void TCppSaxParser::DoWrite(const void* data, size_t len) { + Impl_->Write(data, len); +} + +void TCppSaxParser::DoFinish() { + Impl_->Finish(); +} + +TCppSimpleSax::TCppSimpleSax() noexcept { +} + +TCppSimpleSax::~TCppSimpleSax() = default; + +void TCppSimpleSax::DoCode(const TText& text) { + static const char char_types[] = { + 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 0, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}; + + static const char CWHITESPACE = 0; + static const char CIDENTIFIER = 1; + static const char CSYNTAX = 2; + + enum EState { + WhiteSpace = CWHITESPACE, + Identifier = CIDENTIFIER, + Syntax = CSYNTAX + }; + + EState state = Identifier; + TText cur(text.Offset); + + for (const auto& it : text.Data) { + const unsigned char ch = *(const unsigned char*)(&it); + const char type = char_types[ch]; + + switch (state) { + case Identifier: { + switch (type) { + case CIDENTIFIER: + cur.Data += ch; + + break; + + default: + if (!cur.Data.empty()) { + DoIdentifier(cur); + } + + cur.Reset(); + cur.Data += ch; + state = (EState)type; + + break; + } + + break; + } + + case WhiteSpace: { + switch (type) { + case CWHITESPACE: + cur.Data += ch; + + break; + + default: + DoWhiteSpace(cur); + cur.Reset(); + cur.Data += ch; + state = (EState)type; + + break; + } + + break; + } + + case Syntax: { + switch (type) { + case CSYNTAX: + cur.Data += ch; + + break; + + default: + DoSyntax(cur); + cur.Reset(); + cur.Data += ch; + state = (EState)type; + + break; + } + + break; + } + } + } + + if (!cur.Data.empty()) { + switch (state) { + case Identifier: + DoIdentifier(cur); + + break; + + case WhiteSpace: + DoWhiteSpace(cur); + + break; + + case Syntax: + DoSyntax(cur); + + break; + } + } +} + +class TCppFullSax::TImpl { + typedef THashSet<TString> TKeyWords; + + class TRegExp { + public: + inline TRegExp(const char*) { + } + + inline bool Match(const TString& /*s*/) const noexcept { + return false; + } + }; + +public: + inline TImpl() + : OctNumber_("^[+-]?0[0-7]+$") + , HexNumber_("^[+-]?0x[0-9A-Fa-f]+$") + , DecNumber_("^[+-]?[0-9]+$") + , FltNumber_("^[+-]?[0-9]*\\.[0-9]*$") + { + AddKeyword("extern"); + AddKeyword("static"); + AddKeyword("inline"); + AddKeyword("volatile"); + AddKeyword("asm"); + AddKeyword("const"); + AddKeyword("mutable"); + AddKeyword("char"); + AddKeyword("signed"); + AddKeyword("unsigned"); + AddKeyword("int"); + AddKeyword("short"); + AddKeyword("long"); + AddKeyword("double"); + AddKeyword("float"); + AddKeyword("bool"); + AddKeyword("class"); + AddKeyword("struct"); + AddKeyword("union"); + AddKeyword("void"); + AddKeyword("auto"); + AddKeyword("throw"); + AddKeyword("try"); + AddKeyword("catch"); + AddKeyword("for"); + AddKeyword("do"); + AddKeyword("if"); + AddKeyword("else"); + AddKeyword("while"); + AddKeyword("switch"); + AddKeyword("case"); + AddKeyword("default"); + AddKeyword("goto"); + AddKeyword("break"); + AddKeyword("continue"); + AddKeyword("virtual"); + AddKeyword("template"); + AddKeyword("typename"); + AddKeyword("enum"); + AddKeyword("public"); + AddKeyword("private"); + AddKeyword("protected"); + AddKeyword("using"); + AddKeyword("namespace"); + AddKeyword("typedef"); + AddKeyword("true"); + AddKeyword("false"); + AddKeyword("return"); + AddKeyword("new"); + AddKeyword("delete"); + AddKeyword("operator"); + AddKeyword("friend"); + AddKeyword("this"); + } + + inline ~TImpl() = default; + + inline void AddKeyword(const TString& keyword) { + KeyWords_.insert(keyword); + } + + inline bool IsKeyword(const TString& s) { + return KeyWords_.find(s) != KeyWords_.end(); + } + + inline bool IsOctNumber(const TString& s) { + return OctNumber_.Match(s); + } + + inline bool IsHexNumber(const TString& s) { + return HexNumber_.Match(s); + } + + inline bool IsDecNumber(const TString& s) { + return DecNumber_.Match(s); + } + + inline bool IsFloatNumber(const TString& s) { + return FltNumber_.Match(s); + } + +private: + const TRegExp OctNumber_; + const TRegExp HexNumber_; + const TRegExp DecNumber_; + const TRegExp FltNumber_; + TKeyWords KeyWords_; +}; + +TCppFullSax::TCppFullSax() + : Impl_(new TImpl()) +{ +} + +TCppFullSax::~TCppFullSax() = default; + +void TCppFullSax::AddKeyword(const TString& keyword) { + Impl_->AddKeyword(keyword); +} + +void TCppFullSax::DoIdentifier(const TText& text) { + if (Impl_->IsKeyword(text.Data)) { + DoKeyword(text); + } else if (Impl_->IsOctNumber(text.Data)) { + DoOctNumber(text); + } else if (Impl_->IsHexNumber(text.Data)) { + DoHexNumber(text); + } else if (Impl_->IsDecNumber(text.Data)) { + DoDecNumber(text); + } else if (Impl_->IsFloatNumber(text.Data)) { + DoFloatNumber(text); + } else { + DoName(text); + } +} + +void TCppFullSax::DoEnd() { +} + +void TCppFullSax::DoStart() { +} + +void TCppFullSax::DoString(const TText&) { +} + +void TCppFullSax::DoCharacter(const TText&) { +} + +void TCppFullSax::DoWhiteSpace(const TText&) { +} + +void TCppFullSax::DoKeyword(const TText&) { +} + +void TCppFullSax::DoName(const TText&) { +} + +void TCppFullSax::DoOctNumber(const TText&) { +} + +void TCppFullSax::DoHexNumber(const TText&) { +} + +void TCppFullSax::DoDecNumber(const TText&) { +} + +void TCppFullSax::DoFloatNumber(const TText&) { +} + +void TCppFullSax::DoSyntax(const TText&) { +} + +void TCppFullSax::DoOneLineComment(const TText&) { +} + +void TCppFullSax::DoMultiLineComment(const TText&) { +} + +void TCppFullSax::DoPreprocessor(const TText&) { +} diff --git a/library/cpp/cppparser/parser.h b/library/cpp/cppparser/parser.h new file mode 100644 index 0000000000..f3e4bcbadd --- /dev/null +++ b/library/cpp/cppparser/parser.h @@ -0,0 +1,99 @@ +#pragma once + +#include <util/generic/ptr.h> +#include <util/generic/string.h> +#include <util/stream/output.h> + +class TCppSaxParser: public IOutputStream { +public: + struct TText { + TText(); + TText(ui64 offset); + TText(const TString& data, ui64 offset); + ~TText(); + + void Reset() noexcept; + + TString Data; + ui64 Offset; + }; + + class TWorker { + public: + typedef TCppSaxParser::TText TText; + + TWorker() noexcept; + virtual ~TWorker(); + + virtual void DoEnd() = 0; + virtual void DoStart() = 0; + virtual void DoString(const TText& text) = 0; + virtual void DoCharacter(const TText& text) = 0; + virtual void DoCode(const TText& text) = 0; + virtual void DoOneLineComment(const TText& text) = 0; + virtual void DoMultiLineComment(const TText& text) = 0; + virtual void DoPreprocessor(const TText& text) = 0; + }; + + TCppSaxParser(TWorker* worker); + ~TCppSaxParser() override; + +private: + void DoWrite(const void* data, size_t len) override; + void DoFinish() override; + +private: + class TImpl; + THolder<TImpl> Impl_; +}; + +class TCppSimpleSax: public TCppSaxParser::TWorker { +public: + TCppSimpleSax() noexcept; + ~TCppSimpleSax() override; + + void DoEnd() override = 0; + void DoStart() override = 0; + void DoString(const TText& text) override = 0; + void DoCharacter(const TText& text) override = 0; + virtual void DoWhiteSpace(const TText& text) = 0; + virtual void DoIdentifier(const TText& text) = 0; + virtual void DoSyntax(const TText& text) = 0; + void DoOneLineComment(const TText& text) override = 0; + void DoMultiLineComment(const TText& text) override = 0; + void DoPreprocessor(const TText& text) override = 0; + +private: + void DoCode(const TText& text) override; +}; + +class TCppFullSax: public TCppSimpleSax { +public: + TCppFullSax(); + ~TCppFullSax() override; + + void DoEnd() override; + void DoStart() override; + void DoString(const TText& text) override; + void DoCharacter(const TText& text) override; + void DoWhiteSpace(const TText& text) override; + virtual void DoKeyword(const TText& text); + virtual void DoName(const TText& text); + virtual void DoOctNumber(const TText& text); + virtual void DoHexNumber(const TText& text); + virtual void DoDecNumber(const TText& text); + virtual void DoFloatNumber(const TText& text); + void DoSyntax(const TText& text) override; + void DoOneLineComment(const TText& text) override; + void DoMultiLineComment(const TText& text) override; + void DoPreprocessor(const TText& text) override; + + void AddKeyword(const TString& keyword); + +private: + void DoIdentifier(const TText& text) override; + +private: + class TImpl; + THolder<TImpl> Impl_; +}; diff --git a/library/cpp/cppparser/ya.make b/library/cpp/cppparser/ya.make new file mode 100644 index 0000000000..bbb0bc11cd --- /dev/null +++ b/library/cpp/cppparser/ya.make @@ -0,0 +1,9 @@ +LIBRARY() + +OWNER(pg) + +SRCS( + parser.cpp +) + +END() |