diff options
| author | Devtools Arcadia <[email protected]> | 2022-02-07 18:08:42 +0300 | 
|---|---|---|
| committer | Devtools Arcadia <[email protected]> | 2022-02-07 18:08:42 +0300 | 
| commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
| tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/cppparser | |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/cppparser')
| -rw-r--r-- | library/cpp/cppparser/README.md | 3 | ||||
| -rw-r--r-- | library/cpp/cppparser/parser.cpp | 739 | ||||
| -rw-r--r-- | library/cpp/cppparser/parser.h | 99 | ||||
| -rw-r--r-- | library/cpp/cppparser/ya.make | 9 | 
4 files changed, 850 insertions, 0 deletions
| diff --git a/library/cpp/cppparser/README.md b/library/cpp/cppparser/README.md new file mode 100644 index 00000000000..a498ef2a0f0 --- /dev/null +++ b/library/cpp/cppparser/README.md @@ -0,0 +1,3 @@ +A simple parser of C++ codes (only lexical analysis, no semantic checking) + +It is similar to a sax-parser by its interface. diff --git a/library/cpp/cppparser/parser.cpp b/library/cpp/cppparser/parser.cpp new file mode 100644 index 00000000000..3bd968b4594 --- /dev/null +++ b/library/cpp/cppparser/parser.cpp @@ -0,0 +1,739 @@ +#include <util/generic/hash.h> +#include <util/string/cast.h> +#include <util/generic/hash_set.h> +#include <util/generic/yexception.h> + +#include "parser.h" + +//#define DEBUG_ME 1 + +TCppSaxParser::TText::TText() +    : Offset(0) +{ +} + +TCppSaxParser::TText::TText(ui64 offset) +    : Offset(offset) +{ +} + +TCppSaxParser::TText::TText(const TString& data, ui64 offset) +    : Data(data) +    , Offset(offset) +{ +} + +TCppSaxParser::TText::~TText() = default; + +void TCppSaxParser::TText::Reset() noexcept { +    Offset += Data.length(); +    Data.clear(); +} + +TCppSaxParser::TWorker::TWorker() noexcept = default; + +TCppSaxParser::TWorker::~TWorker() = default; + +class TCppSaxParser::TImpl { +    enum EState { +        Code, +        CommentBegin, +        String, +        Character, +        OneLineComment, +        MultiLineComment, +        MultiLineCommentEnd, +        Preprocessor +    }; + +public: +    typedef TCppSaxParser::TText TText; +    typedef TCppSaxParser::TWorker TWorker; + +    inline TImpl(TWorker* worker) +        : State_(Code) +        , Worker_(worker) +        , SkipNext_(false) +        , Line_(0) +        , Column_(0) +    { +        Worker_->DoStart(); +    } + +    inline ~TImpl() = default; + +    inline void Write(const void* data, size_t len) { +        ProcessInput((const char*)data, len); +    } + +    inline void Finish() { +        if (!Text_.Data.empty()) { +            switch (State_) { +                case Code: +                    Worker_->DoCode(Text_); + +                    break; + +                case Preprocessor: +                    Worker_->DoPreprocessor(Text_); + +                    break; + +                case OneLineComment: +                    Worker_->DoOneLineComment(Text_); + +                    break; + +                default: +                    ThrowError(); +            } +        } + +        Worker_->DoEnd(); +    } + +private: +    inline void ProcessInput(const char* data, size_t len) { +        EState savedState = Code; +        while (len) { +            const char ch = *data; + +            if (ch == '\n') { +                ++Line_; +                Column_ = 0; +            } else { +                ++Column_; +            } + +#if DEBUG_ME +            Cerr << "char: " << ch << Endl; +            Cerr << "state before: " << (unsigned int)State_ << Endl; +#endif + +        retry: +            switch (State_) { +                case Code: { +                    savedState = Code; +                    switch (ch) { +                        case '/': +                            State_ = CommentBegin; + +                            break; + +                        case '"': +                            Action(ch); +                            State_ = String; + +                            break; + +                        case '\'': +                            Action(ch); +                            State_ = Character; + +                            break; + +                        case '#': +                            Action(ch); +                            State_ = Preprocessor; + +                            break; + +                        default: +                            Text_.Data += ch; + +                            break; +                    } + +                    break; +                } + +                case CommentBegin: { +                    switch (ch) { +                        case '/': +                            State_ = savedState; +                            savedState = Code; +                            Action("//"); +                            State_ = OneLineComment; + +                            break; + +                        case '*': +                            State_ = savedState; +                            Action("/*"); +                            State_ = MultiLineComment; + +                            break; + +                        default: +                            Text_.Data += '/'; +                            State_ = savedState; + +                            goto retry; +                    } + +                    break; +                } + +                case OneLineComment: { +                    switch (ch) { +                        case '\n': +                            Action(ch); +                            State_ = Code; + +                            break; + +                        default: +                            Text_.Data += ch; + +                            break; +                    } + +                    break; +                } + +                case MultiLineComment: { +                    switch (ch) { +                        case '*': +                            Text_.Data += ch; +                            State_ = MultiLineCommentEnd; + +                            break; + +                        case '\n': +                            Text_.Data += ch; +                            savedState = Code; + +                            break; +                        default: +                            Text_.Data += ch; + +                            break; +                    } + +                    break; +                } + +                case MultiLineCommentEnd: { +                    switch (ch) { +                        case '/': +                            Text_.Data += ch; +                            Action(); +                            State_ = savedState; + +                            break; + +                        default: +                            State_ = MultiLineComment; + +                            goto retry; +                    } + +                    break; +                } + +                case String: { +                    switch (ch) { +                        case '"': +                            Text_.Data += ch; + +                            if (SkipNext_) { +                                SkipNext_ = false; +                            } else { +                                if (savedState == Code) { +                                    Action(); +                                } +                                State_ = savedState; +                            } + +                            break; + +                        case '\\': +                            Text_.Data += ch; +                            SkipNext_ = !SkipNext_; + +                            break; + +                        default: +                            Text_.Data += ch; +                            SkipNext_ = false; + +                            break; +                    } + +                    break; +                } + +                case Character: { +                    switch (ch) { +                        case '\'': +                            Text_.Data += ch; + +                            if (SkipNext_) { +                                SkipNext_ = false; +                            } else { +                                if (savedState == Code) { +                                    Action(); +                                } +                                State_ = savedState; +                            } + +                            break; + +                        case '\\': +                            Text_.Data += ch; +                            SkipNext_ = !SkipNext_; + +                            break; + +                        default: +                            Text_.Data += ch; +                            SkipNext_ = false; + +                            break; +                    } + +                    break; +                } + +                case Preprocessor: { +                    savedState = Preprocessor; +                    switch (ch) { +                        case '/': +                            State_ = CommentBegin; + +                            break; + +                        case '\'': +                            Text_.Data += ch; +                            State_ = Character; + +                            break; + +                        case '"': +                            Text_.Data += ch; +                            State_ = String; + +                            break; +                        case '\n': +                            Text_.Data += ch; + +                            if (SkipNext_) { +                                SkipNext_ = false; +                            } else { +                                Action(); +                                savedState = Code; +                                State_ = Code; +                            } + +                            break; + +                        case '\\': +                            Text_.Data += ch; +                            SkipNext_ = true; + +                            break; + +                        default: +                            Text_.Data += ch; +                            SkipNext_ = false; + +                            break; +                    } + +                    break; +                } + +                default: +                    ThrowError(); +            } + +#if DEBUG_ME +            Cerr << "state after: " << (unsigned int)State_ << Endl; +#endif + +            ++data; +            --len; +        } +    } + +    inline void Action(char ch) { +        Action(); +        Text_.Data += ch; +    } + +    inline void Action(const char* st) { +        Action(); +        Text_.Data += st; +    } + +    inline void Action() { +        switch (State_) { +            case Code: +                Worker_->DoCode(Text_); + +                break; + +            case OneLineComment: +                Worker_->DoOneLineComment(Text_); + +                break; + +            case MultiLineCommentEnd: +                Worker_->DoMultiLineComment(Text_); + +                break; + +            case Preprocessor: +                Worker_->DoPreprocessor(Text_); + +                break; + +            case String: +                Worker_->DoString(Text_); + +                break; + +            case Character: +                Worker_->DoCharacter(Text_); + +                break; + +            default: +                ThrowError(); +        } + +        Text_.Reset(); +    } + +    inline void ThrowError() const { +        ythrow yexception() << "can not parse source(line = " << (unsigned)Line_ + 1 << ", column = " << (unsigned)Column_ + 1 << ")"; +    } + +private: +    EState State_; +    TWorker* Worker_; +    TText Text_; +    bool SkipNext_; +    ui64 Line_; +    ui64 Column_; +}; + +TCppSaxParser::TCppSaxParser(TWorker* worker) +    : Impl_(new TImpl(worker)) +{ +} + +TCppSaxParser::~TCppSaxParser() = default; + +void TCppSaxParser::DoWrite(const void* data, size_t len) { +    Impl_->Write(data, len); +} + +void TCppSaxParser::DoFinish() { +    Impl_->Finish(); +} + +TCppSimpleSax::TCppSimpleSax() noexcept { +} + +TCppSimpleSax::~TCppSimpleSax() = default; + +void TCppSimpleSax::DoCode(const TText& text) { +    static const char char_types[] = { +        2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 0, 2, 2, +        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +        0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, +        2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, +        2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, +        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}; + +    static const char CWHITESPACE = 0; +    static const char CIDENTIFIER = 1; +    static const char CSYNTAX = 2; + +    enum EState { +        WhiteSpace = CWHITESPACE, +        Identifier = CIDENTIFIER, +        Syntax = CSYNTAX +    }; + +    EState state = Identifier; +    TText cur(text.Offset); + +    for (const auto& it : text.Data) { +        const unsigned char ch = *(const unsigned char*)(&it); +        const char type = char_types[ch]; + +        switch (state) { +            case Identifier: { +                switch (type) { +                    case CIDENTIFIER: +                        cur.Data += ch; + +                        break; + +                    default: +                        if (!cur.Data.empty()) { +                            DoIdentifier(cur); +                        } + +                        cur.Reset(); +                        cur.Data += ch; +                        state = (EState)type; + +                        break; +                } + +                break; +            } + +            case WhiteSpace: { +                switch (type) { +                    case CWHITESPACE: +                        cur.Data += ch; + +                        break; + +                    default: +                        DoWhiteSpace(cur); +                        cur.Reset(); +                        cur.Data += ch; +                        state = (EState)type; + +                        break; +                } + +                break; +            } + +            case Syntax: { +                switch (type) { +                    case CSYNTAX: +                        cur.Data += ch; + +                        break; + +                    default: +                        DoSyntax(cur); +                        cur.Reset(); +                        cur.Data += ch; +                        state = (EState)type; + +                        break; +                } + +                break; +            } +        } +    } + +    if (!cur.Data.empty()) { +        switch (state) { +            case Identifier: +                DoIdentifier(cur); + +                break; + +            case WhiteSpace: +                DoWhiteSpace(cur); + +                break; + +            case Syntax: +                DoSyntax(cur); + +                break; +        } +    } +} + +class TCppFullSax::TImpl { +    typedef THashSet<TString> TKeyWords; + +    class TRegExp { +    public: +        inline TRegExp(const char*) { +        } + +        inline bool Match(const TString& /*s*/) const noexcept { +            return false; +        } +    }; + +public: +    inline TImpl() +        : OctNumber_("^[+-]?0[0-7]+$") +        , HexNumber_("^[+-]?0x[0-9A-Fa-f]+$") +        , DecNumber_("^[+-]?[0-9]+$") +        , FltNumber_("^[+-]?[0-9]*\\.[0-9]*$") +    { +        AddKeyword("extern"); +        AddKeyword("static"); +        AddKeyword("inline"); +        AddKeyword("volatile"); +        AddKeyword("asm"); +        AddKeyword("const"); +        AddKeyword("mutable"); +        AddKeyword("char"); +        AddKeyword("signed"); +        AddKeyword("unsigned"); +        AddKeyword("int"); +        AddKeyword("short"); +        AddKeyword("long"); +        AddKeyword("double"); +        AddKeyword("float"); +        AddKeyword("bool"); +        AddKeyword("class"); +        AddKeyword("struct"); +        AddKeyword("union"); +        AddKeyword("void"); +        AddKeyword("auto"); +        AddKeyword("throw"); +        AddKeyword("try"); +        AddKeyword("catch"); +        AddKeyword("for"); +        AddKeyword("do"); +        AddKeyword("if"); +        AddKeyword("else"); +        AddKeyword("while"); +        AddKeyword("switch"); +        AddKeyword("case"); +        AddKeyword("default"); +        AddKeyword("goto"); +        AddKeyword("break"); +        AddKeyword("continue"); +        AddKeyword("virtual"); +        AddKeyword("template"); +        AddKeyword("typename"); +        AddKeyword("enum"); +        AddKeyword("public"); +        AddKeyword("private"); +        AddKeyword("protected"); +        AddKeyword("using"); +        AddKeyword("namespace"); +        AddKeyword("typedef"); +        AddKeyword("true"); +        AddKeyword("false"); +        AddKeyword("return"); +        AddKeyword("new"); +        AddKeyword("delete"); +        AddKeyword("operator"); +        AddKeyword("friend"); +        AddKeyword("this"); +    } + +    inline ~TImpl() = default; + +    inline void AddKeyword(const TString& keyword) { +        KeyWords_.insert(keyword); +    } + +    inline bool IsKeyword(const TString& s) { +        return KeyWords_.find(s) != KeyWords_.end(); +    } + +    inline bool IsOctNumber(const TString& s) { +        return OctNumber_.Match(s); +    } + +    inline bool IsHexNumber(const TString& s) { +        return HexNumber_.Match(s); +    } + +    inline bool IsDecNumber(const TString& s) { +        return DecNumber_.Match(s); +    } + +    inline bool IsFloatNumber(const TString& s) { +        return FltNumber_.Match(s); +    } + +private: +    const TRegExp OctNumber_; +    const TRegExp HexNumber_; +    const TRegExp DecNumber_; +    const TRegExp FltNumber_; +    TKeyWords KeyWords_; +}; + +TCppFullSax::TCppFullSax() +    : Impl_(new TImpl()) +{ +} + +TCppFullSax::~TCppFullSax() = default; + +void TCppFullSax::AddKeyword(const TString& keyword) { +    Impl_->AddKeyword(keyword); +} + +void TCppFullSax::DoIdentifier(const TText& text) { +    if (Impl_->IsKeyword(text.Data)) { +        DoKeyword(text); +    } else if (Impl_->IsOctNumber(text.Data)) { +        DoOctNumber(text); +    } else if (Impl_->IsHexNumber(text.Data)) { +        DoHexNumber(text); +    } else if (Impl_->IsDecNumber(text.Data)) { +        DoDecNumber(text); +    } else if (Impl_->IsFloatNumber(text.Data)) { +        DoFloatNumber(text); +    } else { +        DoName(text); +    } +} + +void TCppFullSax::DoEnd() { +} + +void TCppFullSax::DoStart() { +} + +void TCppFullSax::DoString(const TText&) { +} + +void TCppFullSax::DoCharacter(const TText&) { +} + +void TCppFullSax::DoWhiteSpace(const TText&) { +} + +void TCppFullSax::DoKeyword(const TText&) { +} + +void TCppFullSax::DoName(const TText&) { +} + +void TCppFullSax::DoOctNumber(const TText&) { +} + +void TCppFullSax::DoHexNumber(const TText&) { +} + +void TCppFullSax::DoDecNumber(const TText&) { +} + +void TCppFullSax::DoFloatNumber(const TText&) { +} + +void TCppFullSax::DoSyntax(const TText&) { +} + +void TCppFullSax::DoOneLineComment(const TText&) { +} + +void TCppFullSax::DoMultiLineComment(const TText&) { +} + +void TCppFullSax::DoPreprocessor(const TText&) { +} diff --git a/library/cpp/cppparser/parser.h b/library/cpp/cppparser/parser.h new file mode 100644 index 00000000000..f3e4bcbadd1 --- /dev/null +++ b/library/cpp/cppparser/parser.h @@ -0,0 +1,99 @@ +#pragma once + +#include <util/generic/ptr.h> +#include <util/generic/string.h> +#include <util/stream/output.h> + +class TCppSaxParser: public IOutputStream { +public: +    struct TText { +        TText(); +        TText(ui64 offset); +        TText(const TString& data, ui64 offset); +        ~TText(); + +        void Reset() noexcept; + +        TString Data; +        ui64 Offset; +    }; + +    class TWorker { +    public: +        typedef TCppSaxParser::TText TText; + +        TWorker() noexcept; +        virtual ~TWorker(); + +        virtual void DoEnd() = 0; +        virtual void DoStart() = 0; +        virtual void DoString(const TText& text) = 0; +        virtual void DoCharacter(const TText& text) = 0; +        virtual void DoCode(const TText& text) = 0; +        virtual void DoOneLineComment(const TText& text) = 0; +        virtual void DoMultiLineComment(const TText& text) = 0; +        virtual void DoPreprocessor(const TText& text) = 0; +    }; + +    TCppSaxParser(TWorker* worker); +    ~TCppSaxParser() override; + +private: +    void DoWrite(const void* data, size_t len) override; +    void DoFinish() override; + +private: +    class TImpl; +    THolder<TImpl> Impl_; +}; + +class TCppSimpleSax: public TCppSaxParser::TWorker { +public: +    TCppSimpleSax() noexcept; +    ~TCppSimpleSax() override; + +    void DoEnd() override = 0; +    void DoStart() override = 0; +    void DoString(const TText& text) override = 0; +    void DoCharacter(const TText& text) override = 0; +    virtual void DoWhiteSpace(const TText& text) = 0; +    virtual void DoIdentifier(const TText& text) = 0; +    virtual void DoSyntax(const TText& text) = 0; +    void DoOneLineComment(const TText& text) override = 0; +    void DoMultiLineComment(const TText& text) override = 0; +    void DoPreprocessor(const TText& text) override = 0; + +private: +    void DoCode(const TText& text) override; +}; + +class TCppFullSax: public TCppSimpleSax { +public: +    TCppFullSax(); +    ~TCppFullSax() override; + +    void DoEnd() override; +    void DoStart() override; +    void DoString(const TText& text) override; +    void DoCharacter(const TText& text) override; +    void DoWhiteSpace(const TText& text) override; +    virtual void DoKeyword(const TText& text); +    virtual void DoName(const TText& text); +    virtual void DoOctNumber(const TText& text); +    virtual void DoHexNumber(const TText& text); +    virtual void DoDecNumber(const TText& text); +    virtual void DoFloatNumber(const TText& text); +    void DoSyntax(const TText& text) override; +    void DoOneLineComment(const TText& text) override; +    void DoMultiLineComment(const TText& text) override; +    void DoPreprocessor(const TText& text) override; + +    void AddKeyword(const TString& keyword); + +private: +    void DoIdentifier(const TText& text) override; + +private: +    class TImpl; +    THolder<TImpl> Impl_; +}; diff --git a/library/cpp/cppparser/ya.make b/library/cpp/cppparser/ya.make new file mode 100644 index 00000000000..bbb0bc11cd6 --- /dev/null +++ b/library/cpp/cppparser/ya.make @@ -0,0 +1,9 @@ +LIBRARY() + +OWNER(pg) + +SRCS( +    parser.cpp +) + +END() | 
