diff options
| author | Devtools Arcadia <[email protected]> | 2022-02-07 18:08:42 +0300 | 
|---|---|---|
| committer | Devtools Arcadia <[email protected]> | 2022-02-07 18:08:42 +0300 | 
| commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
| tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/containers/comptrie/comptrie_impl.h | |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/containers/comptrie/comptrie_impl.h')
| -rw-r--r-- | library/cpp/containers/comptrie/comptrie_impl.h | 221 | 
1 files changed, 221 insertions, 0 deletions
diff --git a/library/cpp/containers/comptrie/comptrie_impl.h b/library/cpp/containers/comptrie/comptrie_impl.h new file mode 100644 index 00000000000..f41c38311a4 --- /dev/null +++ b/library/cpp/containers/comptrie/comptrie_impl.h @@ -0,0 +1,221 @@ +#pragma once + +#include <util/stream/output.h> + +#ifndef COMPTRIE_DATA_CHECK +#define COMPTRIE_DATA_CHECK 1 +#endif + +// NCompactTrie + +namespace NCompactTrie { +    const char MT_FINAL = '\x80'; +    const char MT_NEXT = '\x40'; +    const char MT_SIZEMASK = '\x07'; +    const size_t MT_LEFTSHIFT = 3; +    const size_t MT_RIGHTSHIFT = 0; + +    Y_FORCE_INLINE size_t UnpackOffset(const char* p, size_t len); +    size_t MeasureOffset(size_t offset); +    size_t PackOffset(char* buffer, size_t offset); +    static inline ui64 ArcSaveOffset(size_t offset, IOutputStream& os); +    Y_FORCE_INLINE char LeapByte(const char*& datapos, const char* dataend, char label); + +    template <class T> +    inline static size_t ExtraBits() { +        return (sizeof(T) - 1) * 8; +    } + +    static inline bool IsEpsilonLink(const char flags) { +        return !(flags & (MT_FINAL | MT_NEXT)); +    } + +    static inline void TraverseEpsilon(const char*& datapos) { +        const char flags = *datapos; +        if (!IsEpsilonLink(flags)) { +            return; +        } +        const size_t offsetlength = flags & MT_SIZEMASK; +        const size_t offset = UnpackOffset(datapos + 1, offsetlength); +        Y_ASSERT(offset); +        datapos += offset; +    } + +    static inline size_t LeftOffsetLen(const char flags) { +        return (flags >> MT_LEFTSHIFT) & MT_SIZEMASK; +    } + +    static inline size_t RightOffsetLen(const char flags) { +        return flags & MT_SIZEMASK; +    } + +    void ShowProgress(size_t n); // just print dots +} + +namespace NCompTriePrivate { +    template <typename TChar> +    struct TStringForChar { +    }; + +    template <> +    struct TStringForChar<char> { +        typedef TString TResult; +    }; + +    template <> +    struct TStringForChar<wchar16> { +        typedef TUtf16String TResult; +    }; + +    template <> +    struct TStringForChar<wchar32> { +        typedef TUtf32String TResult; +    }; + +} + +namespace NCompTriePrivate { +    struct TCmp { +        template <class T> +        inline bool operator()(const T& l, const T& r) { +            return (unsigned char)(l.Label[0]) < (unsigned char)(r.Label[0]); +        } + +        template <class T> +        inline bool operator()(const T& l, char r) { +            return (unsigned char)(l.Label[0]) < (unsigned char)r; +        } +    }; +} + +namespace NCompactTrie { +    static inline ui64 ArcSaveOffset(size_t offset, IOutputStream& os) { +        using namespace NCompactTrie; + +        if (!offset) +            return 0; + +        char buf[16]; +        size_t len = PackOffset(buf, offset); +        os.Write(buf, len); +        return len; +    } + +    // Unpack the offset to the next node. The encoding scheme can store offsets +    // up to 7 bytes; whether they fit into size_t is another issue. +    Y_FORCE_INLINE size_t UnpackOffset(const char* p, size_t len) { +        size_t result = 0; + +        while (len--) +            result = ((result << 8) | (*(p++) & 0xFF)); + +        return result; +    } + +    // Auxiliary function: consumes one character from the input. Advances the data pointer +    // to the position immediately preceding the value for the link just traversed (if any); +    // returns flags associated with the link. If no arc with the required label is present, +    // zeroes the data pointer. +    Y_FORCE_INLINE char LeapByte(const char*& datapos, const char* dataend, char label) { +        while (datapos < dataend) { +            size_t offsetlength, offset; +            const char* startpos = datapos; +            char flags = *(datapos++); + +            if (IsEpsilonLink(flags)) { +                // Epsilon link - jump to the specified offset without further checks. +                // These links are created during minimization: original uncompressed +                // tree does not need them. (If we find a way to package 3 offset lengths +                // into 1 byte, we could get rid of them; but it looks like they do no harm. +                Y_ASSERT(datapos < dataend); +                offsetlength = flags & MT_SIZEMASK; +                offset = UnpackOffset(datapos, offsetlength); +                if (!offset) +                    break; +                datapos = startpos + offset; + +                continue; +            } + +            char ch = *(datapos++); + +            // Left branch +            offsetlength = LeftOffsetLen(flags); +            if ((unsigned char)label < (unsigned char)ch) { +                offset = UnpackOffset(datapos, offsetlength); +                if (!offset) +                    break; + +                datapos = startpos + offset; + +                continue; +            } + +            datapos += offsetlength; + +            // Right branch +            offsetlength = RightOffsetLen(flags); +            if ((unsigned char)label > (unsigned char)ch) { +                offset = UnpackOffset(datapos, offsetlength); + +                if (!offset) +                    break; + +                datapos = startpos + offset; + +                continue; +            } + +            // Got a match; return position right before the contents for the label +            datapos += offsetlength; +            return flags; +        } + +        // if we got here, we're past the dataend - bail out ASAP +        datapos = nullptr; +        return 0; +    } + +    // Auxiliary function: consumes one (multibyte) symbol from the input. +    // Advances the data pointer to the root of the subtrie beginning after the symbol, +    // zeroes it if this subtrie is empty. +    // If there is a value associated with the symbol, makes the value pointer point to it, +    // otherwise sets it to nullptr. +    // Returns true if the symbol was succesfully found in the trie, false otherwise. +    template <typename TSymbol, class TPacker> +    Y_FORCE_INLINE bool Advance(const char*& datapos, const char* const dataend, const char*& value, +                                TSymbol label, TPacker packer) { +        Y_ASSERT(datapos < dataend); +        char flags = MT_NEXT; +        for (int i = (int)ExtraBits<TSymbol>(); i >= 0; i -= 8) { +            flags = LeapByte(datapos, dataend, (char)(label >> i)); +            if (!datapos) { +                return false; // no such arc +            } + +            value = nullptr; + +            Y_ASSERT(datapos <= dataend); +            if ((flags & MT_FINAL)) { +                value = datapos; +                datapos += packer.SkipLeaf(datapos); +            } + +            if (!(flags & MT_NEXT)) { +                if (i == 0) { +                    datapos = nullptr; +                    return true; +                } +                return false; // no further way +            } + +            TraverseEpsilon(datapos); +            if (i == 0) { // last byte, and got a match +                return true; +            } +        } + +        return false; +    } + +}  | 
