aboutsummaryrefslogblamecommitdiffstats
path: root/library/cpp/containers/comptrie/comptrie_impl.h
blob: f41c38311a4fd6a3e7e2ca6537a5e4a25f13bfb5 (plain) (tree)
1
2
3
4
5
6
7
8
            
 
                               
 


                             

                        
                                 
                                    
                                  
                                   
                                                                  
                                                   
                                                                       
                                                                                        




                                      


                                                        





                                                                      
                         

                          






                                                           

                                                   





                                 
                                


                                    
                                     
      




                                     
 
 


                                                        
                                                                             


                                                    
                                                                  


         
                        
                                                                        








                                             

                                                                                
                                                                   










                                                                                            
                                                                                         








                                                                                            
                                            










































                                                                                   
                          

                 


                                                                                           
                                    
                                                                                     
                                              
                                                                                                    
                                                                
                                    





                                                                   
                            
 
                                         





                                                    
                                      












                                                       
 
#pragma once

#include <util/stream/output.h>

#ifndef COMPTRIE_DATA_CHECK
#define COMPTRIE_DATA_CHECK 1
#endif

// NCompactTrie

namespace NCompactTrie {
    const char MT_FINAL = '\x80';
    const char MT_NEXT = '\x40';
    const char MT_SIZEMASK = '\x07';
    const size_t MT_LEFTSHIFT = 3;
    const size_t MT_RIGHTSHIFT = 0;

    Y_FORCE_INLINE size_t UnpackOffset(const char* p, size_t len);
    size_t MeasureOffset(size_t offset);
    size_t PackOffset(char* buffer, size_t offset);
    static inline ui64 ArcSaveOffset(size_t offset, IOutputStream& os);
    Y_FORCE_INLINE char LeapByte(const char*& datapos, const char* dataend, char label);

    template <class T>
    inline static size_t ExtraBits() {
        return (sizeof(T) - 1) * 8;
    }

    static inline bool IsEpsilonLink(const char flags) {
        return !(flags & (MT_FINAL | MT_NEXT));
    }

    static inline void TraverseEpsilon(const char*& datapos) {
        const char flags = *datapos;
        if (!IsEpsilonLink(flags)) {
            return;
        }
        const size_t offsetlength = flags & MT_SIZEMASK;
        const size_t offset = UnpackOffset(datapos + 1, offsetlength);
        Y_ASSERT(offset);
        datapos += offset;
    }

    static inline size_t LeftOffsetLen(const char flags) {
        return (flags >> MT_LEFTSHIFT) & MT_SIZEMASK;
    }

    static inline size_t RightOffsetLen(const char flags) {
        return flags & MT_SIZEMASK;
    }

    void ShowProgress(size_t n); // just print dots
}

namespace NCompTriePrivate {
    template <typename TChar>
    struct TStringForChar {
    };

    template <>
    struct TStringForChar<char> {
        typedef TString TResult;
    };

    template <>
    struct TStringForChar<wchar16> {
        typedef TUtf16String TResult;
    };

    template <>
    struct TStringForChar<wchar32> {
        typedef TUtf32String TResult;
    };

}

namespace NCompTriePrivate {
    struct TCmp {
        template <class T>
        inline bool operator()(const T& l, const T& r) {
            return (unsigned char)(l.Label[0]) < (unsigned char)(r.Label[0]);
        }

        template <class T>
        inline bool operator()(const T& l, char r) {
            return (unsigned char)(l.Label[0]) < (unsigned char)r;
        }
    };
}

namespace NCompactTrie {
    static inline ui64 ArcSaveOffset(size_t offset, IOutputStream& os) {
        using namespace NCompactTrie;

        if (!offset)
            return 0;

        char buf[16];
        size_t len = PackOffset(buf, offset);
        os.Write(buf, len);
        return len;
    }

    // Unpack the offset to the next node. The encoding scheme can store offsets
    // up to 7 bytes; whether they fit into size_t is another issue.
    Y_FORCE_INLINE size_t UnpackOffset(const char* p, size_t len) {
        size_t result = 0;

        while (len--)
            result = ((result << 8) | (*(p++) & 0xFF));

        return result;
    }

    // Auxiliary function: consumes one character from the input. Advances the data pointer
    // to the position immediately preceding the value for the link just traversed (if any);
    // returns flags associated with the link. If no arc with the required label is present,
    // zeroes the data pointer.
    Y_FORCE_INLINE char LeapByte(const char*& datapos, const char* dataend, char label) {
        while (datapos < dataend) {
            size_t offsetlength, offset;
            const char* startpos = datapos;
            char flags = *(datapos++);

            if (IsEpsilonLink(flags)) {
                // Epsilon link - jump to the specified offset without further checks.
                // These links are created during minimization: original uncompressed
                // tree does not need them. (If we find a way to package 3 offset lengths
                // into 1 byte, we could get rid of them; but it looks like they do no harm.
                Y_ASSERT(datapos < dataend);
                offsetlength = flags & MT_SIZEMASK;
                offset = UnpackOffset(datapos, offsetlength);
                if (!offset)
                    break;
                datapos = startpos + offset;

                continue;
            }

            char ch = *(datapos++);

            // Left branch
            offsetlength = LeftOffsetLen(flags);
            if ((unsigned char)label < (unsigned char)ch) {
                offset = UnpackOffset(datapos, offsetlength);
                if (!offset)
                    break;

                datapos = startpos + offset;

                continue;
            }

            datapos += offsetlength;

            // Right branch
            offsetlength = RightOffsetLen(flags);
            if ((unsigned char)label > (unsigned char)ch) {
                offset = UnpackOffset(datapos, offsetlength);

                if (!offset)
                    break;

                datapos = startpos + offset;

                continue;
            }

            // Got a match; return position right before the contents for the label
            datapos += offsetlength;
            return flags;
        }

        // if we got here, we're past the dataend - bail out ASAP
        datapos = nullptr;
        return 0;
    }

    // Auxiliary function: consumes one (multibyte) symbol from the input.
    // Advances the data pointer to the root of the subtrie beginning after the symbol,
    // zeroes it if this subtrie is empty.
    // If there is a value associated with the symbol, makes the value pointer point to it,
    // otherwise sets it to nullptr.
    // Returns true if the symbol was succesfully found in the trie, false otherwise.
    template <typename TSymbol, class TPacker>
    Y_FORCE_INLINE bool Advance(const char*& datapos, const char* const dataend, const char*& value,
                                TSymbol label, TPacker packer) {
        Y_ASSERT(datapos < dataend);
        char flags = MT_NEXT;
        for (int i = (int)ExtraBits<TSymbol>(); i >= 0; i -= 8) {
            flags = LeapByte(datapos, dataend, (char)(label >> i));
            if (!datapos) {
                return false; // no such arc
            }

            value = nullptr;

            Y_ASSERT(datapos <= dataend);
            if ((flags & MT_FINAL)) {
                value = datapos;
                datapos += packer.SkipLeaf(datapos);
            }

            if (!(flags & MT_NEXT)) {
                if (i == 0) {
                    datapos = nullptr;
                    return true;
                }
                return false; // no further way
            }

            TraverseEpsilon(datapos);
            if (i == 0) { // last byte, and got a match
                return true;
            }
        }

        return false;
    }

}