summaryrefslogtreecommitdiffstats
path: root/library/cpp/containers/comptrie/comptrie_impl.h
diff options
context:
space:
mode:
authorDevtools Arcadia <[email protected]>2022-02-07 18:08:42 +0300
committerDevtools Arcadia <[email protected]>2022-02-07 18:08:42 +0300
commit1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
treee26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/containers/comptrie/comptrie_impl.h
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/containers/comptrie/comptrie_impl.h')
-rw-r--r--library/cpp/containers/comptrie/comptrie_impl.h221
1 files changed, 221 insertions, 0 deletions
diff --git a/library/cpp/containers/comptrie/comptrie_impl.h b/library/cpp/containers/comptrie/comptrie_impl.h
new file mode 100644
index 00000000000..f41c38311a4
--- /dev/null
+++ b/library/cpp/containers/comptrie/comptrie_impl.h
@@ -0,0 +1,221 @@
+#pragma once
+
+#include <util/stream/output.h>
+
+#ifndef COMPTRIE_DATA_CHECK
+#define COMPTRIE_DATA_CHECK 1
+#endif
+
+// NCompactTrie
+
+namespace NCompactTrie {
+ const char MT_FINAL = '\x80';
+ const char MT_NEXT = '\x40';
+ const char MT_SIZEMASK = '\x07';
+ const size_t MT_LEFTSHIFT = 3;
+ const size_t MT_RIGHTSHIFT = 0;
+
+ Y_FORCE_INLINE size_t UnpackOffset(const char* p, size_t len);
+ size_t MeasureOffset(size_t offset);
+ size_t PackOffset(char* buffer, size_t offset);
+ static inline ui64 ArcSaveOffset(size_t offset, IOutputStream& os);
+ Y_FORCE_INLINE char LeapByte(const char*& datapos, const char* dataend, char label);
+
+ template <class T>
+ inline static size_t ExtraBits() {
+ return (sizeof(T) - 1) * 8;
+ }
+
+ static inline bool IsEpsilonLink(const char flags) {
+ return !(flags & (MT_FINAL | MT_NEXT));
+ }
+
+ static inline void TraverseEpsilon(const char*& datapos) {
+ const char flags = *datapos;
+ if (!IsEpsilonLink(flags)) {
+ return;
+ }
+ const size_t offsetlength = flags & MT_SIZEMASK;
+ const size_t offset = UnpackOffset(datapos + 1, offsetlength);
+ Y_ASSERT(offset);
+ datapos += offset;
+ }
+
+ static inline size_t LeftOffsetLen(const char flags) {
+ return (flags >> MT_LEFTSHIFT) & MT_SIZEMASK;
+ }
+
+ static inline size_t RightOffsetLen(const char flags) {
+ return flags & MT_SIZEMASK;
+ }
+
+ void ShowProgress(size_t n); // just print dots
+}
+
+namespace NCompTriePrivate {
+ template <typename TChar>
+ struct TStringForChar {
+ };
+
+ template <>
+ struct TStringForChar<char> {
+ typedef TString TResult;
+ };
+
+ template <>
+ struct TStringForChar<wchar16> {
+ typedef TUtf16String TResult;
+ };
+
+ template <>
+ struct TStringForChar<wchar32> {
+ typedef TUtf32String TResult;
+ };
+
+}
+
+namespace NCompTriePrivate {
+ struct TCmp {
+ template <class T>
+ inline bool operator()(const T& l, const T& r) {
+ return (unsigned char)(l.Label[0]) < (unsigned char)(r.Label[0]);
+ }
+
+ template <class T>
+ inline bool operator()(const T& l, char r) {
+ return (unsigned char)(l.Label[0]) < (unsigned char)r;
+ }
+ };
+}
+
+namespace NCompactTrie {
+ static inline ui64 ArcSaveOffset(size_t offset, IOutputStream& os) {
+ using namespace NCompactTrie;
+
+ if (!offset)
+ return 0;
+
+ char buf[16];
+ size_t len = PackOffset(buf, offset);
+ os.Write(buf, len);
+ return len;
+ }
+
+ // Unpack the offset to the next node. The encoding scheme can store offsets
+ // up to 7 bytes; whether they fit into size_t is another issue.
+ Y_FORCE_INLINE size_t UnpackOffset(const char* p, size_t len) {
+ size_t result = 0;
+
+ while (len--)
+ result = ((result << 8) | (*(p++) & 0xFF));
+
+ return result;
+ }
+
+ // Auxiliary function: consumes one character from the input. Advances the data pointer
+ // to the position immediately preceding the value for the link just traversed (if any);
+ // returns flags associated with the link. If no arc with the required label is present,
+ // zeroes the data pointer.
+ Y_FORCE_INLINE char LeapByte(const char*& datapos, const char* dataend, char label) {
+ while (datapos < dataend) {
+ size_t offsetlength, offset;
+ const char* startpos = datapos;
+ char flags = *(datapos++);
+
+ if (IsEpsilonLink(flags)) {
+ // Epsilon link - jump to the specified offset without further checks.
+ // These links are created during minimization: original uncompressed
+ // tree does not need them. (If we find a way to package 3 offset lengths
+ // into 1 byte, we could get rid of them; but it looks like they do no harm.
+ Y_ASSERT(datapos < dataend);
+ offsetlength = flags & MT_SIZEMASK;
+ offset = UnpackOffset(datapos, offsetlength);
+ if (!offset)
+ break;
+ datapos = startpos + offset;
+
+ continue;
+ }
+
+ char ch = *(datapos++);
+
+ // Left branch
+ offsetlength = LeftOffsetLen(flags);
+ if ((unsigned char)label < (unsigned char)ch) {
+ offset = UnpackOffset(datapos, offsetlength);
+ if (!offset)
+ break;
+
+ datapos = startpos + offset;
+
+ continue;
+ }
+
+ datapos += offsetlength;
+
+ // Right branch
+ offsetlength = RightOffsetLen(flags);
+ if ((unsigned char)label > (unsigned char)ch) {
+ offset = UnpackOffset(datapos, offsetlength);
+
+ if (!offset)
+ break;
+
+ datapos = startpos + offset;
+
+ continue;
+ }
+
+ // Got a match; return position right before the contents for the label
+ datapos += offsetlength;
+ return flags;
+ }
+
+ // if we got here, we're past the dataend - bail out ASAP
+ datapos = nullptr;
+ return 0;
+ }
+
+ // Auxiliary function: consumes one (multibyte) symbol from the input.
+ // Advances the data pointer to the root of the subtrie beginning after the symbol,
+ // zeroes it if this subtrie is empty.
+ // If there is a value associated with the symbol, makes the value pointer point to it,
+ // otherwise sets it to nullptr.
+ // Returns true if the symbol was succesfully found in the trie, false otherwise.
+ template <typename TSymbol, class TPacker>
+ Y_FORCE_INLINE bool Advance(const char*& datapos, const char* const dataend, const char*& value,
+ TSymbol label, TPacker packer) {
+ Y_ASSERT(datapos < dataend);
+ char flags = MT_NEXT;
+ for (int i = (int)ExtraBits<TSymbol>(); i >= 0; i -= 8) {
+ flags = LeapByte(datapos, dataend, (char)(label >> i));
+ if (!datapos) {
+ return false; // no such arc
+ }
+
+ value = nullptr;
+
+ Y_ASSERT(datapos <= dataend);
+ if ((flags & MT_FINAL)) {
+ value = datapos;
+ datapos += packer.SkipLeaf(datapos);
+ }
+
+ if (!(flags & MT_NEXT)) {
+ if (i == 0) {
+ datapos = nullptr;
+ return true;
+ }
+ return false; // no further way
+ }
+
+ TraverseEpsilon(datapos);
+ if (i == 0) { // last byte, and got a match
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+}