aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/codecs/greedy_dict/gd_entry.cpp
diff options
context:
space:
mode:
author Ruslan Kovalev <ruslan.a.kovalev@gmail.com> 2022-02-10 16:46:45 +0300
committer Daniil Cherednik <dcherednik@yandex-team.ru> 2022-02-10 16:46:45 +0300
commit 9123176b341b6f2658cff5132482b8237c1416c8 (patch)
tree 49e222ea1c5804306084bb3ae065bb702625360f /library/cpp/codecs/greedy_dict/gd_entry.cpp
parent 59e19371de37995fcb36beb16cd6ec030af960bc (diff)
download ydb-9123176b341b6f2658cff5132482b8237c1416c8.tar.gz
Restoring authorship annotation for Ruslan Kovalev <ruslan.a.kovalev@gmail.com>. Commit 2 of 2.
Diffstat (limited to 'library/cpp/codecs/greedy_dict/gd_entry.cpp')
-rw-r--r-- library/cpp/codecs/greedy_dict/gd_entry.cpp 74
1 files changed, 37 insertions, 37 deletions
diff --git a/library/cpp/codecs/greedy_dict/gd_entry.cpp b/library/cpp/codecs/greedy_dict/gd_entry.cpp
index f23a754976..2c315c7f7c 100644
--- a/library/cpp/codecs/greedy_dict/gd_entry.cpp
+++ b/library/cpp/codecs/greedy_dict/gd_entry.cpp
@@ -1,98 +1,98 @@
-#include "gd_entry.h"
-#include "gd_stats.h"
-
-#include <util/generic/algorithm.h>
-#include <util/generic/singleton.h>
-
-namespace NGreedyDict {
+#include "gd_entry.h"
+#include "gd_stats.h"
+
+#include <util/generic/algorithm.h>
+#include <util/generic/singleton.h>
+
+namespace NGreedyDict {
// Provides one single-character string view for each of the 256 possible byte values.
// InitWithAlpha() obtains an instance through Singleton<TAlphas>().
class TAlphas {
// Backing storage: two bytes per value, the byte itself followed by a zero byte.
char Memory[512];
-
+
public:
// Views into Memory, appended in increasing byte-value order (index i holds byte i).
TStringBufs Alphas;
-
+
TAlphas() {
for (ui32 i = 0; i < 256; ++i) {
Memory[2 * i] = (char)i;
Memory[2 * i + 1] = 0;
-
+
// The view is created with length 1, so it covers only Memory[2 * i];
// the zero byte written above is not part of the view.
Alphas.push_back(TStringBuf(&Memory[2 * i], 1));
}
}
};
-
+
// Calls Pool.ClearKeepFirstChunk(), adds every single-character string from the
// TAlphas singleton through Add(), and then calls BuildHierarchy().
// NOTE(review): entries already in the set are not visibly removed here — confirm
// whether callers only invoke this on an empty set.
void TEntrySet::InitWithAlpha() {
Pool.ClearKeepFirstChunk();
const TStringBufs& a = Singleton<TAlphas>()->Alphas;
// `it` is a by-value copy of each TStringBuf element, not an iterator.
for (auto it : a) {
Add(it);
- }
+ }
BuildHierarchy();
- }
-
+ }
+
// Sorts the entries with TEntry::StrLess, assigns each entry its position in the
// sorted order as Number, records in NearestPrefix the value found for the longest
// already-added key that prefixes the entry's string, and finally serializes the
// builder and initializes Trie from the resulting buffer.
void TEntrySet::BuildHierarchy() {
Sort(begin(), end(), TEntry::StrLess);
-
+
// NOTE(review): CTBF_PREFIX_GROUPED presumably relies on keys being added in the
// sorted order established above — confirm against the trie builder documentation.
TCompactTrieBuilder<char, ui32, TAsIsPacker<ui32>> builder(CTBF_PREFIX_GROUPED);
-
+
for (iterator it = begin(); it != end(); ++it) {
// Number is the entry's index within the sorted sequence.
it->Number = (it - begin());
TStringBuf suff = it->Str;
size_t len = 0;
ui32 val = 0;
-
+
// The lookup runs before this entry is added, so only keys added on earlier
// iterations can match. NearestPrefix is left untouched when nothing is found
// or when the reported match length is zero.
if (builder.FindLongestPrefix(suff.data(), suff.size(), &len, &val) && len) {
it->NearestPrefix = val;
}
-
+
// The value stored for each key is the entry's Number.
builder.Add(suff.data(), suff.size(), it->Number);
- }
-
+ }
+
// Serialize the builder into an in-memory buffer and load Trie from it.
TBufferOutput bout;
builder.Save(bout);
Trie.Init(TBlob::FromBuffer(bout.Buffer()));
- }
-
+ }
+
// Looks up the longest prefix of `str` in Trie.
// `str` is an in/out parameter: on success the matched `len` characters are skipped
// from its front. On failure `str` is left unchanged.
// Returns a pointer to the entry Get(off), where `off` is the value the trie reports
// for the matched key, or nullptr when Trie.FindLongestPrefix reports no match.
TEntry* TEntrySet::FindPrefix(TStringBuf& str) {
size_t len = 0;
ui32 off = 0;
-
+
if (!Trie.FindLongestPrefix(str, &len, &off)) {
return nullptr;
}
-
+
str.Skip(len);
return &Get(off);
- }
-
+ }
+
// Computes ModelP for every entry.
// Entries for which HasPrefix() is false get ModelP = 0. For the others, the entry's
// string is split into its NearestPrefix entry followed by the pieces FindPrefix()
// returns for the remainder; ModelP is the product over those pieces of
// (piece.Count + e.Count) / TotalCount.
void TEntrySet::SetModelP() {
for (iterator it = begin(); it != end(); ++it) {
TEntry& e = *it;
-
+
if (!e.HasPrefix()) {
e.ModelP = 0;
continue;
}
-
+
// Drop the part of the string covered by the nearest-prefix entry.
TStringBuf suff = e.Str;
const TEntry& p = Get(e.NearestPrefix);
suff.Skip(p.Len());
-
+
// First factor comes from the nearest-prefix entry itself.
float modelp = float(p.Count + e.Count) / TotalCount;
-
+
// Consume the remainder piece by piece; FindPrefix() advances `suff` on success.
// NOTE(review): FindPrefix() can return nullptr, and `pp` is dereferenced without
// a check. This looks safe only if every remaining piece is guaranteed to have a
// match in Trie (e.g. all single bytes were added) — confirm that invariant.
while (!!suff) {
TEntry* pp = FindPrefix(suff);
modelp *= float(pp->Count + e.Count) / TotalCount;
}
-
+
e.ModelP = modelp;
- }
- }
-
+ }
+ }
+
// Sets Score on every entry to Score(s, Len(), ModelP, Count, TotalCount), where `s`
// is the EEntryScore value passed through to Score(). ModelP is read as currently
// stored on each entry; this function does not recompute it.
void TEntrySet::SetScores(EEntryScore s) {
for (auto& it : *this) {
it.Score = Score(s, it.Len(), it.ModelP, it.Count, TotalCount);
}
- }
-
-}
+ }
+
+}