aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/codecs/greedy_dict/gd_entry.cpp
diff options
context:
space:
mode:
authorDevtools Arcadia <arcadia-devtools@yandex-team.ru>2022-02-07 18:08:42 +0300
committerDevtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net>2022-02-07 18:08:42 +0300
commit1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
treee26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/codecs/greedy_dict/gd_entry.cpp
downloadydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/codecs/greedy_dict/gd_entry.cpp')
-rw-r--r--library/cpp/codecs/greedy_dict/gd_entry.cpp98
1 files changed, 98 insertions, 0 deletions
diff --git a/library/cpp/codecs/greedy_dict/gd_entry.cpp b/library/cpp/codecs/greedy_dict/gd_entry.cpp
new file mode 100644
index 0000000000..2c315c7f7c
--- /dev/null
+++ b/library/cpp/codecs/greedy_dict/gd_entry.cpp
@@ -0,0 +1,98 @@
+#include "gd_entry.h"
+#include "gd_stats.h"
+
+#include <util/generic/algorithm.h>
+#include <util/generic/singleton.h>
+
+namespace NGreedyDict {
+ class TAlphas {
+ char Memory[512];
+
+ public:
+ TStringBufs Alphas;
+
+ TAlphas() {
+ for (ui32 i = 0; i < 256; ++i) {
+ Memory[2 * i] = (char)i;
+ Memory[2 * i + 1] = 0;
+
+ Alphas.push_back(TStringBuf(&Memory[2 * i], 1));
+ }
+ }
+ };
+
+ void TEntrySet::InitWithAlpha() {
+ Pool.ClearKeepFirstChunk();
+ const TStringBufs& a = Singleton<TAlphas>()->Alphas;
+ for (auto it : a) {
+ Add(it);
+ }
+ BuildHierarchy();
+ }
+
+ void TEntrySet::BuildHierarchy() {
+ Sort(begin(), end(), TEntry::StrLess);
+
+ TCompactTrieBuilder<char, ui32, TAsIsPacker<ui32>> builder(CTBF_PREFIX_GROUPED);
+
+ for (iterator it = begin(); it != end(); ++it) {
+ it->Number = (it - begin());
+ TStringBuf suff = it->Str;
+ size_t len = 0;
+ ui32 val = 0;
+
+ if (builder.FindLongestPrefix(suff.data(), suff.size(), &len, &val) && len) {
+ it->NearestPrefix = val;
+ }
+
+ builder.Add(suff.data(), suff.size(), it->Number);
+ }
+
+ TBufferOutput bout;
+ builder.Save(bout);
+ Trie.Init(TBlob::FromBuffer(bout.Buffer()));
+ }
+
+ TEntry* TEntrySet::FindPrefix(TStringBuf& str) {
+ size_t len = 0;
+ ui32 off = 0;
+
+ if (!Trie.FindLongestPrefix(str, &len, &off)) {
+ return nullptr;
+ }
+
+ str.Skip(len);
+ return &Get(off);
+ }
+
+ void TEntrySet::SetModelP() {
+ for (iterator it = begin(); it != end(); ++it) {
+ TEntry& e = *it;
+
+ if (!e.HasPrefix()) {
+ e.ModelP = 0;
+ continue;
+ }
+
+ TStringBuf suff = e.Str;
+ const TEntry& p = Get(e.NearestPrefix);
+ suff.Skip(p.Len());
+
+ float modelp = float(p.Count + e.Count) / TotalCount;
+
+ while (!!suff) {
+ TEntry* pp = FindPrefix(suff);
+ modelp *= float(pp->Count + e.Count) / TotalCount;
+ }
+
+ e.ModelP = modelp;
+ }
+ }
+
+ void TEntrySet::SetScores(EEntryScore s) {
+ for (auto& it : *this) {
+ it.Score = Score(s, it.Len(), it.ModelP, it.Count, TotalCount);
+ }
+ }
+
+}