aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/codecs/greedy_dict/gd_builder.cpp
diff options
context:
space:
mode:
authorRuslan Kovalev <ruslan.a.kovalev@gmail.com>2022-02-10 16:46:45 +0300
committerDaniil Cherednik <dcherednik@yandex-team.ru>2022-02-10 16:46:45 +0300
commit9123176b341b6f2658cff5132482b8237c1416c8 (patch)
tree49e222ea1c5804306084bb3ae065bb702625360f /library/cpp/codecs/greedy_dict/gd_builder.cpp
parent59e19371de37995fcb36beb16cd6ec030af960bc (diff)
downloadydb-9123176b341b6f2658cff5132482b8237c1416c8.tar.gz
Restoring authorship annotation for Ruslan Kovalev <ruslan.a.kovalev@gmail.com>. Commit 2 of 2.
Diffstat (limited to 'library/cpp/codecs/greedy_dict/gd_builder.cpp')
-rw-r--r--library/cpp/codecs/greedy_dict/gd_builder.cpp94
1 files changed, 47 insertions, 47 deletions
diff --git a/library/cpp/codecs/greedy_dict/gd_builder.cpp b/library/cpp/codecs/greedy_dict/gd_builder.cpp
index 2fb46029bf..561bfbca01 100644
--- a/library/cpp/codecs/greedy_dict/gd_builder.cpp
+++ b/library/cpp/codecs/greedy_dict/gd_builder.cpp
@@ -1,85 +1,85 @@
-#include "gd_builder.h"
-
+#include "gd_builder.h"
+
#include <library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h>
-#include <util/generic/algorithm.h>
-
-#include <util/random/shuffle.h>
+#include <util/generic/algorithm.h>
+
+#include <util/random/shuffle.h>
#include <util/stream/output.h>
-#include <util/string/printf.h>
-#include <util/system/rusage.h>
-
-namespace NGreedyDict {
+#include <util/string/printf.h>
+#include <util/system/rusage.h>
+
+namespace NGreedyDict {
void TDictBuilder::RebuildCounts(ui32 maxcand, bool final) {
if (!Current) {
Current = MakeHolder<TEntrySet>();
Current->InitWithAlpha();
}
-
+
TEntrySet& set = *Current;
-
+
for (auto& it : set)
it.Count = 0;
-
+
CompoundCounts = nullptr;
CompoundCountsPool.Clear();
-
+
if (!final) {
CompoundCounts = MakeHolder<TCompoundCounts>(&CompoundCountsPool);
CompoundCounts->reserve(maxcand);
}
-
+
Shuffle(Input.begin(), Input.end(), Rng);
-
+
for (auto str : Input) {
if (!final && CompoundCounts->size() > maxcand)
break;
-
+
i32 prev = -1;
-
+
while (!!str) {
TEntry* e = set.FindPrefix(str);
ui32 num = e->Number;
-
+
e->Count += 1;
if (!final && prev >= 0) {
(*CompoundCounts)[Compose(prev, num)] += 1;
}
-
+
prev = num;
++set.TotalCount;
- }
+ }
}
-
+
Current->SetModelP();
- }
-
+ }
+
ui32 TDictBuilder::BuildNextGeneration(ui32 maxent) {
TAutoPtr<TEntrySet> newset = new TEntrySet;
newset->InitWithAlpha();
maxent -= newset->size();
-
+
ui32 additions = 0;
ui32 deletions = 0;
-
+
{
const TEntrySet& set = *Current;
-
+
Candidates.clear();
const ui32 total = set.TotalCount;
const float minpval = Settings.MinPValue;
const EEntryStatTest test = Settings.StatTest;
const EEntryScore score = Settings.Score;
const ui32 mincnt = Settings.MinAbsCount;
-
+
for (const auto& it : set) {
const TEntry& e = it;
float modelp = e.ModelP;
ui32 cnt = e.Count;
-
+
if (e.HasPrefix() && e.Count > mincnt && StatTest(test, modelp, cnt, total) > minpval)
Candidates.push_back(TCandidate(-Score(score, e.Len(), modelp, cnt, total), it.Number));
}
-
+
if (!!CompoundCounts) {
for (TCompoundCounts::const_iterator it = CompoundCounts->begin(); it != CompoundCounts->end(); ++it) {
const TEntry& prev = set.Get(Prev(it->first));
@@ -89,13 +89,13 @@ namespace NGreedyDict {
if (cnt > mincnt && StatTest(test, modelp, cnt, total) > minpval)
Candidates.push_back(TCandidate(-Score(score, prev.Len() + next.Len(), modelp, cnt, total), it->first));
}
- }
-
+ }
+
Sort(Candidates.begin(), Candidates.end());
-
+
if (Candidates.size() > maxent)
Candidates.resize(maxent);
-
+
for (const auto& candidate : Candidates) {
if (IsCompound(candidate.second)) {
additions++;
@@ -103,40 +103,40 @@ namespace NGreedyDict {
} else {
newset->Add(set.Get(candidate.second).Str);
}
- }
+ }
deletions = set.size() - (newset->size() - additions);
- }
-
+ }
+
Current = newset;
Current->BuildHierarchy();
return deletions + additions;
- }
-
+ }
+
ui32 TDictBuilder::Build(ui32 maxentries, ui32 maxiters, ui32 mindiff) {
size_t totalsz = 0;
for (auto it : Input)
totalsz += it.size();
-
+
while (maxiters) {
maxiters--;
-
+
RebuildCounts(maxentries * Settings.GrowLimit, false);
-
+
if (Settings.Verbose) {
TString mess = Sprintf("iter:%" PRIu32 " sz:%" PRIu32 " pend:%" PRIu32, maxiters, (ui32)Current->size(), (ui32)CompoundCounts->size());
Clog << Sprintf("%-110s RSS=%" PRIu32 "M", mess.data(), (ui32)(TRusage::Get().MaxRss >> 20)) << Endl;
}
-
+
ui32 diff = BuildNextGeneration(maxentries);
if (Current->size() == maxentries && diff < mindiff)
break;
- }
-
+ }
+
RebuildCounts(0, true);
Current->SetScores(Settings.Score);
return maxiters;
- }
-
-}
+ }
+
+}