about | summary | refs | log | tree | commit | diff | stats
path: root/library/cpp/codecs/greedy_dict/gd_builder.cpp
diff options
context:
space:
mode:
author: Anton Samokhvalov <pg83@yandex.ru> 2022-02-10 16:45:15 +0300
committer: Daniil Cherednik <dcherednik@yandex-team.ru> 2022-02-10 16:45:15 +0300
commit: 72cb13b4aff9bc9cf22e49251bc8fd143f82538f (patch)
tree: da2c34829458c7d4e74bdfbdf85dff449e9e7fb8 /library/cpp/codecs/greedy_dict/gd_builder.cpp
parent: 778e51ba091dc39e7b7fcab2b9cf4dbedfb6f2b5 (diff)
download: ydb-72cb13b4aff9bc9cf22e49251bc8fd143f82538f.tar.gz
Restoring authorship annotation for Anton Samokhvalov <pg83@yandex.ru>. Commit 1 of 2.
Diffstat (limited to 'library/cpp/codecs/greedy_dict/gd_builder.cpp')
-rw-r--r-- library/cpp/codecs/greedy_dict/gd_builder.cpp 190
1 files changed, 95 insertions, 95 deletions
diff --git a/library/cpp/codecs/greedy_dict/gd_builder.cpp b/library/cpp/codecs/greedy_dict/gd_builder.cpp
index 561bfbca01..802c721753 100644
--- a/library/cpp/codecs/greedy_dict/gd_builder.cpp
+++ b/library/cpp/codecs/greedy_dict/gd_builder.cpp
@@ -9,134 +9,134 @@
#include <util/system/rusage.h>
namespace NGreedyDict {
- void TDictBuilder::RebuildCounts(ui32 maxcand, bool final) {
- if (!Current) {
+ void TDictBuilder::RebuildCounts(ui32 maxcand, bool final) {
+ if (!Current) {
Current = MakeHolder<TEntrySet>();
- Current->InitWithAlpha();
- }
+ Current->InitWithAlpha();
+ }
- TEntrySet& set = *Current;
+ TEntrySet& set = *Current;
- for (auto& it : set)
- it.Count = 0;
+ for (auto& it : set)
+ it.Count = 0;
- CompoundCounts = nullptr;
- CompoundCountsPool.Clear();
+ CompoundCounts = nullptr;
+ CompoundCountsPool.Clear();
- if (!final) {
+ if (!final) {
CompoundCounts = MakeHolder<TCompoundCounts>(&CompoundCountsPool);
- CompoundCounts->reserve(maxcand);
- }
+ CompoundCounts->reserve(maxcand);
+ }
- Shuffle(Input.begin(), Input.end(), Rng);
+ Shuffle(Input.begin(), Input.end(), Rng);
- for (auto str : Input) {
- if (!final && CompoundCounts->size() > maxcand)
- break;
+ for (auto str : Input) {
+ if (!final && CompoundCounts->size() > maxcand)
+ break;
- i32 prev = -1;
+ i32 prev = -1;
- while (!!str) {
- TEntry* e = set.FindPrefix(str);
- ui32 num = e->Number;
+ while (!!str) {
+ TEntry* e = set.FindPrefix(str);
+ ui32 num = e->Number;
- e->Count += 1;
- if (!final && prev >= 0) {
- (*CompoundCounts)[Compose(prev, num)] += 1;
- }
+ e->Count += 1;
+ if (!final && prev >= 0) {
+ (*CompoundCounts)[Compose(prev, num)] += 1;
+ }
- prev = num;
- ++set.TotalCount;
+ prev = num;
+ ++set.TotalCount;
}
- }
+ }
- Current->SetModelP();
+ Current->SetModelP();
}
- ui32 TDictBuilder::BuildNextGeneration(ui32 maxent) {
- TAutoPtr<TEntrySet> newset = new TEntrySet;
- newset->InitWithAlpha();
- maxent -= newset->size();
-
- ui32 additions = 0;
- ui32 deletions = 0;
-
- {
- const TEntrySet& set = *Current;
-
- Candidates.clear();
- const ui32 total = set.TotalCount;
- const float minpval = Settings.MinPValue;
- const EEntryStatTest test = Settings.StatTest;
- const EEntryScore score = Settings.Score;
- const ui32 mincnt = Settings.MinAbsCount;
-
- for (const auto& it : set) {
- const TEntry& e = it;
- float modelp = e.ModelP;
- ui32 cnt = e.Count;
-
- if (e.HasPrefix() && e.Count > mincnt && StatTest(test, modelp, cnt, total) > minpval)
- Candidates.push_back(TCandidate(-Score(score, e.Len(), modelp, cnt, total), it.Number));
- }
-
- if (!!CompoundCounts) {
- for (TCompoundCounts::const_iterator it = CompoundCounts->begin(); it != CompoundCounts->end(); ++it) {
- const TEntry& prev = set.Get(Prev(it->first));
- const TEntry& next = set.Get(Next(it->first));
- float modelp = ModelP(prev.Count, next.Count, total);
- ui32 cnt = it->second;
- if (cnt > mincnt && StatTest(test, modelp, cnt, total) > minpval)
- Candidates.push_back(TCandidate(-Score(score, prev.Len() + next.Len(), modelp, cnt, total), it->first));
- }
+ ui32 TDictBuilder::BuildNextGeneration(ui32 maxent) {
+ TAutoPtr<TEntrySet> newset = new TEntrySet;
+ newset->InitWithAlpha();
+ maxent -= newset->size();
+
+ ui32 additions = 0;
+ ui32 deletions = 0;
+
+ {
+ const TEntrySet& set = *Current;
+
+ Candidates.clear();
+ const ui32 total = set.TotalCount;
+ const float minpval = Settings.MinPValue;
+ const EEntryStatTest test = Settings.StatTest;
+ const EEntryScore score = Settings.Score;
+ const ui32 mincnt = Settings.MinAbsCount;
+
+ for (const auto& it : set) {
+ const TEntry& e = it;
+ float modelp = e.ModelP;
+ ui32 cnt = e.Count;
+
+ if (e.HasPrefix() && e.Count > mincnt && StatTest(test, modelp, cnt, total) > minpval)
+ Candidates.push_back(TCandidate(-Score(score, e.Len(), modelp, cnt, total), it.Number));
+ }
+
+ if (!!CompoundCounts) {
+ for (TCompoundCounts::const_iterator it = CompoundCounts->begin(); it != CompoundCounts->end(); ++it) {
+ const TEntry& prev = set.Get(Prev(it->first));
+ const TEntry& next = set.Get(Next(it->first));
+ float modelp = ModelP(prev.Count, next.Count, total);
+ ui32 cnt = it->second;
+ if (cnt > mincnt && StatTest(test, modelp, cnt, total) > minpval)
+ Candidates.push_back(TCandidate(-Score(score, prev.Len() + next.Len(), modelp, cnt, total), it->first));
+ }
}
- Sort(Candidates.begin(), Candidates.end());
+ Sort(Candidates.begin(), Candidates.end());
- if (Candidates.size() > maxent)
- Candidates.resize(maxent);
+ if (Candidates.size() > maxent)
+ Candidates.resize(maxent);
- for (const auto& candidate : Candidates) {
- if (IsCompound(candidate.second)) {
- additions++;
- newset->Add(set.Get(Prev(candidate.second)).Str, set.Get(Next(candidate.second)).Str);
- } else {
- newset->Add(set.Get(candidate.second).Str);
- }
+ for (const auto& candidate : Candidates) {
+ if (IsCompound(candidate.second)) {
+ additions++;
+ newset->Add(set.Get(Prev(candidate.second)).Str, set.Get(Next(candidate.second)).Str);
+ } else {
+ newset->Add(set.Get(candidate.second).Str);
+ }
}
-
- deletions = set.size() - (newset->size() - additions);
+
+ deletions = set.size() - (newset->size() - additions);
}
- Current = newset;
- Current->BuildHierarchy();
- return deletions + additions;
+ Current = newset;
+ Current->BuildHierarchy();
+ return deletions + additions;
}
- ui32 TDictBuilder::Build(ui32 maxentries, ui32 maxiters, ui32 mindiff) {
- size_t totalsz = 0;
- for (auto it : Input)
+ ui32 TDictBuilder::Build(ui32 maxentries, ui32 maxiters, ui32 mindiff) {
+ size_t totalsz = 0;
+ for (auto it : Input)
totalsz += it.size();
- while (maxiters) {
- maxiters--;
+ while (maxiters) {
+ maxiters--;
- RebuildCounts(maxentries * Settings.GrowLimit, false);
+ RebuildCounts(maxentries * Settings.GrowLimit, false);
- if (Settings.Verbose) {
- TString mess = Sprintf("iter:%" PRIu32 " sz:%" PRIu32 " pend:%" PRIu32, maxiters, (ui32)Current->size(), (ui32)CompoundCounts->size());
+ if (Settings.Verbose) {
+ TString mess = Sprintf("iter:%" PRIu32 " sz:%" PRIu32 " pend:%" PRIu32, maxiters, (ui32)Current->size(), (ui32)CompoundCounts->size());
Clog << Sprintf("%-110s RSS=%" PRIu32 "M", mess.data(), (ui32)(TRusage::Get().MaxRss >> 20)) << Endl;
- }
-
- ui32 diff = BuildNextGeneration(maxentries);
+ }
- if (Current->size() == maxentries && diff < mindiff)
- break;
+ ui32 diff = BuildNextGeneration(maxentries);
+
+ if (Current->size() == maxentries && diff < mindiff)
+ break;
}
- RebuildCounts(0, true);
- Current->SetScores(Settings.Score);
- return maxiters;
+ RebuildCounts(0, true);
+ Current->SetScores(Settings.Score);
+ return maxiters;
}
}