diff options
author | Anton Samokhvalov <pg83@yandex.ru> | 2022-02-10 16:45:15 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:45:15 +0300 |
commit | 72cb13b4aff9bc9cf22e49251bc8fd143f82538f (patch) | |
tree | da2c34829458c7d4e74bdfbdf85dff449e9e7fb8 /library/cpp/codecs/greedy_dict/gd_builder.cpp | |
parent | 778e51ba091dc39e7b7fcab2b9cf4dbedfb6f2b5 (diff) | |
download | ydb-72cb13b4aff9bc9cf22e49251bc8fd143f82538f.tar.gz |
Restoring authorship annotation for Anton Samokhvalov <pg83@yandex.ru>. Commit 1 of 2.
Diffstat (limited to 'library/cpp/codecs/greedy_dict/gd_builder.cpp')
-rw-r--r-- | library/cpp/codecs/greedy_dict/gd_builder.cpp | 190 |
1 files changed, 95 insertions, 95 deletions
diff --git a/library/cpp/codecs/greedy_dict/gd_builder.cpp b/library/cpp/codecs/greedy_dict/gd_builder.cpp index 561bfbca01..802c721753 100644 --- a/library/cpp/codecs/greedy_dict/gd_builder.cpp +++ b/library/cpp/codecs/greedy_dict/gd_builder.cpp @@ -9,134 +9,134 @@ #include <util/system/rusage.h> namespace NGreedyDict { - void TDictBuilder::RebuildCounts(ui32 maxcand, bool final) { - if (!Current) { + void TDictBuilder::RebuildCounts(ui32 maxcand, bool final) { + if (!Current) { Current = MakeHolder<TEntrySet>(); - Current->InitWithAlpha(); - } + Current->InitWithAlpha(); + } - TEntrySet& set = *Current; + TEntrySet& set = *Current; - for (auto& it : set) - it.Count = 0; + for (auto& it : set) + it.Count = 0; - CompoundCounts = nullptr; - CompoundCountsPool.Clear(); + CompoundCounts = nullptr; + CompoundCountsPool.Clear(); - if (!final) { + if (!final) { CompoundCounts = MakeHolder<TCompoundCounts>(&CompoundCountsPool); - CompoundCounts->reserve(maxcand); - } + CompoundCounts->reserve(maxcand); + } - Shuffle(Input.begin(), Input.end(), Rng); + Shuffle(Input.begin(), Input.end(), Rng); - for (auto str : Input) { - if (!final && CompoundCounts->size() > maxcand) - break; + for (auto str : Input) { + if (!final && CompoundCounts->size() > maxcand) + break; - i32 prev = -1; + i32 prev = -1; - while (!!str) { - TEntry* e = set.FindPrefix(str); - ui32 num = e->Number; + while (!!str) { + TEntry* e = set.FindPrefix(str); + ui32 num = e->Number; - e->Count += 1; - if (!final && prev >= 0) { - (*CompoundCounts)[Compose(prev, num)] += 1; - } + e->Count += 1; + if (!final && prev >= 0) { + (*CompoundCounts)[Compose(prev, num)] += 1; + } - prev = num; - ++set.TotalCount; + prev = num; + ++set.TotalCount; } - } + } - Current->SetModelP(); + Current->SetModelP(); } - ui32 TDictBuilder::BuildNextGeneration(ui32 maxent) { - TAutoPtr<TEntrySet> newset = new TEntrySet; - newset->InitWithAlpha(); - maxent -= newset->size(); - - ui32 additions = 0; - ui32 deletions = 0; - - { - const TEntrySet& set = *Current; - - Candidates.clear(); - const ui32 total = set.TotalCount; - const float minpval = Settings.MinPValue; - const EEntryStatTest test = Settings.StatTest; - const EEntryScore score = Settings.Score; - const ui32 mincnt = Settings.MinAbsCount; - - for (const auto& it : set) { - const TEntry& e = it; - float modelp = e.ModelP; - ui32 cnt = e.Count; - - if (e.HasPrefix() && e.Count > mincnt && StatTest(test, modelp, cnt, total) > minpval) - Candidates.push_back(TCandidate(-Score(score, e.Len(), modelp, cnt, total), it.Number)); - } - - if (!!CompoundCounts) { - for (TCompoundCounts::const_iterator it = CompoundCounts->begin(); it != CompoundCounts->end(); ++it) { - const TEntry& prev = set.Get(Prev(it->first)); - const TEntry& next = set.Get(Next(it->first)); - float modelp = ModelP(prev.Count, next.Count, total); - ui32 cnt = it->second; - if (cnt > mincnt && StatTest(test, modelp, cnt, total) > minpval) - Candidates.push_back(TCandidate(-Score(score, prev.Len() + next.Len(), modelp, cnt, total), it->first)); - } + ui32 TDictBuilder::BuildNextGeneration(ui32 maxent) { + TAutoPtr<TEntrySet> newset = new TEntrySet; + newset->InitWithAlpha(); + maxent -= newset->size(); + + ui32 additions = 0; + ui32 deletions = 0; + + { + const TEntrySet& set = *Current; + + Candidates.clear(); + const ui32 total = set.TotalCount; + const float minpval = Settings.MinPValue; + const EEntryStatTest test = Settings.StatTest; + const EEntryScore score = Settings.Score; + const ui32 mincnt = Settings.MinAbsCount; + + for (const auto& it : set) { + const TEntry& e = it; + float modelp = e.ModelP; + ui32 cnt = e.Count; + + if (e.HasPrefix() && e.Count > mincnt && StatTest(test, modelp, cnt, total) > minpval) + Candidates.push_back(TCandidate(-Score(score, e.Len(), modelp, cnt, total), it.Number)); + } + + if (!!CompoundCounts) { + for (TCompoundCounts::const_iterator it = CompoundCounts->begin(); it != CompoundCounts->end(); ++it) { + const TEntry& prev = set.Get(Prev(it->first)); + const TEntry& next = set.Get(Next(it->first)); + float modelp = ModelP(prev.Count, next.Count, total); + ui32 cnt = it->second; + if (cnt > mincnt && StatTest(test, modelp, cnt, total) > minpval) + Candidates.push_back(TCandidate(-Score(score, prev.Len() + next.Len(), modelp, cnt, total), it->first)); + } } - Sort(Candidates.begin(), Candidates.end()); + Sort(Candidates.begin(), Candidates.end()); - if (Candidates.size() > maxent) - Candidates.resize(maxent); + if (Candidates.size() > maxent) + Candidates.resize(maxent); - for (const auto& candidate : Candidates) { - if (IsCompound(candidate.second)) { - additions++; - newset->Add(set.Get(Prev(candidate.second)).Str, set.Get(Next(candidate.second)).Str); - } else { - newset->Add(set.Get(candidate.second).Str); - } + for (const auto& candidate : Candidates) { + if (IsCompound(candidate.second)) { + additions++; + newset->Add(set.Get(Prev(candidate.second)).Str, set.Get(Next(candidate.second)).Str); + } else { + newset->Add(set.Get(candidate.second).Str); + } } - - deletions = set.size() - (newset->size() - additions); + + deletions = set.size() - (newset->size() - additions); } - Current = newset; - Current->BuildHierarchy(); - return deletions + additions; + Current = newset; + Current->BuildHierarchy(); + return deletions + additions; } - ui32 TDictBuilder::Build(ui32 maxentries, ui32 maxiters, ui32 mindiff) { - size_t totalsz = 0; - for (auto it : Input) + ui32 TDictBuilder::Build(ui32 maxentries, ui32 maxiters, ui32 mindiff) { + size_t totalsz = 0; + for (auto it : Input) totalsz += it.size(); - while (maxiters) { - maxiters--; + while (maxiters) { + maxiters--; - RebuildCounts(maxentries * Settings.GrowLimit, false); + RebuildCounts(maxentries * Settings.GrowLimit, false); - if (Settings.Verbose) { - TString mess = Sprintf("iter:%" PRIu32 " sz:%" PRIu32 " pend:%" PRIu32, maxiters, (ui32)Current->size(), (ui32)CompoundCounts->size()); + if (Settings.Verbose) { + TString mess = Sprintf("iter:%" PRIu32 " sz:%" PRIu32 " pend:%" PRIu32, maxiters, (ui32)Current->size(), (ui32)CompoundCounts->size()); Clog << Sprintf("%-110s RSS=%" PRIu32 "M", mess.data(), (ui32)(TRusage::Get().MaxRss >> 20)) << Endl; - } - - ui32 diff = BuildNextGeneration(maxentries); + } - if (Current->size() == maxentries && diff < mindiff) - break; + ui32 diff = BuildNextGeneration(maxentries); + + if (Current->size() == maxentries && diff < mindiff) + break; } - RebuildCounts(0, true); - Current->SetScores(Settings.Score); - return maxiters; + RebuildCounts(0, true); + Current->SetScores(Settings.Score); + return maxiters; } } |