diff options
author | alzobnin <alzobnin@yandex-team.com> | 2022-12-15 18:44:25 +0300 |
---|---|---|
committer | alzobnin <alzobnin@yandex-team.com> | 2022-12-15 18:44:25 +0300 |
commit | 6b780718b1af069992f4f7311c1cb753c8a68d05 (patch) | |
tree | b6904be74d2722cf575508e05be1bb81184785e3 /library/cpp/codecs/greedy_dict | |
parent | feb341993178f4dc73afc5930dcb1442ec306bfd (diff) | |
download | ydb-6b780718b1af069992f4f7311c1cb753c8a68d05.tar.gz |
Restrict max length of learned prefixes and fix solar codec
Diffstat (limited to 'library/cpp/codecs/greedy_dict')
-rw-r--r-- | library/cpp/codecs/greedy_dict/gd_builder.cpp | 8 | ||||
-rw-r--r-- | library/cpp/codecs/greedy_dict/gd_builder.h | 4 |
2 files changed, 6 insertions, 6 deletions
diff --git a/library/cpp/codecs/greedy_dict/gd_builder.cpp b/library/cpp/codecs/greedy_dict/gd_builder.cpp
index 844e07d5a0..33e104926e 100644
--- a/library/cpp/codecs/greedy_dict/gd_builder.cpp
+++ b/library/cpp/codecs/greedy_dict/gd_builder.cpp
@@ -53,7 +53,7 @@ namespace NGreedyDict {
         Current->SetModelP();
     }
 
-    ui32 TDictBuilder::BuildNextGeneration(ui32 maxent) {
+    ui32 TDictBuilder::BuildNextGeneration(ui32 maxent, ui32 maxlen) {
         TAutoPtr<TEntrySet> newset = new TEntrySet;
         newset->InitWithAlpha();
         maxent -= newset->size();
@@ -86,7 +86,7 @@ namespace NGreedyDict {
                 const TEntry& next = set.Get(Next(it->first));
                 float modelp = ModelP(prev.Count, next.Count, total);
                 ui32 cnt = it->second;
-                if (cnt > mincnt && StatTest(test, modelp, cnt, total) > minpval)
+                if (cnt > mincnt && StatTest(test, modelp, cnt, total) > minpval && prev.Len() + next.Len() <= maxlen)
                     Candidates.push_back(TCandidate(-Score(score, prev.Len() + next.Len(), modelp, cnt, total), it->first));
             }
         }
@@ -113,7 +113,7 @@ namespace NGreedyDict {
         return deletions + additions;
     }
 
-    ui32 TDictBuilder::Build(ui32 maxentries, ui32 maxiters, ui32 mindiff) {
+    ui32 TDictBuilder::Build(ui32 maxentries, ui32 maxiters, ui32 maxlen, ui32 mindiff) {
         /* size_t totalsz = 0;
          for (auto it : Input)
          totalsz += it.size();*/
@@ -128,7 +128,7 @@ namespace NGreedyDict {
                 Clog << Sprintf("%-110s RSS=%" PRIu32 "M", mess.data(), (ui32)(TRusage::Get().MaxRss >> 20)) << Endl;
             }
 
-            ui32 diff = BuildNextGeneration(maxentries);
+            ui32 diff = BuildNextGeneration(maxentries, maxlen);
 
             if (Current->size() == maxentries && diff < mindiff)
                 break;
diff --git a/library/cpp/codecs/greedy_dict/gd_builder.h b/library/cpp/codecs/greedy_dict/gd_builder.h
index b8e9a5e37b..8b20007425 100644
--- a/library/cpp/codecs/greedy_dict/gd_builder.h
+++ b/library/cpp/codecs/greedy_dict/gd_builder.h
@@ -69,11 +69,11 @@ namespace NGreedyDict {
             return std::move(Current);
         }
 
-        ui32 /*iters*/ Build(ui32 maxentries, ui32 maxiters = 16, ui32 mindiff = 10);
+        ui32 /*iters*/ Build(ui32 maxentries, ui32 maxiters = 16, ui32 maxlen = -1, ui32 mindiff = 10);
 
     public:
         void RebuildCounts(ui32 maxcand, bool final);
-        ui32 /*diff size*/ BuildNextGeneration(ui32 maxent);
+        ui32 /*diff size*/ BuildNextGeneration(ui32 maxent, ui32 maxlen);
 
         static bool IsCompound(ui64 ent) {
             return ent & 0xFFFFFFFF00000000ULL; |