about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author alzobnin <alzobnin@yandex-team.com> 2022-12-15 18:44:25 +0300
committer alzobnin <alzobnin@yandex-team.com> 2022-12-15 18:44:25 +0300
commit 6b780718b1af069992f4f7311c1cb753c8a68d05 (patch)
tree b6904be74d2722cf575508e05be1bb81184785e3
parent feb341993178f4dc73afc5930dcb1442ec306bfd (diff)
download ydb-6b780718b1af069992f4f7311c1cb753c8a68d05.tar.gz
Restrict max length of learned prefixes and fix solar codec
-rw-r--r-- library/cpp/codecs/greedy_dict/gd_builder.cpp 8
-rw-r--r-- library/cpp/codecs/greedy_dict/gd_builder.h 4
-rw-r--r-- library/cpp/codecs/solar_codec.cpp 7
3 files changed, 11 insertions, 8 deletions
diff --git a/library/cpp/codecs/greedy_dict/gd_builder.cpp b/library/cpp/codecs/greedy_dict/gd_builder.cpp
index 844e07d5a0..33e104926e 100644
--- a/library/cpp/codecs/greedy_dict/gd_builder.cpp
+++ b/library/cpp/codecs/greedy_dict/gd_builder.cpp
@@ -53,7 +53,7 @@ namespace NGreedyDict {
Current->SetModelP();
}
- ui32 TDictBuilder::BuildNextGeneration(ui32 maxent) {
+ ui32 TDictBuilder::BuildNextGeneration(ui32 maxent, ui32 maxlen) {
TAutoPtr<TEntrySet> newset = new TEntrySet;
newset->InitWithAlpha();
maxent -= newset->size();
@@ -86,7 +86,7 @@ namespace NGreedyDict {
const TEntry& next = set.Get(Next(it->first));
float modelp = ModelP(prev.Count, next.Count, total);
ui32 cnt = it->second;
- if (cnt > mincnt && StatTest(test, modelp, cnt, total) > minpval)
+ if (cnt > mincnt && StatTest(test, modelp, cnt, total) > minpval && prev.Len() + next.Len() <= maxlen)
Candidates.push_back(TCandidate(-Score(score, prev.Len() + next.Len(), modelp, cnt, total), it->first));
}
}
@@ -113,7 +113,7 @@ namespace NGreedyDict {
return deletions + additions;
}
- ui32 TDictBuilder::Build(ui32 maxentries, ui32 maxiters, ui32 mindiff) {
+ ui32 TDictBuilder::Build(ui32 maxentries, ui32 maxiters, ui32 maxlen, ui32 mindiff) {
/* size_t totalsz = 0;
for (auto it : Input)
totalsz += it.size();*/
@@ -128,7 +128,7 @@ namespace NGreedyDict {
Clog << Sprintf("%-110s RSS=%" PRIu32 "M", mess.data(), (ui32)(TRusage::Get().MaxRss >> 20)) << Endl;
}
- ui32 diff = BuildNextGeneration(maxentries);
+ ui32 diff = BuildNextGeneration(maxentries, maxlen);
if (Current->size() == maxentries && diff < mindiff)
break;
diff --git a/library/cpp/codecs/greedy_dict/gd_builder.h b/library/cpp/codecs/greedy_dict/gd_builder.h
index b8e9a5e37b..8b20007425 100644
--- a/library/cpp/codecs/greedy_dict/gd_builder.h
+++ b/library/cpp/codecs/greedy_dict/gd_builder.h
@@ -69,11 +69,11 @@ namespace NGreedyDict {
return std::move(Current);
}
- ui32 /*iters*/ Build(ui32 maxentries, ui32 maxiters = 16, ui32 mindiff = 10);
+ ui32 /*iters*/ Build(ui32 maxentries, ui32 maxiters = 16, ui32 maxlen = -1, ui32 mindiff = 10);
public:
void RebuildCounts(ui32 maxcand, bool final);
- ui32 /*diff size*/ BuildNextGeneration(ui32 maxent);
+ ui32 /*diff size*/ BuildNextGeneration(ui32 maxent, ui32 maxlen);
static bool IsCompound(ui64 ent) {
return ent & 0xFFFFFFFF00000000ULL;
diff --git a/library/cpp/codecs/solar_codec.cpp b/library/cpp/codecs/solar_codec.cpp
index d0692fe2a4..916bbbd5d1 100644
--- a/library/cpp/codecs/solar_codec.cpp
+++ b/library/cpp/codecs/solar_codec.cpp
@@ -17,6 +17,8 @@ namespace NCodecs {
void TSolarCodec::DoLearn(ISequenceReader& r) {
using namespace NGreedyDict;
+ const ui32 maxlen = Max<ui32>() / Max<ui32>(MaxEntries, 1);
+
Decoder.clear();
Pool.Clear();
@@ -34,7 +36,7 @@ namespace NCodecs {
{
TDictBuilder b(Settings);
b.SetInput(bufs);
- b.Build(MaxEntries, MaxIterations);
+ b.Build(MaxEntries, MaxIterations, maxlen);
set = b.ReleaseEntrySet();
}
@@ -47,7 +49,8 @@ namespace NCodecs {
tmp.reserve(set->size());
for (const auto& it : *set) {
- tmp.push_back(std::make_pair(-it.Score, TStringBuf(it.Str).Trunc(Max<ui32>() / Max<ui32>(MaxEntries, 1))));
+ Y_ENSURE(it.Str.Size() <= maxlen);
+ tmp.push_back(std::make_pair(-it.Score, it.Str));
}
Sort(tmp.begin(), tmp.end());