diff options
| author | alzobnin <[email protected]> | 2022-12-15 18:44:25 +0300 | 
|---|---|---|
| committer | alzobnin <[email protected]> | 2022-12-15 18:44:25 +0300 | 
| commit | 6b780718b1af069992f4f7311c1cb753c8a68d05 (patch) | |
| tree | b6904be74d2722cf575508e05be1bb81184785e3 /library/cpp | |
| parent | feb341993178f4dc73afc5930dcb1442ec306bfd (diff) | |
Restrict max length of learned prefixes and fix solar codec
Diffstat (limited to 'library/cpp')
| -rw-r--r-- | library/cpp/codecs/greedy_dict/gd_builder.cpp | 8 | ||||
| -rw-r--r-- | library/cpp/codecs/greedy_dict/gd_builder.h | 4 | ||||
| -rw-r--r-- | library/cpp/codecs/solar_codec.cpp | 7 | 
3 files changed, 11 insertions, 8 deletions
```diff
diff --git a/library/cpp/codecs/greedy_dict/gd_builder.cpp b/library/cpp/codecs/greedy_dict/gd_builder.cpp
index 844e07d5a04..33e104926ec 100644
--- a/library/cpp/codecs/greedy_dict/gd_builder.cpp
+++ b/library/cpp/codecs/greedy_dict/gd_builder.cpp
@@ -53,7 +53,7 @@ namespace NGreedyDict {
         Current->SetModelP();
     }
 
-    ui32 TDictBuilder::BuildNextGeneration(ui32 maxent) {
+    ui32 TDictBuilder::BuildNextGeneration(ui32 maxent, ui32 maxlen) {
         TAutoPtr<TEntrySet> newset = new TEntrySet;
         newset->InitWithAlpha();
         maxent -= newset->size();
@@ -86,7 +86,7 @@ namespace NGreedyDict {
                     const TEntry& next = set.Get(Next(it->first));
                     float modelp = ModelP(prev.Count, next.Count, total);
                     ui32 cnt = it->second;
-                    if (cnt > mincnt && StatTest(test, modelp, cnt, total) > minpval)
+                    if (cnt > mincnt && StatTest(test, modelp, cnt, total) > minpval && prev.Len() + next.Len() <= maxlen)
                         Candidates.push_back(TCandidate(-Score(score, prev.Len() + next.Len(), modelp, cnt, total), it->first));
                 }
             }
@@ -113,7 +113,7 @@ namespace NGreedyDict {
         return deletions + additions;
     }
 
-    ui32 TDictBuilder::Build(ui32 maxentries, ui32 maxiters, ui32 mindiff) {
+    ui32 TDictBuilder::Build(ui32 maxentries, ui32 maxiters, ui32 maxlen, ui32 mindiff) {
         /* size_t totalsz = 0;
         for (auto it : Input)
             totalsz += it.size();*/
@@ -128,7 +128,7 @@ namespace NGreedyDict {
                 Clog << Sprintf("%-110s RSS=%" PRIu32 "M", mess.data(), (ui32)(TRusage::Get().MaxRss >> 20)) << Endl;
             }
 
-            ui32 diff = BuildNextGeneration(maxentries);
+            ui32 diff = BuildNextGeneration(maxentries, maxlen);
 
             if (Current->size() == maxentries && diff < mindiff)
                 break;
diff --git a/library/cpp/codecs/greedy_dict/gd_builder.h b/library/cpp/codecs/greedy_dict/gd_builder.h
index b8e9a5e37be..8b20007425c 100644
--- a/library/cpp/codecs/greedy_dict/gd_builder.h
+++ b/library/cpp/codecs/greedy_dict/gd_builder.h
@@ -69,11 +69,11 @@ namespace NGreedyDict {
             return std::move(Current);
         }
 
-        ui32 /*iters*/ Build(ui32 maxentries, ui32 maxiters = 16, ui32 mindiff = 10);
+        ui32 /*iters*/ Build(ui32 maxentries, ui32 maxiters = 16, ui32 maxlen = -1, ui32 mindiff = 10);
 
     public:
         void RebuildCounts(ui32 maxcand, bool final);
-        ui32 /*diff size*/ BuildNextGeneration(ui32 maxent);
+        ui32 /*diff size*/ BuildNextGeneration(ui32 maxent, ui32 maxlen);
 
         static bool IsCompound(ui64 ent) {
             return ent & 0xFFFFFFFF00000000ULL;
diff --git a/library/cpp/codecs/solar_codec.cpp b/library/cpp/codecs/solar_codec.cpp
index d0692fe2a46..916bbbd5d18 100644
--- a/library/cpp/codecs/solar_codec.cpp
+++ b/library/cpp/codecs/solar_codec.cpp
@@ -17,6 +17,8 @@ namespace NCodecs {
     void TSolarCodec::DoLearn(ISequenceReader& r) {
         using namespace NGreedyDict;
 
+        const ui32 maxlen = Max<ui32>() / Max<ui32>(MaxEntries, 1);
+
         Decoder.clear();
         Pool.Clear();
 
@@ -34,7 +36,7 @@ namespace NCodecs {
             {
                 TDictBuilder b(Settings);
                 b.SetInput(bufs);
-                b.Build(MaxEntries, MaxIterations);
+                b.Build(MaxEntries, MaxIterations, maxlen);
                 set = b.ReleaseEntrySet();
             }
 
@@ -47,7 +49,8 @@ namespace NCodecs {
             tmp.reserve(set->size());
 
             for (const auto& it : *set) {
-                tmp.push_back(std::make_pair(-it.Score, TStringBuf(it.Str).Trunc(Max<ui32>() / Max<ui32>(MaxEntries, 1))));
+                Y_ENSURE(it.Str.Size() <= maxlen);
+                tmp.push_back(std::make_pair(-it.Score, it.Str));
             }
 
             Sort(tmp.begin(), tmp.end());
```
