diff options
author | Anton Samokhvalov <pg83@yandex.ru> | 2022-02-10 16:45:17 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:45:17 +0300 |
commit | d3a398281c6fd1d3672036cb2d63f842d2cb28c5 (patch) | |
tree | dd4bd3ca0f36b817e96812825ffaf10d645803f2 /library/cpp/codecs/solar_codec.cpp | |
parent | 72cb13b4aff9bc9cf22e49251bc8fd143f82538f (diff) | |
download | ydb-d3a398281c6fd1d3672036cb2d63f842d2cb28c5.tar.gz |
Restoring authorship annotation for Anton Samokhvalov <pg83@yandex.ru>. Commit 2 of 2.
Diffstat (limited to 'library/cpp/codecs/solar_codec.cpp')
-rw-r--r-- | library/cpp/codecs/solar_codec.cpp | 170 |
1 files changed, 85 insertions, 85 deletions
diff --git a/library/cpp/codecs/solar_codec.cpp b/library/cpp/codecs/solar_codec.cpp index 6c08b9e7bd..d0692fe2a4 100644 --- a/library/cpp/codecs/solar_codec.cpp +++ b/library/cpp/codecs/solar_codec.cpp @@ -9,125 +9,125 @@ #include <util/ysaveload.h> namespace NCodecs { - static inline ui32 Append(TBuffer& pool, TStringBuf data) { + static inline ui32 Append(TBuffer& pool, TStringBuf data) { pool.Append(data.data(), data.size()); - return pool.Size(); - } + return pool.Size(); + } + + void TSolarCodec::DoLearn(ISequenceReader& r) { + using namespace NGreedyDict; - void TSolarCodec::DoLearn(ISequenceReader& r) { - using namespace NGreedyDict; + Decoder.clear(); + Pool.Clear(); - Decoder.clear(); - Pool.Clear(); + THolder<TEntrySet> set; - THolder<TEntrySet> set; + { + TMemoryPool pool(8112, TMemoryPool::TLinearGrow::Instance()); + TStringBufs bufs; - { - TMemoryPool pool(8112, TMemoryPool::TLinearGrow::Instance()); - TStringBufs bufs; + TStringBuf m; + while (r.NextRegion(m)) { + bufs.push_back(pool.AppendString(m)); + } - TStringBuf m; - while (r.NextRegion(m)) { - bufs.push_back(pool.AppendString(m)); - } + { + TDictBuilder b(Settings); + b.SetInput(bufs); + b.Build(MaxEntries, MaxIterations); - { - TDictBuilder b(Settings); - b.SetInput(bufs); - b.Build(MaxEntries, MaxIterations); - - set = b.ReleaseEntrySet(); - } + set = b.ReleaseEntrySet(); + } } - set->SetScores(ES_LEN_COUNT); - + set->SetScores(ES_LEN_COUNT); + { - TVector<std::pair<float, TStringBuf>> tmp; - tmp.reserve(set->size()); + TVector<std::pair<float, TStringBuf>> tmp; + tmp.reserve(set->size()); - for (const auto& it : *set) { - tmp.push_back(std::make_pair(-it.Score, TStringBuf(it.Str).Trunc(Max<ui32>() / Max<ui32>(MaxEntries, 1)))); - } + for (const auto& it : *set) { + tmp.push_back(std::make_pair(-it.Score, TStringBuf(it.Str).Trunc(Max<ui32>() / Max<ui32>(MaxEntries, 1)))); + } - Sort(tmp.begin(), tmp.end()); + Sort(tmp.begin(), tmp.end()); - Decoder.reserve(tmp.size() + 1); - Decoder.push_back(0); + Decoder.reserve(tmp.size() + 1); + Decoder.push_back(0); - for (const auto& it : tmp) { - Y_ENSURE(Decoder.back() == Pool.Size(), "learning invariant failed"); - ui32 endoff = Append(Pool, it.second); - Decoder.push_back(endoff); - } + for (const auto& it : tmp) { + Y_ENSURE(Decoder.back() == Pool.Size(), "learning invariant failed"); + ui32 endoff = Append(Pool, it.second); + Decoder.push_back(endoff); + } } - Pool.ShrinkToFit(); - Decoder.shrink_to_fit(); + Pool.ShrinkToFit(); + Decoder.shrink_to_fit(); - TBufferOutput bout; + TBufferOutput bout; - { - TVector<std::pair<TStringBuf, ui32>> tmp2; - tmp2.reserve(Decoder.size()); + { + TVector<std::pair<TStringBuf, ui32>> tmp2; + tmp2.reserve(Decoder.size()); - for (ui32 i = 1, sz = Decoder.size(); i < sz; ++i) { - TStringBuf s = DoDecode(i); - tmp2.push_back(std::make_pair(s, i - 1)); + for (ui32 i = 1, sz = Decoder.size(); i < sz; ++i) { + TStringBuf s = DoDecode(i); + tmp2.push_back(std::make_pair(s, i - 1)); Y_ENSURE(s.size() == (Decoder[i] - Decoder[i - 1]), "learning invariant failed"); - } + } - Sort(tmp2.begin(), tmp2.end()); + Sort(tmp2.begin(), tmp2.end()); - { - TEncoder::TBuilder builder(CTBF_PREFIX_GROUPED); - for (const auto& it : tmp2) { + { + TEncoder::TBuilder builder(CTBF_PREFIX_GROUPED); + for (const auto& it : tmp2) { builder.Add(it.first.data(), it.first.size(), it.second); - } + } - builder.Save(bout); + builder.Save(bout); } } - Encoder.Init(TBlob::FromBuffer(bout.Buffer())); + Encoder.Init(TBlob::FromBuffer(bout.Buffer())); + } + + void TSolarCodec::Save(IOutputStream* out) const { + TBlob b = Encoder.Data(); + ::Save(out, (ui32)b.Size()); + out->Write(b.Data(), b.Size()); } - void TSolarCodec::Save(IOutputStream* out) const { - TBlob b = Encoder.Data(); - ::Save(out, (ui32)b.Size()); - out->Write(b.Data(), b.Size()); - } - - void TSolarCodec::Load(IInputStream* in) { - ui32 sz; - ::Load(in, sz); - TLengthLimitedInput lin(in, sz); - Encoder.Init(TBlob::FromStream(lin)); - Pool.Clear(); - Decoder.clear(); - - TVector<std::pair<ui32, TString>> tmp; - - ui32 poolsz = 0; - for (TEncoder::TConstIterator it = Encoder.Begin(); it != Encoder.End(); ++it) { - const TString& s = it.GetKey(); - tmp.push_back(std::make_pair(it.GetValue(), !s ? TString("\0", 1) : s)); + void TSolarCodec::Load(IInputStream* in) { + ui32 sz; + ::Load(in, sz); + TLengthLimitedInput lin(in, sz); + Encoder.Init(TBlob::FromStream(lin)); + Pool.Clear(); + Decoder.clear(); + + TVector<std::pair<ui32, TString>> tmp; + + ui32 poolsz = 0; + for (TEncoder::TConstIterator it = Encoder.Begin(); it != Encoder.End(); ++it) { + const TString& s = it.GetKey(); + tmp.push_back(std::make_pair(it.GetValue(), !s ? TString("\0", 1) : s)); poolsz += Max<ui32>(s.size(), 1); - } + } - Sort(tmp.begin(), tmp.end()); + Sort(tmp.begin(), tmp.end()); - Pool.Reserve(poolsz); - Decoder.reserve(tmp.size() + 1); - Decoder.push_back(0); + Pool.Reserve(poolsz); + Decoder.reserve(tmp.size() + 1); + Decoder.push_back(0); - for (ui32 i = 0, sz2 = tmp.size(); i < sz2; ++i) { - Y_ENSURE(i == tmp[i].first, "oops! " << i << " " << tmp[i].first); - Decoder.push_back(Append(Pool, tmp[i].second)); - } + for (ui32 i = 0, sz2 = tmp.size(); i < sz2; ++i) { + Y_ENSURE(i == tmp[i].first, "oops! " << i << " " << tmp[i].first); + Decoder.push_back(Append(Pool, tmp[i].second)); + } - Pool.ShrinkToFit(); - Decoder.shrink_to_fit(); + Pool.ShrinkToFit(); + Decoder.shrink_to_fit(); } } |