diff options
author | Ruslan Kovalev <ruslan.a.kovalev@gmail.com> | 2022-02-10 16:46:44 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:46:44 +0300 |
commit | 59e19371de37995fcb36beb16cd6ec030af960bc (patch) | |
tree | fa68e36093ebff8b805462e9e6d331fe9d348214 /library/cpp/codecs/solar_codec.cpp | |
parent | 89db6fe2fe2c32d2a832ddfeb04e8d078e301084 (diff) | |
download | ydb-59e19371de37995fcb36beb16cd6ec030af960bc.tar.gz |
Restoring authorship annotation for Ruslan Kovalev <ruslan.a.kovalev@gmail.com>. Commit 1 of 2.
Diffstat (limited to 'library/cpp/codecs/solar_codec.cpp')
-rw-r--r-- | library/cpp/codecs/solar_codec.cpp | 88 |
1 files changed, 44 insertions, 44 deletions
diff --git a/library/cpp/codecs/solar_codec.cpp b/library/cpp/codecs/solar_codec.cpp index d0692fe2a4..088bdead19 100644 --- a/library/cpp/codecs/solar_codec.cpp +++ b/library/cpp/codecs/solar_codec.cpp @@ -1,36 +1,36 @@ -#include "solar_codec.h" - +#include "solar_codec.h" + #include <library/cpp/codecs/greedy_dict/gd_builder.h> - + #include <library/cpp/containers/comptrie/comptrie_builder.h> #include <library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h> -#include <util/stream/length.h> -#include <util/string/printf.h> -#include <util/ysaveload.h> - -namespace NCodecs { +#include <util/stream/length.h> +#include <util/string/printf.h> +#include <util/ysaveload.h> + +namespace NCodecs { static inline ui32 Append(TBuffer& pool, TStringBuf data) { pool.Append(data.data(), data.size()); return pool.Size(); } - + void TSolarCodec::DoLearn(ISequenceReader& r) { using namespace NGreedyDict; - + Decoder.clear(); Pool.Clear(); - + THolder<TEntrySet> set; - + { TMemoryPool pool(8112, TMemoryPool::TLinearGrow::Instance()); TStringBufs bufs; - + TStringBuf m; while (r.NextRegion(m)) { bufs.push_back(pool.AppendString(m)); } - + { TDictBuilder b(Settings); b.SetInput(bufs); @@ -38,66 +38,66 @@ namespace NCodecs { set = b.ReleaseEntrySet(); } - } - + } + set->SetScores(ES_LEN_COUNT); - { + { TVector<std::pair<float, TStringBuf>> tmp; tmp.reserve(set->size()); - + for (const auto& it : *set) { tmp.push_back(std::make_pair(-it.Score, TStringBuf(it.Str).Trunc(Max<ui32>() / Max<ui32>(MaxEntries, 1)))); } - + Sort(tmp.begin(), tmp.end()); - + Decoder.reserve(tmp.size() + 1); Decoder.push_back(0); - + for (const auto& it : tmp) { Y_ENSURE(Decoder.back() == Pool.Size(), "learning invariant failed"); ui32 endoff = Append(Pool, it.second); Decoder.push_back(endoff); } - } - + } + Pool.ShrinkToFit(); Decoder.shrink_to_fit(); - + TBufferOutput bout; - + { TVector<std::pair<TStringBuf, ui32>> tmp2; tmp2.reserve(Decoder.size()); - + for (ui32 i = 1, sz = Decoder.size(); i < sz; ++i) { TStringBuf s = DoDecode(i); tmp2.push_back(std::make_pair(s, i - 1)); Y_ENSURE(s.size() == (Decoder[i] - Decoder[i - 1]), "learning invariant failed"); } - + Sort(tmp2.begin(), tmp2.end()); - + { TEncoder::TBuilder builder(CTBF_PREFIX_GROUPED); for (const auto& it : tmp2) { builder.Add(it.first.data(), it.first.size(), it.second); } - + builder.Save(bout); - } - } - + } + } + Encoder.Init(TBlob::FromBuffer(bout.Buffer())); - } - + } + void TSolarCodec::Save(IOutputStream* out) const { TBlob b = Encoder.Data(); ::Save(out, (ui32)b.Size()); out->Write(b.Data(), b.Size()); } - + void TSolarCodec::Load(IInputStream* in) { ui32 sz; ::Load(in, sz); @@ -105,29 +105,29 @@ namespace NCodecs { Encoder.Init(TBlob::FromStream(lin)); Pool.Clear(); Decoder.clear(); - + TVector<std::pair<ui32, TString>> tmp; - + ui32 poolsz = 0; for (TEncoder::TConstIterator it = Encoder.Begin(); it != Encoder.End(); ++it) { const TString& s = it.GetKey(); tmp.push_back(std::make_pair(it.GetValue(), !s ? TString("\0", 1) : s)); poolsz += Max<ui32>(s.size(), 1); } - + Sort(tmp.begin(), tmp.end()); - + Pool.Reserve(poolsz); Decoder.reserve(tmp.size() + 1); Decoder.push_back(0); - + for (ui32 i = 0, sz2 = tmp.size(); i < sz2; ++i) { Y_ENSURE(i == tmp[i].first, "oops! " << i << " " << tmp[i].first); Decoder.push_back(Append(Pool, tmp[i].second)); } - + Pool.ShrinkToFit(); Decoder.shrink_to_fit(); - } - -} + } + +} |