diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/codecs/solar_codec.cpp | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/codecs/solar_codec.cpp')
-rw-r--r-- | library/cpp/codecs/solar_codec.cpp | 133 |
1 files changed, 133 insertions, 0 deletions
diff --git a/library/cpp/codecs/solar_codec.cpp b/library/cpp/codecs/solar_codec.cpp new file mode 100644 index 00000000000..d0692fe2a46 --- /dev/null +++ b/library/cpp/codecs/solar_codec.cpp @@ -0,0 +1,133 @@ +#include "solar_codec.h" + +#include <library/cpp/codecs/greedy_dict/gd_builder.h> + +#include <library/cpp/containers/comptrie/comptrie_builder.h> +#include <library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h> +#include <util/stream/length.h> +#include <util/string/printf.h> +#include <util/ysaveload.h> + +namespace NCodecs { + static inline ui32 Append(TBuffer& pool, TStringBuf data) { + pool.Append(data.data(), data.size()); + return pool.Size(); + } + + void TSolarCodec::DoLearn(ISequenceReader& r) { + using namespace NGreedyDict; + + Decoder.clear(); + Pool.Clear(); + + THolder<TEntrySet> set; + + { + TMemoryPool pool(8112, TMemoryPool::TLinearGrow::Instance()); + TStringBufs bufs; + + TStringBuf m; + while (r.NextRegion(m)) { + bufs.push_back(pool.AppendString(m)); + } + + { + TDictBuilder b(Settings); + b.SetInput(bufs); + b.Build(MaxEntries, MaxIterations); + + set = b.ReleaseEntrySet(); + } + } + + set->SetScores(ES_LEN_COUNT); + + { + TVector<std::pair<float, TStringBuf>> tmp; + tmp.reserve(set->size()); + + for (const auto& it : *set) { + tmp.push_back(std::make_pair(-it.Score, TStringBuf(it.Str).Trunc(Max<ui32>() / Max<ui32>(MaxEntries, 1)))); + } + + Sort(tmp.begin(), tmp.end()); + + Decoder.reserve(tmp.size() + 1); + Decoder.push_back(0); + + for (const auto& it : tmp) { + Y_ENSURE(Decoder.back() == Pool.Size(), "learning invariant failed"); + ui32 endoff = Append(Pool, it.second); + Decoder.push_back(endoff); + } + } + + Pool.ShrinkToFit(); + Decoder.shrink_to_fit(); + + TBufferOutput bout; + + { + TVector<std::pair<TStringBuf, ui32>> tmp2; + tmp2.reserve(Decoder.size()); + + for (ui32 i = 1, sz = Decoder.size(); i < sz; ++i) { + TStringBuf s = DoDecode(i); + tmp2.push_back(std::make_pair(s, i - 1)); + Y_ENSURE(s.size() == (Decoder[i] - Decoder[i - 1]), "learning invariant failed"); + } + + Sort(tmp2.begin(), tmp2.end()); + + { + TEncoder::TBuilder builder(CTBF_PREFIX_GROUPED); + for (const auto& it : tmp2) { + builder.Add(it.first.data(), it.first.size(), it.second); + } + + builder.Save(bout); + } + } + + Encoder.Init(TBlob::FromBuffer(bout.Buffer())); + } + + void TSolarCodec::Save(IOutputStream* out) const { + TBlob b = Encoder.Data(); + ::Save(out, (ui32)b.Size()); + out->Write(b.Data(), b.Size()); + } + + void TSolarCodec::Load(IInputStream* in) { + ui32 sz; + ::Load(in, sz); + TLengthLimitedInput lin(in, sz); + Encoder.Init(TBlob::FromStream(lin)); + Pool.Clear(); + Decoder.clear(); + + TVector<std::pair<ui32, TString>> tmp; + + ui32 poolsz = 0; + for (TEncoder::TConstIterator it = Encoder.Begin(); it != Encoder.End(); ++it) { + const TString& s = it.GetKey(); + tmp.push_back(std::make_pair(it.GetValue(), !s ? TString("\0", 1) : s)); + poolsz += Max<ui32>(s.size(), 1); + } + + Sort(tmp.begin(), tmp.end()); + + Pool.Reserve(poolsz); + Decoder.reserve(tmp.size() + 1); + Decoder.push_back(0); + + for (ui32 i = 0, sz2 = tmp.size(); i < sz2; ++i) { + Y_ENSURE(i == tmp[i].first, "oops! " << i << " " << tmp[i].first); + Decoder.push_back(Append(Pool, tmp[i].second)); + } + + Pool.ShrinkToFit(); + Decoder.shrink_to_fit(); + } + +} |