aboutsummaryrefslogblamecommitdiffstats
path: root/library/cpp/codecs/solar_codec.cpp
blob: d0692fe2a46d6ab8908b9074429e07b94978bc4f (plain) (tree)
1
2
3
4
5
6
7
8
9
10
11
                        
                                                      
 
                                                                     



                               
                                                               
                                              
                           
 
                                                   
 
                        
 
                               
 

                                                                         
 


                                                     
 


                                                   
 
                                          
         
                                     
         
                                                      
 

                                                                                                                           
 
                                         
 
                                            
 



                                                                                     
         
                                
 
                           
 

                                                      
 

                                                                
                                                                                                 
             
 
                                           
 

                                                                
                                                                             
                 
 
                                   

             
                                                       
     


                                                      
     
 





                                              
 
                                              
 


                                                                                        
                                             
         
 
                                     
 

                                        
 


                                                                              
 
                                

     
#include "solar_codec.h"

#include <library/cpp/codecs/greedy_dict/gd_builder.h>

#include <library/cpp/containers/comptrie/comptrie_builder.h>
#include <library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h>
#include <util/stream/length.h>
#include <util/string/printf.h>
#include <util/ysaveload.h>

namespace NCodecs {
    static inline ui32 Append(TBuffer& pool, TStringBuf data) {
        pool.Append(data.data(), data.size());
        return pool.Size();
    }

    void TSolarCodec::DoLearn(ISequenceReader& r) {
        using namespace NGreedyDict;

        Decoder.clear();
        Pool.Clear();

        THolder<TEntrySet> set;

        {
            TMemoryPool pool(8112, TMemoryPool::TLinearGrow::Instance());
            TStringBufs bufs;

            TStringBuf m;
            while (r.NextRegion(m)) {
                bufs.push_back(pool.AppendString(m));
            }

            {
                TDictBuilder b(Settings);
                b.SetInput(bufs);
                b.Build(MaxEntries, MaxIterations);

                set = b.ReleaseEntrySet();
            }
        }

        set->SetScores(ES_LEN_COUNT);

        {
            TVector<std::pair<float, TStringBuf>> tmp;
            tmp.reserve(set->size());

            for (const auto& it : *set) {
                tmp.push_back(std::make_pair(-it.Score, TStringBuf(it.Str).Trunc(Max<ui32>() / Max<ui32>(MaxEntries, 1))));
            }

            Sort(tmp.begin(), tmp.end());

            Decoder.reserve(tmp.size() + 1);
            Decoder.push_back(0);

            for (const auto& it : tmp) {
                Y_ENSURE(Decoder.back() == Pool.Size(), "learning invariant failed");
                ui32 endoff = Append(Pool, it.second);
                Decoder.push_back(endoff);
            }
        }

        Pool.ShrinkToFit();
        Decoder.shrink_to_fit();

        TBufferOutput bout;

        {
            TVector<std::pair<TStringBuf, ui32>> tmp2;
            tmp2.reserve(Decoder.size());

            for (ui32 i = 1, sz = Decoder.size(); i < sz; ++i) {
                TStringBuf s = DoDecode(i);
                tmp2.push_back(std::make_pair(s, i - 1));
                Y_ENSURE(s.size() == (Decoder[i] - Decoder[i - 1]), "learning invariant failed");
            }

            Sort(tmp2.begin(), tmp2.end());

            {
                TEncoder::TBuilder builder(CTBF_PREFIX_GROUPED);
                for (const auto& it : tmp2) {
                    builder.Add(it.first.data(), it.first.size(), it.second);
                }

                builder.Save(bout);
            }
        }

        Encoder.Init(TBlob::FromBuffer(bout.Buffer()));
    }

    void TSolarCodec::Save(IOutputStream* out) const {
        TBlob b = Encoder.Data();
        ::Save(out, (ui32)b.Size());
        out->Write(b.Data(), b.Size());
    }

    void TSolarCodec::Load(IInputStream* in) {
        ui32 sz;
        ::Load(in, sz);
        TLengthLimitedInput lin(in, sz);
        Encoder.Init(TBlob::FromStream(lin));
        Pool.Clear();
        Decoder.clear();

        TVector<std::pair<ui32, TString>> tmp;

        ui32 poolsz = 0;
        for (TEncoder::TConstIterator it = Encoder.Begin(); it != Encoder.End(); ++it) {
            const TString& s = it.GetKey();
            tmp.push_back(std::make_pair(it.GetValue(), !s ? TString("\0", 1) : s));
            poolsz += Max<ui32>(s.size(), 1);
        }

        Sort(tmp.begin(), tmp.end());

        Pool.Reserve(poolsz);
        Decoder.reserve(tmp.size() + 1);
        Decoder.push_back(0);

        for (ui32 i = 0, sz2 = tmp.size(); i < sz2; ++i) {
            Y_ENSURE(i == tmp[i].first, "oops! " << i << " " << tmp[i].first);
            Decoder.push_back(Append(Pool, tmp[i].second));
        }

        Pool.ShrinkToFit();
        Decoder.shrink_to_fit();
    }

}