aboutsummaryrefslogblamecommitdiffstats
path: root/library/cpp/codecs/greedy_dict/gd_builder.h
blob: 8b20007425cc27bd2ad2d939d623ef48c253161f (plain) (tree)
1
2
3
4
5
6
7
8






                              

                                                  
 



                                                   
 


                                                                                                  
 

                                                                                      
 
                                   
 
                                                
 
                               
 
                                
 




                                                                            
 

                                              
 

                                                   
 

                                       
 

                                                   
 

                               
 

                                           
 

                                              
 
                                                                                                       
 
                                                     
                                                                         
 

                                               
 




                                    
 


                                                   
 
#pragma once

#include "gd_entry.h"

#include <util/generic/hash.h>
#include <util/random/fast.h>

namespace NGreedyDict {
    /// Tunable parameters stored by TDictBuilder (see its Settings member).
    /// How each value is applied is decided by the builder implementation,
    /// which is not part of this header.
    struct TBuildSettings {
        /// Statistical test selector; the enum is declared outside this header.
        EEntryStatTest StatTest{EST_SIMPLE_NORM};
        /// Entry scoring selector; the enum is declared outside this header.
        EEntryScore Score{ES_LEN_SIMPLE};

        /// NOTE(review): presumably a lower bound on the p-value produced by StatTest -- confirm in the implementation.
        float MinPValue{0.75f};
        /// NOTE(review): presumably a lower bound on an absolute occurrence count -- confirm in the implementation.
        ui32 MinAbsCount{10};
        /// Growth limit, expressed as a multiple of maxentries.
        ui32 GrowLimit{10};
        /// Verbosity flag, off by default.
        bool Verbose{false};
    };

    /**
     * Dictionary builder state: the input buffers, the current entry set,
     * the compound-count table with its memory pool, the candidate list and
     * the build settings.
     *
     * Only trivial accessors and the compound-key helpers are defined here;
     * Build(), RebuildCounts() and BuildNextGeneration() are defined elsewhere.
     *
     * Compound key layout (see Compose/Prev/Next/IsCompound):
     *   high 32 bits = prev + 1, low 32 bits = next.
     * A key whose high 32 bits are all zero is not compound.
     */
    class TDictBuilder {
    private:
        // ui64 key -> ui32 value hash map using TPoolAllocator.
        // NOTE(review): keys are presumably the compound keys produced by Compose() -- confirm.
        using TCompoundCounts = THashMap<ui64, ui32, THash<ui64>, TEqualTo<ui64>, TPoolAllocator>;
        // (float, ui64) pair; NOTE(review): presumably (score, key) -- confirm in the implementation.
        using TCandidate = std::pair<float, ui64>;
        using TCandidates = TVector<TCandidate>;

        // Random generator constructed from fixed constants.
        TFastRng64 Rng{0x1a5d0ac170565c1c, 0x0be7bc27, 0x6235f6f57820aa0d, 0xafdc7fb};
        // Copy of the buffers passed to SetInput().
        TStringBufs Input;

        // Entry set exposed through EntrySet() / ReleaseEntrySet().
        THolder<TEntrySet> Current;

        // Declared before CompoundCounts, so it is constructed first and destroyed last.
        // NOTE(review): presumably CompoundCounts allocates from this pool -- confirm.
        TMemoryPool CompoundCountsPool;
        THolder<TCompoundCounts> CompoundCounts;

        TCandidates Candidates;

        TBuildSettings Settings;

    public:
        /// Creates a builder with the given settings (default-constructed settings if omitted).
        /// NOTE(review): 8112 is presumably the pool's initial size in bytes -- confirm against TMemoryPool.
        TDictBuilder(const TBuildSettings& s = TBuildSettings{})
            : CompoundCountsPool(8112, TMemoryPool::TLinearGrow::Instance())
            , Settings(s)
        {
        }

        /// Replaces the stored input with a copy of `in`.
        void SetInput(const TStringBufs& in) {
            Input = in;
        }

        /// Read-only access to the current settings.
        const TBuildSettings& GetSettings() const {
            return Settings;
        }

        /// Mutable access to the current settings.
        TBuildSettings& GetSettings() {
            return Settings;
        }

        /// Overwrites the current settings with a copy of `s`.
        void SetSettings(const TBuildSettings& s) {
            Settings = s;
        }

        /// Mutable reference to the current entry set; dereferences Current.
        TEntrySet& EntrySet() {
            return *Current;
        }

        /// Read-only reference to the current entry set; dereferences Current.
        const TEntrySet& EntrySet() const {
            return *Current;
        }

        /// Moves the current entry set out of the builder and hands it to the caller.
        THolder<TEntrySet> ReleaseEntrySet() {
            return std::move(Current);
        }

        /// Declared here, defined elsewhere. Returns the iteration count
        /// (per the inline annotation). The default `maxlen = -1` is converted
        /// to ui32 (NOTE(review): presumably meaning "no length limit" -- confirm).
        ui32 /*iters*/ Build(ui32 maxentries, ui32 maxiters = 16, ui32 maxlen = -1, ui32 mindiff = 10);

    public:
        // Individual steps, declared here and defined elsewhere.
        void RebuildCounts(ui32 maxcand, bool final);
        /// Returns the diff size (per the inline annotation).
        ui32 /*diff size*/ BuildNextGeneration(ui32 maxent, ui32 maxlen);

        /// True when any of the high 32 bits of `ent` is set.
        static bool IsCompound(ui64 ent) {
            return (ent >> 32) != 0;
        }

        /// Low 32 bits of `ent`.
        static ui32 Next(ui64 ent) {
            return static_cast<ui32>(ent);
        }

        /// High 32 bits of `ent` minus one, truncated to ui32
        /// (a zero high half therefore yields the all-ones value).
        static ui32 Prev(ui64 ent) {
            const ui64 highHalf = ent >> 32;
            return static_cast<ui32>(highHalf - 1);
        }

        /// Packs `prev + 1` into the high 32 bits and `next` into the low 32 bits.
        static ui64 Compose(ui32 prev, ui32 next) {
            const ui64 highHalf = (static_cast<ui64>(prev) + 1) << 32;
            return highHalf | next;
        }
    };

}