aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/codecs/greedy_dict/gd_builder.h
blob: ab0057e1ca7cd848b212103f8491c640594a856b (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#pragma once

#include "gd_entry.h"

#include <util/generic/hash.h>
#include <util/random/fast.h>

namespace NGreedyDict {
    struct TBuildSettings { 
        EEntryStatTest StatTest = EST_SIMPLE_NORM; 
        EEntryScore Score = ES_LEN_SIMPLE; 

        float MinPValue = 0.75; 
        ui32 MinAbsCount = 10; 
        ui32 GrowLimit = 10; // times of maxentries 
        bool Verbose = false; 
    }; 

    class TDictBuilder { 
        using TCompoundCounts = THashMap<ui64, ui32, THash<ui64>, TEqualTo<ui64>, TPoolAllocator>; 
        using TCandidate = std::pair<float, ui64>; 
        using TCandidates = TVector<TCandidate>; 

    private: 
        TFastRng64 Rng{0x1a5d0ac170565c1c, 0x0be7bc27, 0x6235f6f57820aa0d, 0xafdc7fb}; 
        TStringBufs Input; 

        THolder<TEntrySet> Current; 

        TMemoryPool CompoundCountsPool; 
        THolder<TCompoundCounts> CompoundCounts; 

        TCandidates Candidates; 

        TBuildSettings Settings; 

    public: 
        TDictBuilder(const TBuildSettings& s = TBuildSettings()) 
            : CompoundCountsPool(8112, TMemoryPool::TLinearGrow::Instance()) 
            , Settings(s) 
        { 
        } 

        void SetInput(const TStringBufs& in) { 
            Input = in; 
        } 

        const TBuildSettings& GetSettings() const { 
            return Settings; 
        } 

        TBuildSettings& GetSettings() { 
            return Settings; 
        } 

        void SetSettings(const TBuildSettings& s) { 
            Settings = s; 
        } 

        TEntrySet& EntrySet() { 
            return *Current; 
        } 

        const TEntrySet& EntrySet() const { 
            return *Current; 
        } 

        THolder<TEntrySet> ReleaseEntrySet() { 
            return std::move(Current); 
        } 

        ui32 /*iters*/ Build(ui32 maxentries, ui32 maxiters = 16, ui32 mindiff = 10); 

    public: 
        void RebuildCounts(ui32 maxcand, bool final); 
        ui32 /*diff size*/ BuildNextGeneration(ui32 maxent); 

        static bool IsCompound(ui64 ent) { 
            return ent & 0xFFFFFFFF00000000ULL; 
        } 

        static ui32 Next(ui64 ent) { 
            return ent; 
        } 
        static ui32 Prev(ui64 ent) { 
            return (ent >> 32) - 1; 
        } 

        static ui64 Compose(ui32 prev, ui32 next) { 
            return ((prev + 1ULL) << 32) | next; 
        } 
    }; 

}