1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
|
#pragma once
#include "gd_entry.h"
#include <util/generic/hash.h>
#include <util/random/fast.h>
namespace NGreedyDict {
struct TBuildSettings {
EEntryStatTest StatTest = EST_SIMPLE_NORM;
EEntryScore Score = ES_LEN_SIMPLE;
float MinPValue = 0.75;
ui32 MinAbsCount = 10;
ui32 GrowLimit = 10; // times of maxentries
bool Verbose = false;
};
class TDictBuilder {
using TCompoundCounts = THashMap<ui64, ui32, THash<ui64>, TEqualTo<ui64>, TPoolAllocator>;
using TCandidate = std::pair<float, ui64>;
using TCandidates = TVector<TCandidate>;
private:
TFastRng64 Rng{0x1a5d0ac170565c1c, 0x0be7bc27, 0x6235f6f57820aa0d, 0xafdc7fb};
TStringBufs Input;
THolder<TEntrySet> Current;
TMemoryPool CompoundCountsPool;
THolder<TCompoundCounts> CompoundCounts;
TCandidates Candidates;
TBuildSettings Settings;
public:
TDictBuilder(const TBuildSettings& s = TBuildSettings())
: CompoundCountsPool(8112, TMemoryPool::TLinearGrow::Instance())
, Settings(s)
{
}
void SetInput(const TStringBufs& in) {
Input = in;
}
const TBuildSettings& GetSettings() const {
return Settings;
}
TBuildSettings& GetSettings() {
return Settings;
}
void SetSettings(const TBuildSettings& s) {
Settings = s;
}
TEntrySet& EntrySet() {
return *Current;
}
const TEntrySet& EntrySet() const {
return *Current;
}
THolder<TEntrySet> ReleaseEntrySet() {
return std::move(Current);
}
ui32 /*iters*/ Build(ui32 maxentries, ui32 maxiters = 16, ui32 mindiff = 10);
public:
void RebuildCounts(ui32 maxcand, bool final);
ui32 /*diff size*/ BuildNextGeneration(ui32 maxent);
static bool IsCompound(ui64 ent) {
return ent & 0xFFFFFFFF00000000ULL;
}
static ui32 Next(ui64 ent) {
return ent;
}
static ui32 Prev(ui64 ent) {
return (ent >> 32) - 1;
}
static ui64 Compose(ui32 prev, ui32 next) {
return ((prev + 1ULL) << 32) | next;
}
};
}
|