diff options
author | Anton Samokhvalov <pg83@yandex.ru> | 2022-02-10 16:45:15 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:45:15 +0300 |
commit | 72cb13b4aff9bc9cf22e49251bc8fd143f82538f (patch) | |
tree | da2c34829458c7d4e74bdfbdf85dff449e9e7fb8 /library/cpp/codecs/greedy_dict | |
parent | 778e51ba091dc39e7b7fcab2b9cf4dbedfb6f2b5 (diff) | |
download | ydb-72cb13b4aff9bc9cf22e49251bc8fd143f82538f.tar.gz |
Restoring authorship annotation for Anton Samokhvalov <pg83@yandex.ru>. Commit 1 of 2.
Diffstat (limited to 'library/cpp/codecs/greedy_dict')
-rw-r--r-- | library/cpp/codecs/greedy_dict/gd_builder.cpp | 190 | ||||
-rw-r--r-- | library/cpp/codecs/greedy_dict/gd_builder.h | 168 | ||||
-rw-r--r-- | library/cpp/codecs/greedy_dict/gd_entry.cpp | 126 | ||||
-rw-r--r-- | library/cpp/codecs/greedy_dict/gd_entry.h | 126 | ||||
-rw-r--r-- | library/cpp/codecs/greedy_dict/gd_stats.h | 116 | ||||
-rw-r--r-- | library/cpp/codecs/greedy_dict/ut/greedy_dict_ut.cpp | 8 |
6 files changed, 367 insertions, 367 deletions
diff --git a/library/cpp/codecs/greedy_dict/gd_builder.cpp b/library/cpp/codecs/greedy_dict/gd_builder.cpp index 561bfbca01..802c721753 100644 --- a/library/cpp/codecs/greedy_dict/gd_builder.cpp +++ b/library/cpp/codecs/greedy_dict/gd_builder.cpp @@ -9,134 +9,134 @@ #include <util/system/rusage.h> namespace NGreedyDict { - void TDictBuilder::RebuildCounts(ui32 maxcand, bool final) { - if (!Current) { + void TDictBuilder::RebuildCounts(ui32 maxcand, bool final) { + if (!Current) { Current = MakeHolder<TEntrySet>(); - Current->InitWithAlpha(); - } + Current->InitWithAlpha(); + } - TEntrySet& set = *Current; + TEntrySet& set = *Current; - for (auto& it : set) - it.Count = 0; + for (auto& it : set) + it.Count = 0; - CompoundCounts = nullptr; - CompoundCountsPool.Clear(); + CompoundCounts = nullptr; + CompoundCountsPool.Clear(); - if (!final) { + if (!final) { CompoundCounts = MakeHolder<TCompoundCounts>(&CompoundCountsPool); - CompoundCounts->reserve(maxcand); - } + CompoundCounts->reserve(maxcand); + } - Shuffle(Input.begin(), Input.end(), Rng); + Shuffle(Input.begin(), Input.end(), Rng); - for (auto str : Input) { - if (!final && CompoundCounts->size() > maxcand) - break; + for (auto str : Input) { + if (!final && CompoundCounts->size() > maxcand) + break; - i32 prev = -1; + i32 prev = -1; - while (!!str) { - TEntry* e = set.FindPrefix(str); - ui32 num = e->Number; + while (!!str) { + TEntry* e = set.FindPrefix(str); + ui32 num = e->Number; - e->Count += 1; - if (!final && prev >= 0) { - (*CompoundCounts)[Compose(prev, num)] += 1; - } + e->Count += 1; + if (!final && prev >= 0) { + (*CompoundCounts)[Compose(prev, num)] += 1; + } - prev = num; - ++set.TotalCount; + prev = num; + ++set.TotalCount; } - } + } - Current->SetModelP(); + Current->SetModelP(); } - ui32 TDictBuilder::BuildNextGeneration(ui32 maxent) { - TAutoPtr<TEntrySet> newset = new TEntrySet; - newset->InitWithAlpha(); - maxent -= newset->size(); - - ui32 additions = 0; - ui32 deletions = 0; - - { - const TEntrySet& set = *Current; - - Candidates.clear(); - const ui32 total = set.TotalCount; - const float minpval = Settings.MinPValue; - const EEntryStatTest test = Settings.StatTest; - const EEntryScore score = Settings.Score; - const ui32 mincnt = Settings.MinAbsCount; - - for (const auto& it : set) { - const TEntry& e = it; - float modelp = e.ModelP; - ui32 cnt = e.Count; - - if (e.HasPrefix() && e.Count > mincnt && StatTest(test, modelp, cnt, total) > minpval) - Candidates.push_back(TCandidate(-Score(score, e.Len(), modelp, cnt, total), it.Number)); - } - - if (!!CompoundCounts) { - for (TCompoundCounts::const_iterator it = CompoundCounts->begin(); it != CompoundCounts->end(); ++it) { - const TEntry& prev = set.Get(Prev(it->first)); - const TEntry& next = set.Get(Next(it->first)); - float modelp = ModelP(prev.Count, next.Count, total); - ui32 cnt = it->second; - if (cnt > mincnt && StatTest(test, modelp, cnt, total) > minpval) - Candidates.push_back(TCandidate(-Score(score, prev.Len() + next.Len(), modelp, cnt, total), it->first)); - } + ui32 TDictBuilder::BuildNextGeneration(ui32 maxent) { + TAutoPtr<TEntrySet> newset = new TEntrySet; + newset->InitWithAlpha(); + maxent -= newset->size(); + + ui32 additions = 0; + ui32 deletions = 0; + + { + const TEntrySet& set = *Current; + + Candidates.clear(); + const ui32 total = set.TotalCount; + const float minpval = Settings.MinPValue; + const EEntryStatTest test = Settings.StatTest; + const EEntryScore score = Settings.Score; + const ui32 mincnt = Settings.MinAbsCount; + + for (const auto& it : set) { + const TEntry& e = it; + float modelp = e.ModelP; + ui32 cnt = e.Count; + + if (e.HasPrefix() && e.Count > mincnt && StatTest(test, modelp, cnt, total) > minpval) + Candidates.push_back(TCandidate(-Score(score, e.Len(), modelp, cnt, total), it.Number)); + } + + if (!!CompoundCounts) { + for (TCompoundCounts::const_iterator it = CompoundCounts->begin(); it != CompoundCounts->end(); ++it) { + const TEntry& prev = set.Get(Prev(it->first)); + const TEntry& next = set.Get(Next(it->first)); + float modelp = ModelP(prev.Count, next.Count, total); + ui32 cnt = it->second; + if (cnt > mincnt && StatTest(test, modelp, cnt, total) > minpval) + Candidates.push_back(TCandidate(-Score(score, prev.Len() + next.Len(), modelp, cnt, total), it->first)); + } } - Sort(Candidates.begin(), Candidates.end()); + Sort(Candidates.begin(), Candidates.end()); - if (Candidates.size() > maxent) - Candidates.resize(maxent); + if (Candidates.size() > maxent) + Candidates.resize(maxent); - for (const auto& candidate : Candidates) { - if (IsCompound(candidate.second)) { - additions++; - newset->Add(set.Get(Prev(candidate.second)).Str, set.Get(Next(candidate.second)).Str); - } else { - newset->Add(set.Get(candidate.second).Str); - } + for (const auto& candidate : Candidates) { + if (IsCompound(candidate.second)) { + additions++; + newset->Add(set.Get(Prev(candidate.second)).Str, set.Get(Next(candidate.second)).Str); + } else { + newset->Add(set.Get(candidate.second).Str); + } } - - deletions = set.size() - (newset->size() - additions); + + deletions = set.size() - (newset->size() - additions); } - Current = newset; - Current->BuildHierarchy(); - return deletions + additions; + Current = newset; + Current->BuildHierarchy(); + return deletions + additions; } - ui32 TDictBuilder::Build(ui32 maxentries, ui32 maxiters, ui32 mindiff) { - size_t totalsz = 0; - for (auto it : Input) + ui32 TDictBuilder::Build(ui32 maxentries, ui32 maxiters, ui32 mindiff) { + size_t totalsz = 0; + for (auto it : Input) totalsz += it.size(); - while (maxiters) { - maxiters--; + while (maxiters) { + maxiters--; - RebuildCounts(maxentries * Settings.GrowLimit, false); + RebuildCounts(maxentries * Settings.GrowLimit, false); - if (Settings.Verbose) { - TString mess = Sprintf("iter:%" PRIu32 " sz:%" PRIu32 " pend:%" PRIu32, maxiters, (ui32)Current->size(), (ui32)CompoundCounts->size()); + if (Settings.Verbose) { + TString mess = Sprintf("iter:%" PRIu32 " sz:%" PRIu32 " pend:%" PRIu32, maxiters, (ui32)Current->size(), (ui32)CompoundCounts->size()); Clog << Sprintf("%-110s RSS=%" PRIu32 "M", mess.data(), (ui32)(TRusage::Get().MaxRss >> 20)) << Endl; - } - - ui32 diff = BuildNextGeneration(maxentries); + } - if (Current->size() == maxentries && diff < mindiff) - break; + ui32 diff = BuildNextGeneration(maxentries); + + if (Current->size() == maxentries && diff < mindiff) + break; } - RebuildCounts(0, true); - Current->SetScores(Settings.Score); - return maxiters; + RebuildCounts(0, true); + Current->SetScores(Settings.Score); + return maxiters; } } diff --git a/library/cpp/codecs/greedy_dict/gd_builder.h b/library/cpp/codecs/greedy_dict/gd_builder.h index b8e9a5e37b..ab0057e1ca 100644 --- a/library/cpp/codecs/greedy_dict/gd_builder.h +++ b/library/cpp/codecs/greedy_dict/gd_builder.h @@ -6,89 +6,89 @@ #include <util/random/fast.h> namespace NGreedyDict { - struct TBuildSettings { - EEntryStatTest StatTest = EST_SIMPLE_NORM; - EEntryScore Score = ES_LEN_SIMPLE; - - float MinPValue = 0.75; - ui32 MinAbsCount = 10; - ui32 GrowLimit = 10; // times of maxentries - bool Verbose = false; - }; - - class TDictBuilder { - using TCompoundCounts = THashMap<ui64, ui32, THash<ui64>, TEqualTo<ui64>, TPoolAllocator>; - using TCandidate = std::pair<float, ui64>; - using TCandidates = TVector<TCandidate>; - - private: - TFastRng64 Rng{0x1a5d0ac170565c1c, 0x0be7bc27, 0x6235f6f57820aa0d, 0xafdc7fb}; - TStringBufs Input; - - THolder<TEntrySet> Current; - - TMemoryPool CompoundCountsPool; - THolder<TCompoundCounts> CompoundCounts; - - TCandidates Candidates; - - TBuildSettings Settings; - - public: - TDictBuilder(const TBuildSettings& s = TBuildSettings()) - : CompoundCountsPool(8112, TMemoryPool::TLinearGrow::Instance()) - , Settings(s) - { - } - - void SetInput(const TStringBufs& in) { - Input = in; - } - - const TBuildSettings& GetSettings() const { - return Settings; - } - - TBuildSettings& GetSettings() { - return Settings; - } - - void SetSettings(const TBuildSettings& s) { - Settings = s; - } - - TEntrySet& EntrySet() { - return *Current; - } - - const TEntrySet& EntrySet() const { - return *Current; - } - - THolder<TEntrySet> ReleaseEntrySet() { - return std::move(Current); - } - - ui32 /*iters*/ Build(ui32 maxentries, ui32 maxiters = 16, ui32 mindiff = 10); - - public: - void RebuildCounts(ui32 maxcand, bool final); - ui32 /*diff size*/ BuildNextGeneration(ui32 maxent); - - static bool IsCompound(ui64 ent) { - return ent & 0xFFFFFFFF00000000ULL; - } - - static ui32 Next(ui64 ent) { - return ent; - } - static ui32 Prev(ui64 ent) { - return (ent >> 32) - 1; - } - - static ui64 Compose(ui32 prev, ui32 next) { - return ((prev + 1ULL) << 32) | next; - } - }; + struct TBuildSettings { + EEntryStatTest StatTest = EST_SIMPLE_NORM; + EEntryScore Score = ES_LEN_SIMPLE; + + float MinPValue = 0.75; + ui32 MinAbsCount = 10; + ui32 GrowLimit = 10; // times of maxentries + bool Verbose = false; + }; + + class TDictBuilder { + using TCompoundCounts = THashMap<ui64, ui32, THash<ui64>, TEqualTo<ui64>, TPoolAllocator>; + using TCandidate = std::pair<float, ui64>; + using TCandidates = TVector<TCandidate>; + + private: + TFastRng64 Rng{0x1a5d0ac170565c1c, 0x0be7bc27, 0x6235f6f57820aa0d, 0xafdc7fb}; + TStringBufs Input; + + THolder<TEntrySet> Current; + + TMemoryPool CompoundCountsPool; + THolder<TCompoundCounts> CompoundCounts; + + TCandidates Candidates; + + TBuildSettings Settings; + + public: + TDictBuilder(const TBuildSettings& s = TBuildSettings()) + : CompoundCountsPool(8112, TMemoryPool::TLinearGrow::Instance()) + , Settings(s) + { + } + + void SetInput(const TStringBufs& in) { + Input = in; + } + + const TBuildSettings& GetSettings() const { + return Settings; + } + + TBuildSettings& GetSettings() { + return Settings; + } + + void SetSettings(const TBuildSettings& s) { + Settings = s; + } + + TEntrySet& EntrySet() { + return *Current; + } + + const TEntrySet& EntrySet() const { + return *Current; + } + + THolder<TEntrySet> ReleaseEntrySet() { + return std::move(Current); + } + + ui32 /*iters*/ Build(ui32 maxentries, ui32 maxiters = 16, ui32 mindiff = 10); + + public: + void RebuildCounts(ui32 maxcand, bool final); + ui32 /*diff size*/ BuildNextGeneration(ui32 maxent); + + static bool IsCompound(ui64 ent) { + return ent & 0xFFFFFFFF00000000ULL; + } + + static ui32 Next(ui64 ent) { + return ent; + } + static ui32 Prev(ui64 ent) { + return (ent >> 32) - 1; + } + + static ui64 Compose(ui32 prev, ui32 next) { + return ((prev + 1ULL) << 32) | next; + } + }; } diff --git a/library/cpp/codecs/greedy_dict/gd_entry.cpp b/library/cpp/codecs/greedy_dict/gd_entry.cpp index 2c315c7f7c..0603a9fca8 100644 --- a/library/cpp/codecs/greedy_dict/gd_entry.cpp +++ b/library/cpp/codecs/greedy_dict/gd_entry.cpp @@ -5,94 +5,94 @@ #include <util/generic/singleton.h> namespace NGreedyDict { - class TAlphas { - char Memory[512]; - - public: - TStringBufs Alphas; - - TAlphas() { - for (ui32 i = 0; i < 256; ++i) { - Memory[2 * i] = (char)i; - Memory[2 * i + 1] = 0; - - Alphas.push_back(TStringBuf(&Memory[2 * i], 1)); - } + class TAlphas { + char Memory[512]; + + public: + TStringBufs Alphas; + + TAlphas() { + for (ui32 i = 0; i < 256; ++i) { + Memory[2 * i] = (char)i; + Memory[2 * i + 1] = 0; + + Alphas.push_back(TStringBuf(&Memory[2 * i], 1)); + } + } + }; + + void TEntrySet::InitWithAlpha() { + Pool.ClearKeepFirstChunk(); + const TStringBufs& a = Singleton<TAlphas>()->Alphas; + for (auto it : a) { + Add(it); } - }; - - void TEntrySet::InitWithAlpha() { - Pool.ClearKeepFirstChunk(); - const TStringBufs& a = Singleton<TAlphas>()->Alphas; - for (auto it : a) { - Add(it); - } - BuildHierarchy(); + BuildHierarchy(); } - void TEntrySet::BuildHierarchy() { - Sort(begin(), end(), TEntry::StrLess); + void TEntrySet::BuildHierarchy() { + Sort(begin(), end(), TEntry::StrLess); - TCompactTrieBuilder<char, ui32, TAsIsPacker<ui32>> builder(CTBF_PREFIX_GROUPED); + TCompactTrieBuilder<char, ui32, TAsIsPacker<ui32>> builder(CTBF_PREFIX_GROUPED); - for (iterator it = begin(); it != end(); ++it) { - it->Number = (it - begin()); - TStringBuf suff = it->Str; - size_t len = 0; - ui32 val = 0; + for (iterator it = begin(); it != end(); ++it) { + it->Number = (it - begin()); + TStringBuf suff = it->Str; + size_t len = 0; + ui32 val = 0; if (builder.FindLongestPrefix(suff.data(), suff.size(), &len, &val) && len) { - it->NearestPrefix = val; - } + it->NearestPrefix = val; + } builder.Add(suff.data(), suff.size(), it->Number); } - TBufferOutput bout; - builder.Save(bout); - Trie.Init(TBlob::FromBuffer(bout.Buffer())); + TBufferOutput bout; + builder.Save(bout); + Trie.Init(TBlob::FromBuffer(bout.Buffer())); } - TEntry* TEntrySet::FindPrefix(TStringBuf& str) { - size_t len = 0; - ui32 off = 0; + TEntry* TEntrySet::FindPrefix(TStringBuf& str) { + size_t len = 0; + ui32 off = 0; - if (!Trie.FindLongestPrefix(str, &len, &off)) { - return nullptr; - } + if (!Trie.FindLongestPrefix(str, &len, &off)) { + return nullptr; + } - str.Skip(len); - return &Get(off); + str.Skip(len); + return &Get(off); } - void TEntrySet::SetModelP() { - for (iterator it = begin(); it != end(); ++it) { - TEntry& e = *it; + void TEntrySet::SetModelP() { + for (iterator it = begin(); it != end(); ++it) { + TEntry& e = *it; - if (!e.HasPrefix()) { - e.ModelP = 0; - continue; - } + if (!e.HasPrefix()) { + e.ModelP = 0; + continue; + } - TStringBuf suff = e.Str; - const TEntry& p = Get(e.NearestPrefix); - suff.Skip(p.Len()); + TStringBuf suff = e.Str; + const TEntry& p = Get(e.NearestPrefix); + suff.Skip(p.Len()); - float modelp = float(p.Count + e.Count) / TotalCount; + float modelp = float(p.Count + e.Count) / TotalCount; - while (!!suff) { - TEntry* pp = FindPrefix(suff); - modelp *= float(pp->Count + e.Count) / TotalCount; - } + while (!!suff) { + TEntry* pp = FindPrefix(suff); + modelp *= float(pp->Count + e.Count) / TotalCount; + } - e.ModelP = modelp; + e.ModelP = modelp; } } - void TEntrySet::SetScores(EEntryScore s) { - for (auto& it : *this) { - it.Score = Score(s, it.Len(), it.ModelP, it.Count, TotalCount); - } + void TEntrySet::SetScores(EEntryScore s) { + for (auto& it : *this) { + it.Score = Score(s, it.Len(), it.ModelP, it.Count, TotalCount); + } } } diff --git a/library/cpp/codecs/greedy_dict/gd_entry.h b/library/cpp/codecs/greedy_dict/gd_entry.h index 18b5be0e15..e123c66b4a 100644 --- a/library/cpp/codecs/greedy_dict/gd_entry.h +++ b/library/cpp/codecs/greedy_dict/gd_entry.h @@ -11,93 +11,93 @@ #include <util/memory/pool.h> namespace NGreedyDict { - using TStringBufs = TVector<TStringBuf>; + using TStringBufs = TVector<TStringBuf>; - struct TEntry { - static const i32 NoPrefix = -1; + struct TEntry { + static const i32 NoPrefix = -1; - TStringBuf Str; + TStringBuf Str; - i32 NearestPrefix = NoPrefix; - ui32 Count = 0; - ui32 Number = 0; - float ModelP = 0; - float Score = 0; + i32 NearestPrefix = NoPrefix; + ui32 Count = 0; + ui32 Number = 0; + float ModelP = 0; + float Score = 0; - TEntry(TStringBuf b = TStringBuf(), ui32 cnt = 0) - : Str(b) - , Count(cnt) - { - } + TEntry(TStringBuf b = TStringBuf(), ui32 cnt = 0) + : Str(b) + , Count(cnt) + { + } - bool HasPrefix() const { - return NearestPrefix != NoPrefix; - } - ui32 Len() const { + bool HasPrefix() const { + return NearestPrefix != NoPrefix; + } + ui32 Len() const { return Str.size(); - } + } - static bool StrLess(const TEntry& a, const TEntry& b) { - return a.Str < b.Str; - } - static bool NumberLess(const TEntry& a, const TEntry& b) { - return a.Number < b.Number; - } - static bool ScoreMore(const TEntry& a, const TEntry& b) { - return a.Score > b.Score; - } - }; + static bool StrLess(const TEntry& a, const TEntry& b) { + return a.Str < b.Str; + } + static bool NumberLess(const TEntry& a, const TEntry& b) { + return a.Number < b.Number; + } + static bool ScoreMore(const TEntry& a, const TEntry& b) { + return a.Score > b.Score; + } + }; - class TEntrySet: public TVector<TEntry>, TNonCopyable { - TMemoryPool Pool{8112}; - TCompactTrie<char, ui32, TAsIsPacker<ui32>> Trie; + class TEntrySet: public TVector<TEntry>, TNonCopyable { + TMemoryPool Pool{8112}; + TCompactTrie<char, ui32, TAsIsPacker<ui32>> Trie; - public: - ui32 TotalCount = 0; + public: + ui32 TotalCount = 0; - void InitWithAlpha(); + void InitWithAlpha(); - void Add(TStringBuf a) { + void Add(TStringBuf a) { push_back(TStringBuf(Pool.Append(a.data(), a.size()), a.size())); - } + } - void Add(TStringBuf a, TStringBuf b) { + void Add(TStringBuf a, TStringBuf b) { size_t sz = a.size() + b.size(); - char* p = (char*)Pool.Allocate(sz); + char* p = (char*)Pool.Allocate(sz); memcpy(p, a.data(), a.size()); memcpy(p + a.size(), b.data(), b.size()); - push_back(TStringBuf(p, sz)); - } + push_back(TStringBuf(p, sz)); + } - TEntry& Get(ui32 idx) { - return (*this)[idx]; - } + TEntry& Get(ui32 idx) { + return (*this)[idx]; + } - const TEntry& Get(ui32 idx) const { - return (*this)[idx]; - } + const TEntry& Get(ui32 idx) const { + return (*this)[idx]; + } - void BuildHierarchy(); + void BuildHierarchy(); - // longest prefix - TEntry* FindPrefix(TStringBuf& str); + // longest prefix + TEntry* FindPrefix(TStringBuf& str); - const TEntry* FindPrefix(TStringBuf& str) const { - return ((TEntrySet*)this)->FindPrefix(str); - } + const TEntry* FindPrefix(TStringBuf& str) const { + return ((TEntrySet*)this)->FindPrefix(str); + } - const TEntry* FirstPrefix(const TEntry& e, TStringBuf& suff) { - if (!e.HasPrefix()) - return nullptr; + const TEntry* FirstPrefix(const TEntry& e, TStringBuf& suff) { + if (!e.HasPrefix()) + return nullptr; - const TEntry& p = Get(e.NearestPrefix); - suff = e.Str; + const TEntry& p = Get(e.NearestPrefix); + suff = e.Str; suff.Skip(p.Str.size()); - return &p; - } + return &p; + } - void SetModelP(); - void SetScores(EEntryScore); - }; + void SetModelP(); + void SetScores(EEntryScore); + }; } diff --git a/library/cpp/codecs/greedy_dict/gd_stats.h b/library/cpp/codecs/greedy_dict/gd_stats.h index b63c4c38d2..90f46a0fb9 100644 --- a/library/cpp/codecs/greedy_dict/gd_stats.h +++ b/library/cpp/codecs/greedy_dict/gd_stats.h @@ -1,78 +1,78 @@ #pragma once -#include <util/generic/ymath.h> +#include <util/generic/ymath.h> #include <util/generic/algorithm.h> #include <util/generic/yexception.h> namespace NGreedyDict { - enum EEntryScore { - ES_COUNT, - ES_LEN_COUNT, - ES_SIMPLE, - ES_LEN_SIMPLE, - ES_SOLAR - }; + enum EEntryScore { + ES_COUNT, + ES_LEN_COUNT, + ES_SIMPLE, + ES_LEN_SIMPLE, + ES_SOLAR + }; - enum EEntryStatTest { - EST_NONE = 0, - EST_SIMPLE_NORM = 2 - }; + enum EEntryStatTest { + EST_NONE = 0, + EST_SIMPLE_NORM = 2 + }; - inline float ModelP(ui32 countA, ui32 countB, ui32 total) { - return float(countA) * countB / total / total; - } + inline float ModelP(ui32 countA, ui32 countB, ui32 total) { + return float(countA) * countB / total / total; + } - // P (ab | dependent) - inline float SimpleTest(float modelp, ui32 countAB, ui32 total) { - float realp = float(countAB) / total; - return modelp >= realp ? 0 : (realp - modelp); - } + // P (ab | dependent) + inline float SimpleTest(float modelp, ui32 countAB, ui32 total) { + float realp = float(countAB) / total; + return modelp >= realp ? 0 : (realp - modelp); + } - inline float SolarTest(float modelp, ui32 countAB, ui32 total) { - float realp = float(countAB) / total; - return modelp >= realp ? 0 : (modelp + realp * (log(realp / modelp) - 1)); - } + inline float SolarTest(float modelp, ui32 countAB, ui32 total) { + float realp = float(countAB) / total; + return modelp >= realp ? 0 : (modelp + realp * (log(realp / modelp) - 1)); + } - // P (ab | dependent) / P (ab) - inline float SimpleTestNorm(float modelp, ui32 countAB, ui32 total) { - float realp = float(countAB) / total; - return modelp >= realp ? 0 : (realp - modelp) / realp; - } + // P (ab | dependent) / P (ab) + inline float SimpleTestNorm(float modelp, ui32 countAB, ui32 total) { + float realp = float(countAB) / total; + return modelp >= realp ? 0 : (realp - modelp) / realp; + } - inline float StatTest(EEntryStatTest test, float modelp, ui32 countAB, ui32 total) { - if (!total) { - return 0; - } - switch (test) { - case EST_NONE: - return 1; - case EST_SIMPLE_NORM: - return SimpleTestNorm(modelp, countAB, total); - } - Y_FAIL("no way!"); + inline float StatTest(EEntryStatTest test, float modelp, ui32 countAB, ui32 total) { + if (!total) { + return 0; + } + switch (test) { + case EST_NONE: + return 1; + case EST_SIMPLE_NORM: + return SimpleTestNorm(modelp, countAB, total); + } + Y_FAIL("no way!"); return 0; } - inline float Score(EEntryScore score, ui32 len, float modelp, ui32 count, ui32 total) { - if (!total) { - return 0; - } - ui32 m = 1; - switch (score) { - case ES_LEN_COUNT: - m = len; + inline float Score(EEntryScore score, ui32 len, float modelp, ui32 count, ui32 total) { + if (!total) { + return 0; + } + ui32 m = 1; + switch (score) { + case ES_LEN_COUNT: + m = len; [[fallthrough]]; - case ES_COUNT: - return m * count; - case ES_LEN_SIMPLE: - m = len; + case ES_COUNT: + return m * count; + case ES_LEN_SIMPLE: + m = len; [[fallthrough]]; - case ES_SIMPLE: - return m * SimpleTest(modelp, count, total); - case ES_SOLAR: - return SolarTest(modelp, count, total); - } - Y_FAIL("no way!"); + case ES_SIMPLE: + return m * SimpleTest(modelp, count, total); + case ES_SOLAR: + return SolarTest(modelp, count, total); + } + Y_FAIL("no way!"); return 0; } diff --git a/library/cpp/codecs/greedy_dict/ut/greedy_dict_ut.cpp b/library/cpp/codecs/greedy_dict/ut/greedy_dict_ut.cpp index 679089a11b..e33976d333 100644 --- a/library/cpp/codecs/greedy_dict/ut/greedy_dict_ut.cpp +++ b/library/cpp/codecs/greedy_dict/ut/greedy_dict_ut.cpp @@ -6,11 +6,11 @@ #include <util/generic/ymath.h> class TGreedyDictTest: public TTestBase { - UNIT_TEST_SUITE(TGreedyDictTest); + UNIT_TEST_SUITE(TGreedyDictTest); UNIT_TEST(TestEntrySet) UNIT_TEST(TestBuilder0) UNIT_TEST(TestBuilder) - UNIT_TEST_SUITE_END(); + UNIT_TEST_SUITE_END(); void TestEntrySet() { using namespace NGreedyDict; @@ -120,7 +120,7 @@ class TGreedyDictTest: public TTestBase { } void FillData(NGreedyDict::TStringBufs& data) { - static const char* urls[] = {"http://53.ru/car/motors/foreign/opel/tigra/", "http://abakan.24au.ru/tender/85904/", "http://anm15.gulaig.com/", "http://avto-parts.com/mercedes-benz/mercedes-benz-w220-1998-2005/category-442/category-443/", "http://ballooncousin.co.uk/", "http://benzol.ru/equipment/?id=1211&parent=514", "http://blazingseorank.com/blazing-seo-rank-free-website-analysis-to-increase-rank-and-traffic-450.html", "http://blogblaugrana.contadorwebmasters.com/", "http://bristolhash.org.uk/bh3cntct.php", "http://broker.borovichi.ru/category/item/3/1/0/8/28/257", "http://canoncompactcamerax.blogspot.com/", "http://classifieds.smashits.com/p,107881,email-to-friend.htm", "http://conferences.ksde.org/Portals/132/FallAssessment/SAVETHEDAY-FA09.pdf", "http://eway.vn/raovat/325-dien-tu-gia-dung/337-dieu-hoa/98041-b1-sua-may-lanh-quan-binh-tan-sua-may-lanh-quan-binh-chanh-hh-979676119-toan-quoc.html", "http://gallery.e2bn.org/asset73204_8-.html", "http://goplay.nsw.gov.au/activities-for-kids/by/historic-houses-trust/?startdate=2012-07-10", "http://grichards19067.multiply.com/", "http://hotkovo.egent.ru/user/89262269084/", "http://howimetyourself.com/?redirect_to=http://gomiso.com/m/suits/seasons/2/episodes/2", "http://islamqa.com/hi/ref/9014/DEAD%20PEOPLE%20GOOD%20DEEDS", "http://lapras.rutube.ru/", "http://nceluiko.ya.ru/", "http://nyanyanyanyaa.beon.ru/", "http://ozbo.com/Leaf-River-DV-7SS-7-0-MP-Game-Camera-K1-32541.html", "http://sbantom.ru/catalog/chasy/632753.html", "http://shopingoff.com/index.php?option=com_virtuemart&Itemid=65&category_id=&page=shop.browse&manufacturer_id=122&limit=32&limitstart=96", "http://shopingoff.com/katalog-odezhdy/manufacturer/62-christian-audigier.html?limit=32&start=448", "https://webwinkel.ah.nl/process?fh_location=//ecommerce/nl_NL/categories%3C%7Becommerce_shoc1%7D/it_show_product_code_1384%3E%7B10%3B20%7D/pr_startdate%3C20120519/pr_enddate%3E20120519/pr_ltc_allowed%3E%7Bbowi%7D/categories%3C%7Becommerce_shoc1_1al%7D/categories%3C%7Becommerce_shoc1_1al_1ahal%7D&&action=albert_noscript.modules.build", "http://top100.rambler.ru/navi/?theme=208/210/371&rgn=17", "http://volgogradskaya-oblast.extra-m.ru/classifieds/rabota/vakansii/banki-investicii/901467/", "http://wikien4.appspot.com/wiki/Warburg_hypothesis", "http://wola_baranowska.kamerzysta24.com.pl/", "http://www.10dot0dot0dot1.com/", "http://www.anima-redux.ru/index.php?key=gifts+teenage+girls", "http://www.aquaticabyseaworld.com/Calendar.aspx/CP/CP/CP/sp-us/CP/CP/ParkMap/Tickets/Weather.aspx", "http://www.autousa.com/360spin/2012_cadillac_ctssportwagon_3.6awdpremiumcollection.htm", "http://www.booking.com/city/gb/paignton-aireborough.html?inac=0&lang=pl", "http://www.booking.com/city/it/vodo-cadore.en.html", "http://www.booking.com/district/us/new-york/rockefeller-center.html&lang=no", "http://www.booking.com/hotel/bg/crown-fort-club.lv.html", "http://www.booking.com/hotel/ca/gouverneur-rimouski.ar.html", "http://www.booking.com/hotel/ch/l-auberge-du-chalet-a-gobet.fi.html", "http://www.booking.com/hotel/de/mark-garni.ru.html?aid=337384;label=yandex-hotel-mark-garni-68157-%7Bparam1%7D", "http://www.booking.com/hotel/de/mercure-goldschmieding-castrop-rauxel.ro.html", "http://www.booking.com/hotel/de/zollenspieker-fahrhaus.fr.html", "http://www.booking.com/hotel/es/jardin-metropolitano.ca.html", "http://www.booking.com/hotel/fr/clim.fr.html", "http://www.booking.com/hotel/fr/radisson-sas-toulouse-airport.et.html", "http://www.booking.com/hotel/gb/stgileshotel.ro.html?srfid=68c7fe42a03653a8796c84435c5299e4X16?tab=4", "http://www.booking.com/hotel/gr/rodos-park-suites.ru.html", "http://www.booking.com/hotel/id/le-grande-suites-bali.ru.html", "http://www.booking.com/hotel/it/mozart.it.html?aid=321655", "http://www.booking.com/hotel/ni/bahia-del-sol-villas.ru.html?dcid=1;dva=0", "http://www.booking.com/hotel/nl/cpschiphol.ro.html.ro.html?tab=4", "http://www.booking.com/hotel/th/laem-din.en-gb.html", "http://www.booking.com/hotel/th/tinidee-ranong.en.html", "http://www.booking.com/hotel/us/best-western-plus-merrimack-valley.hu.html", "http://www.booking.com/hotel/vn/tan-hai-long.km.html", "http://www.booking.com/landmark/au/royal-brisbane-women-s-hospital.vi.html", "http://www.booking.com/landmark/hk/nam-cheong-station.html&lang=id", "http://www.booking.com/landmark/it/spanish-steps.ca.html", "http://www.booking.com/landmark/sg/asian-civilisations-museum.html&lang=fi", "http://www.booking.com/place/fi-1376029.pt.html", "http://www.booking.com/place/tn257337.pl.html", "http://www.booking.com/region/ca/niagarafalls.ar.html&selected_currency=PLN", "http://www.booking.com/region/mx/queretaro.pt-pt.html&selected_currency=AUD", "http://www.booking.com/searchresults.en.html?city=20063074", "http://www.booking.com/searchresults.et.html?checkin=;checkout=;city=-394632", "http://www.booking.com/searchresults.lv.html?region=3936", "http://www.cevredanismanlari.com/index.php/component/k2/index.php/mevzuat/genel-yazlar/item/dosyalar/index.php?option=com_k2&view=item&id=16:iso-14001-%C3%A7evre-y%C3%B6netim-sistemi&Itemid=132&limitstart=107120", "http://www.dh-wholesaler.com/MENS-POLO-RACING-TEE-RL-p-417.html", "http://www.employabilityonline.net/", "http://www.esso.inc.ru/board/tools.php?event=profile&pname=Invinerrq", "http://www.filesurgery.ru/searchfw/kids_clothes-3.html", "http://www.furnitureandcarpetsource.com/Item.aspx?ItemID=-2107311899&ItemNum=53-T3048", "http://www.gets.cn/product/Gold-Sand-Lampwork-Glass-Beads--Flat-round--28x28x13mm_p260717.html", "http://www.gets.cn/wholesale-Sterling-Silver-Pendant-Findings-3577_S--L-Star-P-1.html?view=1&by=1", "http://www.homeandgardenadvice.com/diy/Mortgages_Loans_and_Financing/9221.html", "http://www.hongkongairport.com/eng/index.html/passenger/passenger/transport/to-from-airport/business/about-the-airport/transport/shopping/entertainment/t2/passenger/interactive-map.html", "http://www.hongkongairport.com/eng/index.html/shopping/insideshopping/all/passenger/transfer-transit/all/airline-information/shopping/entertainment/t2/business/about-the-airport/welcome.html", "http://www.hongkongairport.com/eng/index.html/transport/business/about-the-airport/transport/business/airport-authority/passenger/shopping/dining/all/dining.html", "http://www.idedge.com/index.cfm/fuseaction/category.display/category_id/298/index.cfm", "http://www.istanbulburda.com/aramalar.php", "http://www.jewelryinthenet.com/ads/AdDetail.aspx?AdID=1-0311002490689&stid=22-0111001020877", "http://www.johnnydepp.ru/forum/index.php?showtopic=1629&mode=linearplus&view=findpost&p=186977", "http://www.johnnydepp.ru/forum/index.php?showtopic=476&st=60&p=87379&", "http://www.joseleano.com/joomla/index.php/audio", "http://www.kaplicarehberi.com/tag/sakar-ilicali-kaplicalari/feed", "http://www.khaber.com.tr/arama.html?key=%C3%A7avdar", "http://www.kiz-oyunlari1.com/1783/4437/4363/1056/4170/Bump-Copter2-.html", "http://www.kiz-oyunlari1.com/3752/2612/4175/1166/3649/1047/Angelina-Oyunu.html", "http://www.kiz-oyunlari1.com/4266/3630/3665/3286/4121/301/3274/Sinir-Sinekler-.html", "http://www.kuldiga.lv/index.php?f=8&cat=371", "http://www.kuldiga.lv/index.php/img/index.php?l=lv&art_id=1836&show_c=&cat=85", "http://www.patronessa.ru/remontiruemsya/kuzovnie30raboti.html", "http://www.rapdict.org/Nu_Money?title=Talk:Nu_Money&action=edit", "http://www.serafin-phu.tabor24.com/?page=8", "http://www.shoes-store.org/brand1/Kids/Minnetonka.html", "http://www.shoes-store.org/shoes-store.xml", "http://www.way2allah.com/khotab-download-34695.htm"}; + static const char* urls[] = {"http://53.ru/car/motors/foreign/opel/tigra/", "http://abakan.24au.ru/tender/85904/", "http://anm15.gulaig.com/", "http://avto-parts.com/mercedes-benz/mercedes-benz-w220-1998-2005/category-442/category-443/", "http://ballooncousin.co.uk/", "http://benzol.ru/equipment/?id=1211&parent=514", "http://blazingseorank.com/blazing-seo-rank-free-website-analysis-to-increase-rank-and-traffic-450.html", "http://blogblaugrana.contadorwebmasters.com/", "http://bristolhash.org.uk/bh3cntct.php", "http://broker.borovichi.ru/category/item/3/1/0/8/28/257", "http://canoncompactcamerax.blogspot.com/", "http://classifieds.smashits.com/p,107881,email-to-friend.htm", "http://conferences.ksde.org/Portals/132/FallAssessment/SAVETHEDAY-FA09.pdf", "http://eway.vn/raovat/325-dien-tu-gia-dung/337-dieu-hoa/98041-b1-sua-may-lanh-quan-binh-tan-sua-may-lanh-quan-binh-chanh-hh-979676119-toan-quoc.html", "http://gallery.e2bn.org/asset73204_8-.html", "http://goplay.nsw.gov.au/activities-for-kids/by/historic-houses-trust/?startdate=2012-07-10", "http://grichards19067.multiply.com/", "http://hotkovo.egent.ru/user/89262269084/", "http://howimetyourself.com/?redirect_to=http://gomiso.com/m/suits/seasons/2/episodes/2", "http://islamqa.com/hi/ref/9014/DEAD%20PEOPLE%20GOOD%20DEEDS", "http://lapras.rutube.ru/", "http://nceluiko.ya.ru/", "http://nyanyanyanyaa.beon.ru/", "http://ozbo.com/Leaf-River-DV-7SS-7-0-MP-Game-Camera-K1-32541.html", "http://sbantom.ru/catalog/chasy/632753.html", "http://shopingoff.com/index.php?option=com_virtuemart&Itemid=65&category_id=&page=shop.browse&manufacturer_id=122&limit=32&limitstart=96", "http://shopingoff.com/katalog-odezhdy/manufacturer/62-christian-audigier.html?limit=32&start=448", "https://webwinkel.ah.nl/process?fh_location=//ecommerce/nl_NL/categories%3C%7Becommerce_shoc1%7D/it_show_product_code_1384%3E%7B10%3B20%7D/pr_startdate%3C20120519/pr_enddate%3E20120519/pr_ltc_allowed%3E%7Bbowi%7D/categories%3C%7Becommerce_shoc1_1al%7D/categories%3C%7Becommerce_shoc1_1al_1ahal%7D&&action=albert_noscript.modules.build", "http://top100.rambler.ru/navi/?theme=208/210/371&rgn=17", "http://volgogradskaya-oblast.extra-m.ru/classifieds/rabota/vakansii/banki-investicii/901467/", "http://wikien4.appspot.com/wiki/Warburg_hypothesis", "http://wola_baranowska.kamerzysta24.com.pl/", "http://www.10dot0dot0dot1.com/", "http://www.anima-redux.ru/index.php?key=gifts+teenage+girls", "http://www.aquaticabyseaworld.com/Calendar.aspx/CP/CP/CP/sp-us/CP/CP/ParkMap/Tickets/Weather.aspx", "http://www.autousa.com/360spin/2012_cadillac_ctssportwagon_3.6awdpremiumcollection.htm", "http://www.booking.com/city/gb/paignton-aireborough.html?inac=0&lang=pl", "http://www.booking.com/city/it/vodo-cadore.en.html", "http://www.booking.com/district/us/new-york/rockefeller-center.html&lang=no", "http://www.booking.com/hotel/bg/crown-fort-club.lv.html", "http://www.booking.com/hotel/ca/gouverneur-rimouski.ar.html", "http://www.booking.com/hotel/ch/l-auberge-du-chalet-a-gobet.fi.html", "http://www.booking.com/hotel/de/mark-garni.ru.html?aid=337384;label=yandex-hotel-mark-garni-68157-%7Bparam1%7D", "http://www.booking.com/hotel/de/mercure-goldschmieding-castrop-rauxel.ro.html", "http://www.booking.com/hotel/de/zollenspieker-fahrhaus.fr.html", "http://www.booking.com/hotel/es/jardin-metropolitano.ca.html", "http://www.booking.com/hotel/fr/clim.fr.html", "http://www.booking.com/hotel/fr/radisson-sas-toulouse-airport.et.html", "http://www.booking.com/hotel/gb/stgileshotel.ro.html?srfid=68c7fe42a03653a8796c84435c5299e4X16?tab=4", "http://www.booking.com/hotel/gr/rodos-park-suites.ru.html", "http://www.booking.com/hotel/id/le-grande-suites-bali.ru.html", "http://www.booking.com/hotel/it/mozart.it.html?aid=321655", "http://www.booking.com/hotel/ni/bahia-del-sol-villas.ru.html?dcid=1;dva=0", "http://www.booking.com/hotel/nl/cpschiphol.ro.html.ro.html?tab=4", "http://www.booking.com/hotel/th/laem-din.en-gb.html", "http://www.booking.com/hotel/th/tinidee-ranong.en.html", "http://www.booking.com/hotel/us/best-western-plus-merrimack-valley.hu.html", "http://www.booking.com/hotel/vn/tan-hai-long.km.html", "http://www.booking.com/landmark/au/royal-brisbane-women-s-hospital.vi.html", "http://www.booking.com/landmark/hk/nam-cheong-station.html&lang=id", "http://www.booking.com/landmark/it/spanish-steps.ca.html", "http://www.booking.com/landmark/sg/asian-civilisations-museum.html&lang=fi", "http://www.booking.com/place/fi-1376029.pt.html", "http://www.booking.com/place/tn257337.pl.html", "http://www.booking.com/region/ca/niagarafalls.ar.html&selected_currency=PLN", "http://www.booking.com/region/mx/queretaro.pt-pt.html&selected_currency=AUD", "http://www.booking.com/searchresults.en.html?city=20063074", "http://www.booking.com/searchresults.et.html?checkin=;checkout=;city=-394632", "http://www.booking.com/searchresults.lv.html?region=3936", "http://www.cevredanismanlari.com/index.php/component/k2/index.php/mevzuat/genel-yazlar/item/dosyalar/index.php?option=com_k2&view=item&id=16:iso-14001-%C3%A7evre-y%C3%B6netim-sistemi&Itemid=132&limitstart=107120", "http://www.dh-wholesaler.com/MENS-POLO-RACING-TEE-RL-p-417.html", "http://www.employabilityonline.net/", "http://www.esso.inc.ru/board/tools.php?event=profile&pname=Invinerrq", "http://www.filesurgery.ru/searchfw/kids_clothes-3.html", "http://www.furnitureandcarpetsource.com/Item.aspx?ItemID=-2107311899&ItemNum=53-T3048", "http://www.gets.cn/product/Gold-Sand-Lampwork-Glass-Beads--Flat-round--28x28x13mm_p260717.html", "http://www.gets.cn/wholesale-Sterling-Silver-Pendant-Findings-3577_S--L-Star-P-1.html?view=1&by=1", "http://www.homeandgardenadvice.com/diy/Mortgages_Loans_and_Financing/9221.html", "http://www.hongkongairport.com/eng/index.html/passenger/passenger/transport/to-from-airport/business/about-the-airport/transport/shopping/entertainment/t2/passenger/interactive-map.html", "http://www.hongkongairport.com/eng/index.html/shopping/insideshopping/all/passenger/transfer-transit/all/airline-information/shopping/entertainment/t2/business/about-the-airport/welcome.html", "http://www.hongkongairport.com/eng/index.html/transport/business/about-the-airport/transport/business/airport-authority/passenger/shopping/dining/all/dining.html", "http://www.idedge.com/index.cfm/fuseaction/category.display/category_id/298/index.cfm", "http://www.istanbulburda.com/aramalar.php", "http://www.jewelryinthenet.com/ads/AdDetail.aspx?AdID=1-0311002490689&stid=22-0111001020877", "http://www.johnnydepp.ru/forum/index.php?showtopic=1629&mode=linearplus&view=findpost&p=186977", "http://www.johnnydepp.ru/forum/index.php?showtopic=476&st=60&p=87379&", "http://www.joseleano.com/joomla/index.php/audio", "http://www.kaplicarehberi.com/tag/sakar-ilicali-kaplicalari/feed", "http://www.khaber.com.tr/arama.html?key=%C3%A7avdar", "http://www.kiz-oyunlari1.com/1783/4437/4363/1056/4170/Bump-Copter2-.html", "http://www.kiz-oyunlari1.com/3752/2612/4175/1166/3649/1047/Angelina-Oyunu.html", "http://www.kiz-oyunlari1.com/4266/3630/3665/3286/4121/301/3274/Sinir-Sinekler-.html", "http://www.kuldiga.lv/index.php?f=8&cat=371", "http://www.kuldiga.lv/index.php/img/index.php?l=lv&art_id=1836&show_c=&cat=85", "http://www.patronessa.ru/remontiruemsya/kuzovnie30raboti.html", "http://www.rapdict.org/Nu_Money?title=Talk:Nu_Money&action=edit", "http://www.serafin-phu.tabor24.com/?page=8", "http://www.shoes-store.org/brand1/Kids/Minnetonka.html", "http://www.shoes-store.org/shoes-store.xml", "http://www.way2allah.com/khotab-download-34695.htm"}; data.clear(); data.insert(data.begin(), urls, urls + Y_ARRAY_SIZE(urls)); } @@ -128,7 +128,7 @@ class TGreedyDictTest: public TTestBase { typedef THashMap<TStringBuf, NGreedyDict::TEntry> TDict; TAutoPtr<NGreedyDict::TEntrySet> DoTestBuilder(const NGreedyDict::TBuildSettings& s, - TDict& res) { + TDict& res) { using namespace NGreedyDict; TStringBufs data; |