aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/libs/llvm12/lib/Support/TrigramIndex.cpp
blob: 717e5e093d98c56bc9ab45564de3205895d428a7 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
//===-- TrigramIndex.cpp - a heuristic for SpecialCaseList ----------------===// 
// 
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 
// 
//===----------------------------------------------------------------------===// 
// 
// TrigramIndex implements a heuristic for SpecialCaseList that filters out
// ~99% of incoming queries when all regular expressions in the
// SpecialCaseList are simple wildcards with '*' and '.'. If the rules are more
// complicated, the check is defeated and every query is passed on to the
// full regex.
// 
//===----------------------------------------------------------------------===// 
 
#include "llvm/Support/TrigramIndex.h" 
#include <set> 
 
using namespace llvm; 
 
// Regex metacharacters that the trigram heuristic cannot reason about. The
// simple wildcards '.' and '*' are deliberately absent: insert() handles them
// itself.
static const char RegexAdvancedMetachars[] = "()^$|+?[]\\{}";

/// Returns true if \p Char is one of the characters listed in
/// RegexAdvancedMetachars.
///
/// The table is scanned explicitly instead of using strchr() for two reasons:
///  * strchr() treats the terminating NUL as part of the searched string, so a
///    NUL character would be misreported as a metacharacter;
///  * strchr() narrows its argument to char, so a value outside the char range
///    (e.g. 0x100 + '(') would alias a real metacharacter.
static bool isAdvancedMetachar(unsigned Char) {
  // Stop before the terminator so that Char == 0 never matches.
  for (const char *Meta = RegexAdvancedMetachars; *Meta != '\0'; ++Meta)
    if (Char == static_cast<unsigned char>(*Meta))
      return true;
  return false;
}
 
/// Registers one rule (\p Regex) in the trigram index.
///
/// The rule is scanned character by character. Literal runs between the
/// wildcards '.' and '*' are cut into trigrams, and each distinct trigram is
/// recorded in Index against the rule's number (its future position in
/// Counts). The number of trigram occurrences accepted for the rule is then
/// appended to Counts.
///
/// The index is marked Defeated, and nothing more is recorded, when the rule
/// contains an unescaped advanced metacharacter, contains a backslash followed
/// by a digit 1-9, or yields no accepted trigram at all. Once Defeated, further
/// calls return immediately.
void TrigramIndex::insert(const std::string &Regex) {
  if (Defeated)
    return;

  // Trigrams already attributed to this rule; each is indexed at most once.
  std::set<unsigned> SeenTrigrams;
  // Number of trigram occurrences accepted for this rule.
  unsigned NumAccepted = 0;
  // Rolling window holding the last (up to) three literal characters.
  unsigned Window = 0;
  // Length of the current literal run since the last wildcard.
  unsigned RunLength = 0;
  // True when the previous character was an unescaped backslash.
  bool AfterBackslash = false;

  for (unsigned Ch : Regex) {
    if (AfterBackslash) {
      // A backslash followed by 1-9 is not a plain literal, so give up.
      if (Ch >= '1' && Ch <= '9') {
        Defeated = true;
        return;
      }
      // Any other escaped character is taken literally.
      AfterBackslash = false;
    } else if (Ch == '\\') {
      // Regular expressions escape a symbol by preceding it with '\'.
      AfterBackslash = true;
      continue;
    } else if (isAdvancedMetachar(Ch)) {
      // This regex is more complicated than the heuristic can handle.
      Defeated = true;
      return;
    } else if (Ch == '.' || Ch == '*') {
      // A wildcard breaks the literal run; start collecting afresh.
      Window = 0;
      RunLength = 0;
      continue;
    }

    Window = ((Window << 8) + Ch) & 0xFFFFFF;
    if (++RunLength < 3)
      continue;

    // Popular trigrams are weak signals, so do not let their buckets grow
    // beyond four rules. Rules already listed there still require them,
    // which only costs a little extra computation.
    auto &Bucket = Index[Window];
    if (Bucket.size() >= 4)
      continue;

    ++NumAccepted;
    // Add this rule to the bucket the first time it produces the trigram.
    if (SeenTrigrams.insert(Window).second)
      Bucket.push_back(Counts.size());
  }

  if (NumAccepted == 0) {
    // The rule has no remarkable trigrams to rely on, so the full regex
    // chain must always be consulted.
    Defeated = true;
    return;
  }
  Counts.push_back(NumAccepted);
}
 
/// Returns true when \p Query cannot match any indexed rule, judging by its
/// trigrams alone, and false when a full regex check is still required.
///
/// Always returns false once the index has been Defeated.
bool TrigramIndex::isDefinitelyOut(StringRef Query) const {
  if (Defeated)
    return false;

  // Per-rule tally of indexed trigrams seen in the query so far.
  std::vector<unsigned> Hits(Counts.size());
  // Rolling window over the last three query characters.
  unsigned Window = 0;
  size_t Pos = 0;

  for (char C : Query) {
    Window = ((Window << 8) + C) & 0xFFFFFF;
    // The window holds a complete trigram only from the third character on.
    if (Pos++ < 2)
      continue;

    auto Found = Index.find(Window);
    if (Found == Index.end())
      continue;

    for (size_t Rule : Found->second) {
      // Once a rule's tally reaches its recorded count, the query has to be
      // examined more closely by running the full regex.
      if (++Hits[Rule] >= Counts[Rule])
        return false;
    }
  }
  return true;
}