// path: root/contrib/libs/clang16/tools/extra/clang-tidy/misc/ConfusableTable/BuildConfusableTable.cpp
// blob: 9fd718c270e5982644a41545fa2e3c4e7e5dd813
//===--- BuildConfusableTable.cpp - clang-tidy---------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/raw_ostream.h"

#include <algorithm>
#include <cstddef>
#include <memory>
#include <system_error>
#include <utility>
#include <vector>

using namespace llvm;

int main(int argc, char *argv[]) {
  auto ErrorOrBuffer = MemoryBuffer::getFile(argv[1], true);
  if (!ErrorOrBuffer)
    return 1;
  std::unique_ptr<MemoryBuffer> Buffer = std::move(ErrorOrBuffer.get());
  StringRef Content = Buffer->getBuffer();
  Content = Content.drop_until([](char c) { return c == '#'; });
  SmallVector<StringRef> Lines;
  SplitString(Content, Lines, "\r\n");

  std::vector<std::pair<llvm::UTF32, SmallVector<llvm::UTF32>>> Entries;
  SmallVector<StringRef> Values;
  for (StringRef Line : Lines) {
    if (Line.startswith("#"))
      continue;

    Values.clear();
    Line.split(Values, ';');
    if (Values.size() < 2) {
      errs() << "Failed to parse: " << Line << "\n";
      return 2;
    }

    llvm::StringRef From = Values[0].trim();
    llvm::UTF32 CodePoint;
    From.getAsInteger(16, CodePoint);

    SmallVector<llvm::UTF32> To;
    SmallVector<StringRef> ToN;
    Values[1].split(ToN, ' ', -1, false);
    for (StringRef To_ : ToN) {
      llvm::UTF32 ToCodePoint;
      To_.trim().getAsInteger(16, ToCodePoint);
      To.push_back(ToCodePoint);
    }
    // Sentinel
    To.push_back(0);

    Entries.emplace_back(CodePoint, To);
  }
  llvm::sort(Entries);

  unsigned LargestValue =
      std::max_element(Entries.begin(), Entries.end(),
                       [](const auto &Entry0, const auto &Entry1) {
                         return Entry0.second.size() < Entry1.second.size();
                       })
          ->second.size();

  std::error_code ec;
  llvm::raw_fd_ostream os(argv[2], ec);

  // FIXME: If memory consumption and/or lookup time becomes a constraint, it
  // maybe worth using a more elaborate data structure.
  os << "struct {llvm::UTF32 codepoint; llvm::UTF32 values[" << LargestValue
     << "];} "
        "ConfusableEntries[] = {\n";
  for (const auto &Values : Entries) {
    os << "  { ";
    os << Values.first;
    os << ", {";
    for (auto CP : Values.second)
      os << CP << ", ";

    os << "}},\n";
  }
  os << "};\n";
  return 0;
}