contrib/libs/clang16-rt/lib/sanitizer_common/sanitizer_lzw.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159

//===-- sanitizer_lzw.h -----------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Lempel–Ziv–Welch encoding/decoding
//
//===----------------------------------------------------------------------===//

#ifndef SANITIZER_LZW_H
#define SANITIZER_LZW_H

#include "sanitizer_dense_map.h"

namespace __sanitizer {

using LzwCodeType = u32;

template <class T, class ItIn, class ItOut>
ItOut LzwEncode(ItIn begin, ItIn end, ItOut out) {
  using Substring =
      detail::DenseMapPair<LzwCodeType /* Prefix */, T /* Next input */>;

  // Sentinel value for substrings of len 1.
  static constexpr LzwCodeType kNoPrefix =
      Min(DenseMapInfo<Substring>::getEmptyKey().first,
          DenseMapInfo<Substring>::getTombstoneKey().first) -
      1;
  DenseMap<Substring, LzwCodeType> prefix_to_code;
  {
    // Add all substring of len 1 as initial dictionary.
    InternalMmapVector<T> dict_len1;
    for (auto it = begin; it != end; ++it)
      if (prefix_to_code.try_emplace({kNoPrefix, *it}, 0).second)
        dict_len1.push_back(*it);

    // Slightly helps with later delta encoding.
    Sort(dict_len1.data(), dict_len1.size());

    // For large sizeof(T) we have to store dict_len1. Smaller types like u8 can
    // just generate them.
    *out = dict_len1.size();
    ++out;

    for (uptr i = 0; i != dict_len1.size(); ++i) {
      // Remap after the Sort.
      prefix_to_code[{kNoPrefix, dict_len1[i]}] = i;
      *out = dict_len1[i];
      ++out;
    }
    CHECK_EQ(prefix_to_code.size(), dict_len1.size());
  }

  if (begin == end)
    return out;

  // Main LZW encoding loop.
  LzwCodeType match = prefix_to_code.find({kNoPrefix, *begin})->second;
  ++begin;
  for (auto it = begin; it != end; ++it) {
    // Extend match with the new item.
    auto ins = prefix_to_code.try_emplace({match, *it}, prefix_to_code.size());
    if (ins.second) {
      // This is a new substring, but emit the code for the current match
      // (before extend). This allows LZW decoder to recover the dictionary.
      *out = match;
      ++out;
      // Reset the match to a single item, which must be already in the map.
      match = prefix_to_code.find({kNoPrefix, *it})->second;
    } else {
      // Already known, use as the current match.
      match = ins.first->second;
    }
  }

  *out = match;
  ++out;

  return out;
}

template <class T, class ItIn, class ItOut>
ItOut LzwDecode(ItIn begin, ItIn end, ItOut out) {
  if (begin == end)
    return out;

  // Load dictionary of len 1 substrings. Theses correspont to lowest codes.
  InternalMmapVector<T> dict_len1(*begin);
  ++begin;

  if (begin == end)
    return out;

  for (auto& v : dict_len1) {
    v = *begin;
    ++begin;
  }

  // Substrings of len 2 and up. Indexes are shifted because [0,
  // dict_len1.size()) stored in dict_len1. Substings get here after being
  // emitted to the output, so we can use output position.
  InternalMmapVector<detail::DenseMapPair<ItOut /* begin. */, ItOut /* end */>>
      code_to_substr;

  // Copies already emitted substrings into the output again.
  auto copy = [&code_to_substr, &dict_len1](LzwCodeType code, ItOut out) {
    if (code < dict_len1.size()) {
      *out = dict_len1[code];
      ++out;
      return out;
    }
    const auto& s = code_to_substr[code - dict_len1.size()];

    for (ItOut it = s.first; it != s.second; ++it, ++out) *out = *it;
    return out;
  };

  // Returns lens of the substring with the given code.
  auto code_to_len = [&code_to_substr, &dict_len1](LzwCodeType code) -> uptr {
    if (code < dict_len1.size())
      return 1;
    const auto& s = code_to_substr[code - dict_len1.size()];
    return s.second - s.first;
  };

  // Main LZW decoding loop.
  LzwCodeType prev_code = *begin;
  ++begin;
  out = copy(prev_code, out);
  for (auto it = begin; it != end; ++it) {
    LzwCodeType code = *it;
    auto start = out;
    if (code == dict_len1.size() + code_to_substr.size()) {
      // Special LZW case. The code is not in the dictionary yet. This is
      // possible only when the new substring is the same as previous one plus
      // the first item of the previous substring. We can emit that in two
      // steps.
      out = copy(prev_code, out);
      *out = *start;
      ++out;
    } else {
      out = copy(code, out);
    }

    // Every time encoded emits the code, it also creates substing of len + 1
    // including the first item of the just emmited substring. Do the same here.
    uptr len = code_to_len(prev_code);
    code_to_substr.push_back({start - len, start + 1});

    prev_code = code;
  }
  return out;
}

}  // namespace __sanitizer
#endif