diff options
| author | YDBot <[email protected]> | 2026-05-30 05:58:08 +0000 |
|---|---|---|
| committer | YDBot <[email protected]> | 2026-05-30 05:58:08 +0000 |
| commit | b97d60a64450581ebafda99bfc2b31b87db63041 (patch) | |
| tree | 2c9a022cf9d8e51e706f270820cd93499a98d2d5 /contrib/python/fonttools/fontTools/misc/iftSparseBitSet.py | |
| parent | c0e1f899615cd640410ccc197e35a94f131c9059 (diff) | |
| parent | dc0819dcad0731a4598752bb04f56b2f732310d1 (diff) | |
Merge pull request #41927 from ydb-platform/merge-rightlib-260530-0121
Diffstat (limited to 'contrib/python/fonttools/fontTools/misc/iftSparseBitSet.py')
| -rw-r--r-- | contrib/python/fonttools/fontTools/misc/iftSparseBitSet.py | 311 |
1 files changed, 311 insertions, 0 deletions
diff --git a/contrib/python/fonttools/fontTools/misc/iftSparseBitSet.py b/contrib/python/fonttools/fontTools/misc/iftSparseBitSet.py new file mode 100644 index 00000000000..fc0b2ba2156 --- /dev/null +++ b/contrib/python/fonttools/fontTools/misc/iftSparseBitSet.py @@ -0,0 +1,311 @@ +"""Sparse Bit Set encoding/decoding for IFT (Incremental Font Transfer). + +Implements the sparse bit set format defined in the W3C IFT specification: +https://w3c.github.io/IFT/Overview.html#sparse-bit-set-decoding +""" + +# Reference: https://github.com/googlefonts/fontations/blob/main/read-fonts/src/collections/int_set/sparse_bit_set.rs + + +import bisect +from collections import deque +from typing import Dict, Iterable, Optional, Set, Tuple + +# Maximum tree height that fits within a 32-bit leaf node for each branching factor. +_BF_MAX_HEIGHT: Dict[int, int] = {2: 31, 4: 16, 8: 11, 32: 7} + + +class SparseBitSetDecodeError(Exception): + pass + + +def decode( + data: bytes, bias: int = 0, maxValue: int = 0xFFFFFFFF +) -> Tuple[Set[int], int]: + """Decode a sparse bit set from binary data. + + Args: + data: bytes-like object containing the sparse bit set encoding. + bias: integer added to each decoded value. + + Returns: + A tuple (values, bytesConsumed) where values is a set of integers + and bytesConsumed is the number of bytes read from data. + """ + if not data: + raise SparseBitSetDecodeError("Empty data") + + branchFactor, height = _decodeHeader(data[0]) + + maxHeight = _BF_MAX_HEIGHT[branchFactor] + if height > maxHeight: + raise SparseBitSetDecodeError( + f"Height {height} exceeds max {maxHeight} for branch factor {branchFactor}" + ) + + return _decodeImpl(data, branchFactor, height, bias, maxValue) + + +def _decodeImpl( + data: bytes, branchFactor: int, height: int, bias: int, maxValue: int +) -> Tuple[Set[int], int]: + if height == 0: + # 1 byte was used for the header. + return (set(), 1) + + bitStream = _InputBitStream(data, branchFactor) + result: Set[int] = set() + # Queue entries are (startValue, depth), where startValue is the first + # integer that could be covered by this node. + queue: deque[Tuple[int, int]] = deque() + queue.append((0, 1)) + + while queue: + start, depth = queue.popleft() + bits = bitStream.next() + if bits is None: + raise SparseBitSetDecodeError("Unexpected end of data") + + # all bits were were zero which is a special command to completely fill + # in all integers covered by this node. + if bits == 0: + exp = height - depth + 1 + nodeSize = branchFactor**exp + fillStart = start + bias + if fillStart > maxValue: + continue + fillEnd = min(maxValue, start + nodeSize - 1 + bias) + if fillStart < 0: + fillStart = 0 + if fillStart <= fillEnd: + result.update(range(fillStart, fillEnd + 1)) + continue + + # Non-zero node: each set bit identifies a child/value. + exp = height - depth + nextNodeSize = branchFactor**exp + while True: + bitIndex = _trailingZeros(bits, 32) + if bitIndex == 32: + break + if depth == height: + val = start + bitIndex + bias + if val > maxValue: + queue.clear() + break + if val >= 0: + result.add(val) + else: + startDelta = bitIndex * nextNodeSize + queue.append((start + startDelta, depth + 1)) + bits &= ~(1 << bitIndex) + + return (result, bitStream.bytesConsumed()) + + +def encode(values: Iterable[int]) -> bytes: + """Encode a set of integers as a sparse bit set. + + Tries all branching factors and returns the shortest encoding. + + Args: + values: iterable of non-negative integers. + + Returns: + bytes containing the sparse bit set encoding. + """ + valuesSorted = sorted(set(values)) + if not valuesSorted: + return _encodeHeader(2, 0) + + maxValue = valuesSorted[-1] + valueSet = set(valuesSorted) + + best: Optional[bytes] = None + for branchFactor in (2, 4, 8, 32): + height = _treeHeight(branchFactor, maxValue) + if height > _BF_MAX_HEIGHT[branchFactor]: + continue + encoded = _encodeWithBf(valueSet, branchFactor, height) + if best is None or len(encoded) < len(best): + best = encoded + + if best is None: + raise ValueError(f"Cannot encode max value {maxValue}") + return best + + +def _encodeHeader(branchFactor: int, height: int) -> bytes: + branchFactorToId = {2: 0, 4: 1, 8: 2, 32: 3} + return bytes([(height << 2) | branchFactorToId[branchFactor]]) + + +def _decodeHeader(headerByte: int) -> Tuple[int, int]: + id = headerByte & 0x03 + idToBranchFactor = {0: 2, 1: 4, 2: 8, 3: 32} + height = (headerByte >> 2) & 0x1F + return idToBranchFactor[id], height + + +def _treeHeight(branchFactor: int, maxValue: int) -> int: + """Return the minimum tree height needed to represent maxValue.""" + height = 1 + capacity = branchFactor + while capacity <= maxValue: + capacity *= branchFactor + height += 1 + return height + + +def _encodeWithBf(valueSet: Set[int], branchFactor: int, height: int) -> bytes: + if height == 0: + return _encodeHeader(branchFactor, 0) + + # Build layers bottom-up: layer 0 = leaves (individual values), + # each higher layer groups bf children into one parent bitmask. + layers: list[Dict[int, int]] = [{}] # list of dicts: nodeIndex -> bitmask + + for v in valueSet: + nodeIndex = v // branchFactor + bitPos = v % branchFactor + if nodeIndex not in layers[0]: + layers[0][nodeIndex] = 0 + layers[0][nodeIndex] |= 1 << bitPos + + for _ in range(1, height): + prevLayer = layers[-1] + newLayer: Dict[int, int] = {} + for nodeIndex, bitmask in prevLayer.items(): + parentIndex = nodeIndex // branchFactor + bitPos = nodeIndex % branchFactor + if parentIndex not in newLayer: + newLayer[parentIndex] = 0 + newLayer[parentIndex] |= 1 << bitPos + layers.append(newLayer) + + # For zero-node optimization: track count of values in sorted list. + valuesSorted = sorted(valueSet) + + def rangeCount(lo: int, hi: int) -> int: + return bisect.bisect_right(valuesSorted, hi) - bisect.bisect_left( + valuesSorted, lo + ) + + # Emit nodes BFS order (root to leaves). + # Queue entries: (nodeIndex, depthFromRoot, rangeStart, rangeEnd) + stream = _OutputBitStream(branchFactor) + subtreeSize = branchFactor**height + queue: deque[Tuple[int, int, int, int]] = deque([(0, 0, 0, subtreeSize - 1)]) + + while queue: + nodeIndex, depth, rangeStart, rangeEnd = queue.popleft() + layerIdx = height - 1 - depth # layers[0]=leaves, layers[height-1]=root + + bitmask = ( + layers[layerIdx].get(nodeIndex, 0) if 0 <= layerIdx < len(layers) else 0 + ) + + # Zero-node optimization: if entire range is filled on an INTERNAL node, + # write 0 and skip children. At leaf level we always write the explicit + # bitmask so the encoding matches the reference. + if ( + depth < height - 1 + and rangeCount(rangeStart, rangeEnd) == rangeEnd - rangeStart + 1 + ): + stream.write(0) + continue + + stream.write(bitmask) + + if bitmask != 0 and depth < height - 1: + childSize = (rangeEnd - rangeStart + 1) // branchFactor + bits = bitmask + while bits: + bitIndex = _trailingZeros(bits, 32) + childIndex = nodeIndex * branchFactor + bitIndex + childStart = rangeStart + bitIndex * childSize + childEnd = childStart + childSize - 1 + queue.append((childIndex, depth + 1, childStart, childEnd)) + bits &= ~(1 << bitIndex) + + return _encodeHeader(branchFactor, height) + stream.toBytes() + + +def _trailingZeros(val: int, maxBits: int) -> int: + if val == 0: + return maxBits + count = 0 + while (val & 1) == 0: + val >>= 1 + count += 1 + return count + + +class _InputBitStream: + """Reads bit nodes from a byte array, starting after the header byte.""" + + def __init__(self, data: bytes, branchFactor: int): + self.data = data + self.branchFactor = branchFactor + self.byteIndex = 1 # skip the header byte + self.subIndex = 0 # bit offset within current byte (for bf=2,4) + + def next(self) -> Optional[int]: + if self.branchFactor in (2, 4): + if self.byteIndex >= len(self.data): + return None + mask = (1 << self.branchFactor) - 1 + val = (self.data[self.byteIndex] >> self.subIndex) & mask + self.subIndex += self.branchFactor + if self.subIndex >= 8: + self.subIndex = 0 + self.byteIndex += 1 + return val + elif self.branchFactor == 8: + if self.byteIndex >= len(self.data): + return None + val = self.data[self.byteIndex] + self.byteIndex += 1 + return val + elif self.branchFactor == 32: + if self.byteIndex + 3 >= len(self.data): + return None + b = self.data + i = self.byteIndex + val = b[i] | (b[i + 1] << 8) | (b[i + 2] << 16) | (b[i + 3] << 24) + self.byteIndex += 4 + return val + return None + + def bytesConsumed(self) -> int: + return self.byteIndex + (1 if self.subIndex > 0 else 0) + + +class _OutputBitStream: + """Writes bit nodes into a byte array.""" + + def __init__(self, branchFactor: int): + self.branchFactor = branchFactor + self.data = bytearray() + self.subIndex = 0 # bit offset within current byte (for bf=2,4) + + def write(self, value: int) -> None: + if self.branchFactor in (2, 4): + mask = (1 << self.branchFactor) - 1 + value &= mask + if self.subIndex == 0: + self.data.append(0) + self.data[-1] |= value << self.subIndex + self.subIndex += self.branchFactor + if self.subIndex >= 8: + self.subIndex = 0 + elif self.branchFactor == 8: + self.data.append(value & 0xFF) + elif self.branchFactor == 32: + self.data.append(value & 0xFF) + self.data.append((value >> 8) & 0xFF) + self.data.append((value >> 16) & 0xFF) + self.data.append((value >> 24) & 0xFF) + + def toBytes(self) -> bytes: + return bytes(self.data) |
