summary refs log tree commit diff stats
path: root/contrib/python/wcwidth
diff options
context:
space:
mode:
author robot-piglet <[email protected]> 2026-02-11 10:55:54 +0300
committer robot-piglet <[email protected]> 2026-02-11 11:24:08 +0300
commit 9b5f29efa00bba424cd32471a95ececc583fe046 (patch)
tree 58936c6fc2147c49fc2a4aec657f63fb6f789336 /contrib/python/wcwidth
parent df75a44af0e3c0cfce907e22f61d6c91fc3bbc39 (diff)
Intermediate changes
commit_hash:721c786fcb8a37574bec0881ae2194859f790fae
Diffstat (limited to 'contrib/python/wcwidth')
-rw-r--r-- contrib/python/wcwidth/py3/.dist-info/METADATA 14
-rw-r--r-- contrib/python/wcwidth/py3/tests/test_benchmarks.py 56
-rw-r--r-- contrib/python/wcwidth/py3/tests/test_grapheme.py 111
-rw-r--r-- contrib/python/wcwidth/py3/tests/test_textwrap.py 69
-rw-r--r-- contrib/python/wcwidth/py3/wcwidth/__init__.py 6
-rw-r--r-- contrib/python/wcwidth/py3/wcwidth/grapheme.py 119
-rw-r--r-- contrib/python/wcwidth/py3/wcwidth/textwrap.py 39
-rw-r--r-- contrib/python/wcwidth/py3/ya.make 2
8 files changed, 403 insertions, 13 deletions
diff --git a/contrib/python/wcwidth/py3/.dist-info/METADATA b/contrib/python/wcwidth/py3/.dist-info/METADATA
index de002938d9d..c80ecb6d722 100644
--- a/contrib/python/wcwidth/py3/.dist-info/METADATA
+++ b/contrib/python/wcwidth/py3/.dist-info/METADATA
@@ -1,6 +1,6 @@
Metadata-Version: 2.4
Name: wcwidth
-Version: 0.3.5
+Version: 0.4.0
Summary: Measures the displayed width of unicode strings in a terminal
Project-URL: Homepage, https://github.com/jquast/wcwidth
Author-email: Jeff Quast <[email protected]>
@@ -78,8 +78,10 @@ Text-justification is solved by the grapheme and sequence-aware functions `ljust
of the same names.
The iterator functions `iter_graphemes()`_ and `iter_sequences()`_ allow for careful navigation of
-grapheme and terminal control sequence boundaries. The `clip()`_ function extracts substrings by
-display column positions, and `strip_sequences()`_ removes terminal escape sequences from text.
+grapheme and terminal control sequence boundaries. `iter_graphemes_reverse()`_ and
+`grapheme_boundary_before()`_ are useful for editing and searching complex Unicode text. The
+`clip()`_ function extracts substrings by display column positions, and `strip_sequences()`_ removes
+terminal escape sequences from text altogether.
Discrepancies
-------------
@@ -472,6 +474,10 @@ languages.
History
=======
+0.4.0 *2026-01-25*
+ * **New** Functions `iter_graphemes_reverse()`_, `grapheme_boundary_before()`_.
+ * **Bugfix** OSC Hyperlinks should not be broken by ``wrap()``
+
0.3.5 *2026-01-24*
* **Bugfix** packaging of 0.3.4 contains a failing test.
@@ -690,6 +696,8 @@ https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c::
.. _`wcswidth()`: https://wcwidth.readthedocs.io/en/latest/api.html#wcwidth.wcswidth
.. _`width()`: https://wcwidth.readthedocs.io/en/latest/api.html#wcwidth.width
.. _`iter_graphemes()`: https://wcwidth.readthedocs.io/en/latest/api.html#wcwidth.iter_graphemes
+.. _`iter_graphemes_reverse()`: https://wcwidth.readthedocs.io/en/latest/api.html#wcwidth.iter_graphemes_reverse
+.. _`grapheme_boundary_before()`: https://wcwidth.readthedocs.io/en/latest/api.html#wcwidth.grapheme_boundary_before
.. _`ljust()`: https://wcwidth.readthedocs.io/en/latest/api.html#wcwidth.ljust
.. _`rjust()`: https://wcwidth.readthedocs.io/en/latest/api.html#wcwidth.rjust
.. _`center()`: https://wcwidth.readthedocs.io/en/latest/api.html#wcwidth.center
diff --git a/contrib/python/wcwidth/py3/tests/test_benchmarks.py b/contrib/python/wcwidth/py3/tests/test_benchmarks.py
index 5c929f7ec37..6e0ffadcc45 100644
--- a/contrib/python/wcwidth/py3/tests/test_benchmarks.py
+++ b/contrib/python/wcwidth/py3/tests/test_benchmarks.py
@@ -1,6 +1,7 @@
"""Performance benchmarks for wcwidth module."""
# std imports
import os
+import unicodedata
# local
import wcwidth
@@ -54,6 +55,31 @@ def test_wcswidth_emoji_sequence(benchmark):
benchmark(wcwidth.wcswidth, text)
+# NFC vs NFD comparison - text with combining marks
+DIACRITICS_COMPOSED = 'café résumé naïve ' * 100
+DIACRITICS_DECOMPOSED = unicodedata.normalize('NFD', DIACRITICS_COMPOSED)
+
+
+def test_wcswidth_composed(benchmark):
+ """Benchmark wcswidth() with NFC-composed text."""
+ benchmark(wcwidth.wcswidth, DIACRITICS_COMPOSED)
+
+
+def test_wcswidth_decomposed(benchmark):
+ """Benchmark wcswidth() with NFD-decomposed text."""
+ benchmark(wcwidth.wcswidth, DIACRITICS_DECOMPOSED)
+
+
+def test_width_composed(benchmark):
+ """Benchmark width() with NFC-composed text."""
+ benchmark(wcwidth.width, DIACRITICS_COMPOSED)
+
+
+def test_width_decomposed(benchmark):
+ """Benchmark width() with NFD-decomposed text."""
+ benchmark(wcwidth.width, DIACRITICS_DECOMPOSED)
+
+
def test_width_ascii(benchmark):
"""Benchmark width() with ASCII string."""
benchmark(wcwidth.width, 'hello world')
@@ -88,6 +114,36 @@ def test_iter_graphemes_combining(benchmark):
benchmark(lambda: list(wcwidth.iter_graphemes(text)))
+def test_grapheme_boundary_before_short(benchmark):
+ """Benchmark grapheme_boundary_before() near end of short string."""
+ text = 'Hello 👋🏻!'
+ benchmark(wcwidth.grapheme_boundary_before, text, 8)
+
+
+def test_grapheme_boundary_before_long_end(benchmark):
+ """Benchmark grapheme_boundary_before() near end of long line."""
+ text = 'x' * 95 + '👨\u200d👩\u200d👧!'
+ benchmark(wcwidth.grapheme_boundary_before, text, 100)
+
+
+def test_grapheme_boundary_before_long_mid(benchmark):
+ """Benchmark grapheme_boundary_before() in middle of long line."""
+ text = 'x' * 50 + '👨\u200d👩\u200d👧' + 'y' * 50
+ benchmark(wcwidth.grapheme_boundary_before, text, 55)
+
+
+def test_iter_graphemes_reverse_short(benchmark):
+ """Benchmark iter_graphemes_reverse() with short string."""
+ text = 'café\u0301 🇫🇷!'
+ benchmark(lambda: list(wcwidth.iter_graphemes_reverse(text)))
+
+
+def test_iter_graphemes_reverse_long(benchmark):
+ """Benchmark iter_graphemes_reverse() with long string."""
+ text = 'The quick brown 🦊 jumps over the lazy 🐕. ' * 5
+ benchmark(lambda: list(wcwidth.iter_graphemes_reverse(text)))
+
+
def test_ljust_ascii(benchmark):
"""Benchmark ljust() with ASCII string."""
benchmark(wcwidth.ljust, 'hello', 20)
diff --git a/contrib/python/wcwidth/py3/tests/test_grapheme.py b/contrib/python/wcwidth/py3/tests/test_grapheme.py
index f344ad32fde..d2cfa86c1c8 100644
--- a/contrib/python/wcwidth/py3/tests/test_grapheme.py
+++ b/contrib/python/wcwidth/py3/tests/test_grapheme.py
@@ -6,7 +6,7 @@ import os
import pytest
# local
-from wcwidth import iter_graphemes
+from wcwidth import iter_graphemes, iter_graphemes_reverse, grapheme_boundary_before
try:
chr(0x2fffe)
@@ -145,3 +145,112 @@ def test_wide_unicode_graphemes(input_str, expected):
def test_unicode_grapheme_break_test(input_str, expected):
"""Validate against official Unicode GraphemeBreakTest.txt."""
assert list(iter_graphemes(input_str)) == expected
+
+
+# Prepend: Arabic Number Sign
+PREPEND_CHAR = '\u0600'
+# Multiple combining marks: e + acute + grave
+MULTI_COMBINE = 'e\u0301\u0300'
+
+
+# grapheme_boundary_before(text, pos) returns start of grapheme cluster before pos.
+# (text, pos, expected): pos=search from here, expected=where cluster starts
[email protected](("text", "pos", "expected"), [
+ # 'abc': 0=a, 1=b, 2=c
+ ('abc', 3, 2), # from end -> 'c' at 2
+ ('abc', 2, 1), # from 'c' -> 'b' at 1
+ ('abc', 1, 0), # from 'b' -> 'a' at 0
+ # 'a\r\nb': CRLF is one cluster (GB3)
+ ('a\r\nb', 3, 1), # from 'b' -> '\r\n' at 1
+ # 'café': e + combining acute is one cluster (GB9)
+ ('cafe\u0301', 5, 3), # from end -> 'é' at 3
+ ('cafe\u0301', 4, 3), # from acute -> still 'é' at 3
+ # Multiple combining marks: e + acute + grave (GB9)
+ ('a' + MULTI_COMBINE + 'b', 4, 1), # from 'b' -> e+marks at 1
+ # Prepend + char is one cluster (GB9b)
+ (PREPEND_CHAR + 'a', 2, 0), # whole cluster
+ # Prepend + Control: control breaks (GB4)
+ (PREPEND_CHAR + '\n', 2, 1), # '\n' separate at 1
+ # C1 control (NEL, 0x85) stops backward scan in _find_cluster_start (GB4)
+ ('X\x85\u0301', 3, 2),
+])
+def test_grapheme_boundary_before_basic(text, pos, expected):
+ """Basic grapheme_boundary_before tests."""
+ assert grapheme_boundary_before(text, pos) == expected
+
+
[email protected](NARROW_ONLY, reason="requires wide Unicode")
[email protected](("text", "pos", "expected"), [
+ # 'Hi 👋🏻!': 0=H,1=i,2=space,3=wave,4=skin,5=!; wave+skin is one cluster
+ ('Hi \U0001F44B\U0001F3FB!', 6, 5), # from end -> '!' at 5
+ ('Hi \U0001F44B\U0001F3FB!', 5, 3), # from '!' -> wave+skin at 3
+ ('Hi \U0001F44B\U0001F3FB!', 3, 2), # from wave -> space at 2
+ # 'a🇺🇸b': 0=a,1-2=flag,3=b; flag is one cluster (GB12/13)
+ ('a' + FLAG_US + 'b', 4, 3), # from end -> 'b' at 3
+ ('a' + FLAG_US + 'b', 3, 1), # from 'b' -> flag at 1
+ # Three RIs (🇺🇸🇦): flag + solo RI
+ (FLAG_US + RI_A, 3, 2), # from end -> solo RI at 2
+ (FLAG_US + RI_A, 2, 0), # from solo -> flag at 0
+ # 'a👨‍👩‍👧b': 0=a,1-5=family,6=b; ZWJ sequence is one cluster (GB11)
+ ('a' + FAMILY + 'b', 7, 6), # from end -> 'b' at 6
+ ('a' + FAMILY + 'b', 6, 1), # from 'b' -> family at 1
+])
+def test_grapheme_boundary_before_unicode(text, pos, expected):
+ """grapheme_boundary_before with emoji and wide Unicode."""
+ assert grapheme_boundary_before(text, pos) == expected
+
+
[email protected](("input_str", "expected"), [
+ ('', []),
+ ('abc', ['c', 'b', 'a']),
+ # café with combining mark mixed with CRLF
+ ('cafe\u0301\r\nok', ['k', 'o', '\r\n', 'e\u0301', 'f', 'a', 'c']),
+])
+def test_iter_graphemes_reverse_basic(input_str, expected):
+ """Basic iter_graphemes_reverse tests."""
+ assert list(iter_graphemes_reverse(input_str)) == expected
+
+
[email protected](NARROW_ONLY, reason="requires wide Unicode")
[email protected](("input_str", "expected"), [
+ # Multiple emoji types in one string
+ ('cafe\u0301 ' + WAVE_SKIN + ' ' + FLAG_US + '!',
+ ['!', FLAG_US, ' ', WAVE_SKIN, ' ', 'e\u0301', 'f', 'a', 'c']),
+ # Two families
+ (FAMILY + FAMILY, [FAMILY, FAMILY]),
+ # Flag + solo RI + text
+ ('Hi' + FLAG_US + RI_A + '!', ['!', RI_A, FLAG_US, 'i', 'H']),
+])
+def test_iter_graphemes_reverse_unicode(input_str, expected):
+ """iter_graphemes_reverse with wide Unicode."""
+ assert list(iter_graphemes_reverse(input_str)) == expected
+
+
[email protected](NARROW_ONLY, reason="requires wide Unicode")
[email protected](("input_str", "expected"), read_grapheme_break_test())
+def test_grapheme_roundtrip_consistency(input_str, expected):
+ """Forward and reverse iteration produce identical boundaries."""
+ forward = list(iter_graphemes(input_str))
+ reverse = list(iter_graphemes_reverse(input_str))[::-1]
+ assert forward == reverse
+
+
+def test_grapheme_boundary_before_edge_cases():
+ """Edge cases for grapheme_boundary_before."""
+ assert grapheme_boundary_before('abc', 0) == 0
+ assert grapheme_boundary_before('abc', 100) == 2 # pos > len clamps
+ assert grapheme_boundary_before('', 0) == 0
+
+
+def test_iter_graphemes_reverse_edge_cases():
+ """Edge cases for iter_graphemes_reverse."""
+ assert list(iter_graphemes_reverse('abcdef', start=2, end=5)) == ['e', 'd', 'c']
+ assert list(iter_graphemes_reverse('abc', start=0, end=100)) == ['c', 'b', 'a']
+ assert not list(iter_graphemes_reverse('abc', start=5))
+ assert not list(iter_graphemes_reverse('abc', start=2, end=2))
+ # PREPEND + char is one grapheme (GB9b), so start=1 yields nothing (won't split)
+ assert not list(iter_graphemes_reverse(PREPEND_CHAR + 'a', start=1))
+ # But start=0 yields the full grapheme
+ assert list(iter_graphemes_reverse(PREPEND_CHAR + 'a', start=0)) == [PREPEND_CHAR + 'a']
+ # Negative start is clamped to 0
+ assert list(iter_graphemes_reverse('abc', start=-5)) == ['c', 'b', 'a']
diff --git a/contrib/python/wcwidth/py3/tests/test_textwrap.py b/contrib/python/wcwidth/py3/tests/test_textwrap.py
index c2f28bffe1a..fc15f1917f9 100644
--- a/contrib/python/wcwidth/py3/tests/test_textwrap.py
+++ b/contrib/python/wcwidth/py3/tests/test_textwrap.py
@@ -12,6 +12,7 @@ from wcwidth import iter_sequences
from wcwidth.textwrap import SequenceTextWrapper, wrap
SGR_RED = '\x1b[31m'
+SGR_BLUE = '\x1b[34m'
SGR_BOLD = '\x1b[1m'
SGR_RESET = '\x1b[0m'
ATTRS = ('\x1b[31m', '\x1b[34m', '\x1b[4m', '\x1b[7m', '\x1b[41m', '\x1b[37m', '\x1b[107m')
@@ -203,7 +204,7 @@ SEQUENCE_CASES = [
# Empty/adjacent sequences
(f'{SGR_RED}{SGR_RESET}', 10, [f'{SGR_RED}{SGR_RESET}']),
(f'hello {SGR_RED}{SGR_RESET}world', 6, ['hello', f'{SGR_RED}{SGR_RESET}world']),
- # OSC hyperlinks
+ # OSC hyperlinks (with space separator)
(f'{OSC_HYPERLINK} text', 5, [OSC_HYPERLINK, 'text']),
# CSI cursor sequences
(f'{CSI_CURSOR}text here', 10, [f'{CSI_CURSOR}text', 'here']),
@@ -262,3 +263,69 @@ TABSIZE_WIDE_CASES = [
def test_wrap_tabsize_wide_chars(text, w, tabsize, expected):
"""Verify tabsize respects wide character column positions."""
assert wrap(text, w, tabsize=tabsize) == expected
+
+
+OSC_START_ST = '\x1b]8;;http://example.com\x1b\\'
+OSC_END_ST = '\x1b]8;;\x1b\\'
+OSC_START_BEL = '\x1b]8;;http://example.com\x07'
+OSC_END_BEL = '\x1b]8;;\x07'
+
+HYPERLINK_WORD_BOUNDARY_CASES = [
+ ( # standard, ST-variant,
+ f'{OSC_START_ST}link{OSC_END_ST}more',
+ 5,
+ [f'{OSC_START_ST}link{OSC_END_ST}', 'more'],
+ ),
+ ( # BEL-variant,
+ f'{OSC_START_BEL}link{OSC_END_BEL}more',
+ 5,
+ [f'{OSC_START_BEL}link{OSC_END_BEL}', 'more'],
+ ),
+ ( # line breaks after the word 'prefix', before the hyperlink
+ f'prefix{OSC_START_ST}link{OSC_END_ST}',
+ 6,
+ ['prefix', f'{OSC_START_ST}link{OSC_END_ST}'],
+ ),
+ (
+ f'prefix{OSC_START_BEL}link{OSC_END_BEL}',
+ 6,
+ ['prefix', f'{OSC_START_BEL}link{OSC_END_BEL}'],
+ ),
+ ( # line also breaks after the hyperlink, before the following word 'suffix'
+ f'prefix{OSC_START_ST}link{OSC_END_ST}suffix',
+ 6,
+ ['prefix', f'{OSC_START_ST}link{OSC_END_ST}', 'suffix'],
+ ),
+ (
+ f'prefix{OSC_START_BEL}link{OSC_END_BEL}suffix',
+ 6,
+ ['prefix', f'{OSC_START_BEL}link{OSC_END_BEL}', 'suffix'],
+ ),
+ ( # hyperlink *surrounded* by SGR attributes
+ f'foo {SGR_RED}{OSC_START_ST}link{OSC_END_ST}{SGR_RESET} bar',
+ 6,
+ ['foo', f'{SGR_RED}{OSC_START_ST}link{OSC_END_ST}{SGR_RESET}', 'bar'],
+ ),
+ (
+ f'foo {SGR_RED}{OSC_START_BEL}link{OSC_END_BEL}{SGR_RESET} bar',
+ 6,
+ ['foo', f'{SGR_RED}{OSC_START_BEL}link{OSC_END_BEL}{SGR_RESET}', 'bar'],
+ ),
+ ( # hyperlink *containing* SGR attributes
+ f'foo {OSC_START_ST}{SGR_RED}link{SGR_RESET}{OSC_END_ST} bar',
+ 6,
+ ['foo', f'{OSC_START_ST}{SGR_RED}link{SGR_RESET}{OSC_END_ST}', 'bar'],
+ ),
+ (
+ f'foo {OSC_START_BEL}{SGR_RED}link{SGR_RESET}{OSC_END_BEL} bar',
+ 6,
+ ['foo', f'{OSC_START_BEL}{SGR_RED}link{SGR_RESET}{OSC_END_BEL}', 'bar'],
+ ),
+]
+
+
[email protected]('text,w,expected', HYPERLINK_WORD_BOUNDARY_CASES)
+def test_wrap_hyperlink_word_boundary(text, w, expected):
+ """OSC hyperlink sequences should act as word boundaries."""
+ result = wrap(text, w)
+ assert result == expected
diff --git a/contrib/python/wcwidth/py3/wcwidth/__init__.py b/contrib/python/wcwidth/py3/wcwidth/__init__.py
index ed29279dcb8..03279ff863b 100644
--- a/contrib/python/wcwidth/py3/wcwidth/__init__.py
+++ b/contrib/python/wcwidth/py3/wcwidth/__init__.py
@@ -26,16 +26,18 @@ from .wcwidth import (WIDE_EASTASIAN,
_wcmatch_version,
_wcversion_value)
from .bisearch import bisearch as _bisearch
-from .grapheme import iter_graphemes # noqa
+from .grapheme import grapheme_boundary_before # noqa
+from .grapheme import iter_graphemes, iter_graphemes_reverse
from .textwrap import SequenceTextWrapper, wrap
# The __all__ attribute defines the items exported from statement,
# 'from wcwidth import *', but also to say, "This is the public API".
__all__ = ('wcwidth', 'wcswidth', 'width', 'iter_sequences', 'iter_graphemes',
+ 'iter_graphemes_reverse', 'grapheme_boundary_before',
'ljust', 'rjust', 'center', 'wrap', 'clip', 'strip_sequences',
'list_versions')
# We also used pkg_resources to load unicode version tables from version.json,
# generated by bin/update-tables.py, but some environments are unable to
# import pkg_resources for one reason or another, yikes!
-__version__ = '0.3.5'
+__version__ = '0.4.0'
diff --git a/contrib/python/wcwidth/py3/wcwidth/grapheme.py b/contrib/python/wcwidth/py3/wcwidth/grapheme.py
index 1a83668b066..63713b9070d 100644
--- a/contrib/python/wcwidth/py3/wcwidth/grapheme.py
+++ b/contrib/python/wcwidth/py3/wcwidth/grapheme.py
@@ -36,6 +36,10 @@ if TYPE_CHECKING: # pragma: no cover
# std imports
from collections.abc import Iterator
+# Maximum backward scan distance when finding grapheme cluster boundaries.
+# Covers all known Unicode grapheme clusters with margin; longer sequences are pathological.
+MAX_GRAPHEME_SCAN = 32
+
class GCB(IntEnum):
"""Grapheme Cluster Break property values."""
@@ -304,3 +308,118 @@ def iter_graphemes(
# Yield the final cluster
yield unistr[cluster_start:end]
+
+
+def _find_cluster_start(text: str, pos: int) -> int:
+ """
+ Find the start of the grapheme cluster containing the character before pos.
+
+ Scans backwards from pos to find a safe starting point, then iterates forward using standard
+ break rules to find the actual cluster boundary.
+
+ :param text: The Unicode string.
+ :param pos: Position to search before (exclusive).
+ :returns: Start position of the grapheme cluster.
+ """
+ target_cp = ord(text[pos - 1])
+
+ # GB3: CR x LF - LF after CR is part of same cluster
+ if target_cp == 0x0A and pos >= 2 and text[pos - 2] == '\r':
+ return pos - 2
+
+ # Fast path: ASCII (except LF) starts its own cluster
+ if target_cp < 0x80:
+ # GB9b: Check for preceding PREPEND (rare: Arabic/Brahmic)
+ if pos >= 2 and target_cp >= 0x20:
+ prev_cp = ord(text[pos - 2])
+ if prev_cp >= 0x80 and _grapheme_cluster_break(prev_cp) == GCB.PREPEND:
+ return _find_cluster_start(text, pos - 1)
+ return pos - 1
+
+ # Scan backward to find a safe starting point
+ safe_start = pos - 1
+ while safe_start > 0 and (pos - safe_start) < MAX_GRAPHEME_SCAN:
+ cp = ord(text[safe_start])
+ if 0x20 <= cp < 0x80: # ASCII always starts a cluster
+ break
+ if _grapheme_cluster_break(cp) == GCB.CONTROL: # GB4
+ break
+ safe_start -= 1
+
+ # Verify forward to find the actual cluster boundary
+ cluster_start = safe_start
+ left_gcb = _grapheme_cluster_break(ord(text[safe_start]))
+ ri_count = 1 if left_gcb == GCB.REGIONAL_INDICATOR else 0
+
+ for i in range(safe_start + 1, pos):
+ right_gcb = _grapheme_cluster_break(ord(text[i]))
+ result = _should_break(left_gcb, right_gcb, text, i, ri_count)
+ ri_count = result.ri_count
+ if result.should_break:
+ cluster_start = i
+ left_gcb = right_gcb
+
+ return cluster_start
+
+
+def grapheme_boundary_before(unistr: str, pos: int) -> int:
+ r"""
+ Find the grapheme cluster boundary immediately before a position.
+
+ :param unistr: The Unicode string to search.
+ :param pos: Position in the string (0 < pos <= len(unistr)).
+ :returns: Start index of the grapheme cluster containing the character at pos-1.
+
+ Example::
+
+ >>> grapheme_boundary_before('Hello \U0001F44B\U0001F3FB', 8)
+ 6
+ >>> grapheme_boundary_before('a\r\nb', 3)
+ 1
+
+ .. versionadded:: 0.4.0
+ """
+ if pos <= 0:
+ return 0
+ return _find_cluster_start(unistr, min(pos, len(unistr)))
+
+
+def iter_graphemes_reverse(
+ unistr: str,
+ start: int = 0,
+ end: int | None = None,
+) -> Iterator[str]:
+ r"""
+ Iterate over grapheme clusters in reverse order (last to first).
+
+ :param unistr: The Unicode string to segment.
+ :param start: Starting index (default 0).
+ :param end: Ending index (default len(unistr)).
+ :yields: Grapheme cluster substrings in reverse order.
+
+ Example::
+
+ >>> list(iter_graphemes_reverse('cafe\u0301'))
+ ['e\u0301', 'f', 'a', 'c']
+
+ .. versionadded:: 0.4.0
+ """
+ if not unistr:
+ return
+
+ length = len(unistr)
+
+ end = length if end is None else min(end, length)
+ start = max(start, 0)
+
+ if start >= end or start >= length:
+ return
+
+ pos = end
+ while pos > start:
+ cluster_start = _find_cluster_start(unistr, pos)
+ # Don't yield partial graphemes that extend before start
+ if cluster_start < start:
+ break
+ yield unistr[cluster_start:pos]
+ pos = cluster_start
diff --git a/contrib/python/wcwidth/py3/wcwidth/textwrap.py b/contrib/python/wcwidth/py3/wcwidth/textwrap.py
index 8b91d6ff9ce..41d89a3dba0 100644
--- a/contrib/python/wcwidth/py3/wcwidth/textwrap.py
+++ b/contrib/python/wcwidth/py3/wcwidth/textwrap.py
@@ -34,6 +34,9 @@ class SequenceTextWrapper(textwrap.TextWrapper):
The key difference from the blessed implementation is the addition of grapheme cluster support
via :func:`~.iter_graphemes`, providing width calculation for ZWJ emoji sequences, VS-16 emojis
and variations, regional indicator flags, and combining characters.
+
+ OSC hyperlink sequences are treated as word boundaries, ensuring that text adjacent to
+ hyperlinks wraps correctly without breaking the hyperlink structure.
"""
def __init__(self, width: int = 70, *,
@@ -77,17 +80,25 @@ class SequenceTextWrapper(textwrap.TextWrapper):
return ''.join(result)
def _split(self, text: str) -> list[str]: # pylint: disable=too-many-locals
- """
+ r"""
Sequence-aware variant of :meth:`textwrap.TextWrapper._split`.
This method ensures that terminal escape sequences don't interfere with the text splitting
logic, particularly for hyphen-based word breaking. It builds a position mapping from
stripped text to original text, calls the parent's _split on stripped text, then maps chunks
back.
+
+ OSC hyperlink sequences are treated as word boundaries::
+
+ >>> wrap('foo \x1b]8;;https://example.com\x07link\x1b]8;;\x07 bar', 6)
+ ['foo', '\x1b]8;;https://example.com\x07link\x1b]8;;\x07', 'bar']
+
+ Both BEL (``\x07``) and ST (``\x1b\\``) terminators are supported.
"""
# pylint: disable=too-many-locals,too-many-branches
# Build a mapping from stripped text positions to original text positions.
- # We track where each character ENDS so that sequences between characters
+ #
+ # Track where each character ENDS so that sequences between characters
# attach to the following text (not preceding text). This ensures sequences
# aren't lost when whitespace is dropped.
#
@@ -95,16 +106,32 @@ class SequenceTextWrapper(textwrap.TextWrapper):
char_end: list[int] = []
stripped_text = ''
original_pos = 0
+ prev_was_hyperlink_close = False
for segment, is_seq in iter_sequences(text):
if not is_seq:
+ # Conditionally insert space after hyperlink close to force word boundary
+ if prev_was_hyperlink_close and segment and not segment[0].isspace():
+ stripped_text += ' '
+ char_end.append(original_pos)
for char in segment:
original_pos += 1
char_end.append(original_pos)
stripped_text += char
+ prev_was_hyperlink_close = False
else:
+ # Conditionally insert space before OSC sequences to artificially create word
+ # boundary, but *not* before hyperlink close sequences, to ensure hyperlink is
+ # terminated on the same line.
+ is_hyperlink_close = segment.startswith(('\x1b]8;;\x1b\\', '\x1b]8;;\x07'))
+ if (segment.startswith('\x1b]') and stripped_text and not
+ stripped_text[-1].isspace()):
+ if not is_hyperlink_close:
+ stripped_text += ' '
+ char_end.append(original_pos)
# Escape sequences advance position but don't add to stripped text
original_pos += len(segment)
+ prev_was_hyperlink_close = is_hyperlink_close
# Add sentinel for final position
char_end.append(original_pos)
@@ -137,7 +164,9 @@ class SequenceTextWrapper(textwrap.TextWrapper):
end_orig = char_end[stripped_pos + chunk_len - 1]
# Extract the corresponding portion from the original text
- result.append(text[start_orig:end_orig])
+ # Skip empty chunks (from virtual spaces inserted at OSC boundaries)
+ if start_orig != end_orig:
+ result.append(text[start_orig:end_orig])
stripped_pos += chunk_len
return result
@@ -303,8 +332,8 @@ class SequenceTextWrapper(textwrap.TextWrapper):
idx = match.end()
continue
- # Get grapheme
- grapheme = next(iter_graphemes(text[idx:]))
+ # Get grapheme (use start= to avoid slice allocation)
+ grapheme = next(iter_graphemes(text, start=idx))
grapheme_width = self._width(grapheme)
if width_so_far + grapheme_width > max_width:
diff --git a/contrib/python/wcwidth/py3/ya.make b/contrib/python/wcwidth/py3/ya.make
index 36c1c8b8ee2..542d8f80c0f 100644
--- a/contrib/python/wcwidth/py3/ya.make
+++ b/contrib/python/wcwidth/py3/ya.make
@@ -2,7 +2,7 @@
PY3_LIBRARY()
-VERSION(0.3.5)
+VERSION(0.4.0)
LICENSE(MIT)