Intermediate changes

commit_hash:721c786fcb8a37574bec0881ae2194859f790fae
author: robot-piglet <[email protected]> 2026-02-11 10:55:54 +0300
committer: robot-piglet <[email protected]> 2026-02-11 11:24:08 +0300
commit: 9b5f29efa00bba424cd32471a95ececc583fe046 (patch)
tree: 58936c6fc2147c49fc2a4aec657f63fb6f789336 /contrib/python/wcwidth/py3/tests
parent: df75a44af0e3c0cfce907e22f61d6c91fc3bbc39 (diff)
3 files changed, 234 insertions, 2 deletions
diff --git a/contrib/python/wcwidth/py3/tests/test_benchmarks.py b/contrib/python/wcwidth/py3/tests/test_benchmarks.py
index 5c929f7ec37..6e0ffadcc45 100644
--- a/contrib/python/wcwidth/py3/tests/test_benchmarks.py
+++ b/contrib/python/wcwidth/py3/tests/test_benchmarks.py
@@ -1,6 +1,7 @@
 """Performance benchmarks for wcwidth module."""
 # std imports
 import os
+import unicodedata
 
 # local
 import wcwidth
@@ -54,6 +55,31 @@ def test_wcswidth_emoji_sequence(benchmark):
     benchmark(wcwidth.wcswidth, text)
 
 
+# NFC vs NFD comparison - text with combining marks
+DIACRITICS_COMPOSED = 'café résumé naïve ' * 100
+DIACRITICS_DECOMPOSED = unicodedata.normalize('NFD', DIACRITICS_COMPOSED)
+
+
+def test_wcswidth_composed(benchmark):
+    """Benchmark wcswidth() with NFC-composed text."""
+    benchmark(wcwidth.wcswidth, DIACRITICS_COMPOSED)
+
+
+def test_wcswidth_decomposed(benchmark):
+    """Benchmark wcswidth() with NFD-decomposed text."""
+    benchmark(wcwidth.wcswidth, DIACRITICS_DECOMPOSED)
+
+
+def test_width_composed(benchmark):
+    """Benchmark width() with NFC-composed text."""
+    benchmark(wcwidth.width, DIACRITICS_COMPOSED)
+
+
+def test_width_decomposed(benchmark):
+    """Benchmark width() with NFD-decomposed text."""
+    benchmark(wcwidth.width, DIACRITICS_DECOMPOSED)
+
+
 def test_width_ascii(benchmark):
     """Benchmark width() with ASCII string."""
     benchmark(wcwidth.width, 'hello world')
@@ -88,6 +114,36 @@ def test_iter_graphemes_combining(benchmark):
     benchmark(lambda: list(wcwidth.iter_graphemes(text)))
 
 
+def test_grapheme_boundary_before_short(benchmark):
+    """Benchmark grapheme_boundary_before() near start of short string."""
+    text = 'Hello 👋🏻!'
+    benchmark(wcwidth.grapheme_boundary_before, text, 8)
+
+
+def test_grapheme_boundary_before_long_end(benchmark):
+    """Benchmark grapheme_boundary_before() near end of long line."""
+    text = 'x' * 95 + '👨\u200d👩\u200d👧!'
+    benchmark(wcwidth.grapheme_boundary_before, text, 100)
+
+
+def test_grapheme_boundary_before_long_mid(benchmark):
+    """Benchmark grapheme_boundary_before() in middle of long line."""
+    text = 'x' * 50 + '👨\u200d👩\u200d👧' + 'y' * 50
+    benchmark(wcwidth.grapheme_boundary_before, text, 55)
+
+
+def test_iter_graphemes_reverse_short(benchmark):
+    """Benchmark iter_graphemes_reverse() with short string."""
+    text = 'café\u0301 🇫🇷!'
+    benchmark(lambda: list(wcwidth.iter_graphemes_reverse(text)))
+
+
+def test_iter_graphemes_reverse_long(benchmark):
+    """Benchmark iter_graphemes_reverse() with long string."""
+    text = 'The quick brown 🦊 jumps over the lazy 🐕. ' * 5
+    benchmark(lambda: list(wcwidth.iter_graphemes_reverse(text)))
+
+
 def test_ljust_ascii(benchmark):
     """Benchmark ljust() with ASCII string."""
     benchmark(wcwidth.ljust, 'hello', 20)
diff --git a/contrib/python/wcwidth/py3/tests/test_grapheme.py b/contrib/python/wcwidth/py3/tests/test_grapheme.py
index f344ad32fde..d2cfa86c1c8 100644
--- a/contrib/python/wcwidth/py3/tests/test_grapheme.py
+++ b/contrib/python/wcwidth/py3/tests/test_grapheme.py
@@ -6,7 +6,7 @@ import os
 import pytest
 
 # local
-from wcwidth import iter_graphemes
+from wcwidth import iter_graphemes, iter_graphemes_reverse, grapheme_boundary_before
 
 try:
     chr(0x2fffe)
@@ -145,3 +145,112 @@ def test_wide_unicode_graphemes(input_str, expected):
 def test_unicode_grapheme_break_test(input_str, expected):
     """Validate against official Unicode GraphemeBreakTest.txt."""
     assert list(iter_graphemes(input_str)) == expected
+
+
+# Prepend: Arabic Number Sign
+PREPEND_CHAR = '\u0600'
+# Multiple combining marks: e + acute + grave
+MULTI_COMBINE = 'e\u0301\u0300'
+
+
+# grapheme_boundary_before(text, pos) returns start of grapheme cluster before pos.
+# (text, pos, expected): pos=search from here, expected=where cluster starts
+@pytest.mark.parametrize(("text", "pos", "expected"), [
+    # 'abc': 0=a, 1=b, 2=c
+    ('abc', 3, 2),  # from end -> 'c' at 2
+    ('abc', 2, 1),  # from 'c' -> 'b' at 1
+    ('abc', 1, 0),  # from 'b' -> 'a' at 0
+    # 'a\r\nb': CRLF is one cluster (GB3)
+    ('a\r\nb', 3, 1),  # from 'b' -> '\r\n' at 1
+    # 'café': e + combining acute is one cluster (GB9)
+    ('cafe\u0301', 5, 3),  # from end -> 'é' at 3
+    ('cafe\u0301', 4, 3),  # from acute -> still 'é' at 3
+    # Multiple combining marks: e + acute + grave (GB9)
+    ('a' + MULTI_COMBINE + 'b', 4, 1),  # from 'b' -> e+marks at 1
+    # Prepend + char is one cluster (GB9b)
+    (PREPEND_CHAR + 'a', 2, 0),  # whole cluster
+    # Prepend + Control: control breaks (GB4)
+    (PREPEND_CHAR + '\n', 2, 1),  # '\n' separate at 1
+    # C1 control (NEL, 0x85) stops backward scan in _find_cluster_start (GB4)
+    ('X\x85\u0301', 3, 2),
+])
+def test_grapheme_boundary_before_basic(text, pos, expected):
+    """Basic grapheme_boundary_before tests."""
+    assert grapheme_boundary_before(text, pos) == expected
+
+
+@pytest.mark.skipif(NARROW_ONLY, reason="requires wide Unicode")
+@pytest.mark.parametrize(("text", "pos", "expected"), [
+    # 'Hi 👋🏻!': 0=H,1=i,2=space,3=wave,4=skin,5=!; wave+skin is one cluster
+    ('Hi \U0001F44B\U0001F3FB!', 6, 5),  # from end -> '!' at 5
+    ('Hi \U0001F44B\U0001F3FB!', 5, 3),  # from '!' -> wave+skin at 3
+    ('Hi \U0001F44B\U0001F3FB!', 3, 2),  # from wave -> space at 2
+    # 'a🇺🇸b': 0=a,1-2=flag,3=b; flag is one cluster (GB12/13)
+    ('a' + FLAG_US + 'b', 4, 3),  # from end -> 'b' at 3
+    ('a' + FLAG_US + 'b', 3, 1),  # from 'b' -> flag at 1
+    # Three RIs (🇺🇸🇦): flag + solo RI
+    (FLAG_US + RI_A, 3, 2),  # from end -> solo RI at 2
+    (FLAG_US + RI_A, 2, 0),  # from solo -> flag at 0
+    # 'a👨‍👩‍👧b': 0=a,1-5=family,6=b; ZWJ sequence is one cluster (GB11)
+    ('a' + FAMILY + 'b', 7, 6),  # from end -> 'b' at 6
+    ('a' + FAMILY + 'b', 6, 1),  # from 'b' -> family at 1
+])
+def test_grapheme_boundary_before_unicode(text, pos, expected):
+    """grapheme_boundary_before with emoji and wide Unicode."""
+    assert grapheme_boundary_before(text, pos) == expected
+
+
+@pytest.mark.parametrize(("input_str", "expected"), [
+    ('', []),
+    ('abc', ['c', 'b', 'a']),
+    # café with combining mark mixed with CRLF
+    ('cafe\u0301\r\nok', ['k', 'o', '\r\n', 'e\u0301', 'f', 'a', 'c']),
+])
+def test_iter_graphemes_reverse_basic(input_str, expected):
+    """Basic iter_graphemes_reverse tests."""
+    assert list(iter_graphemes_reverse(input_str)) == expected
+
+
+@pytest.mark.skipif(NARROW_ONLY, reason="requires wide Unicode")
+@pytest.mark.parametrize(("input_str", "expected"), [
+    # Multiple emoji types in one string
+    ('cafe\u0301 ' + WAVE_SKIN + ' ' + FLAG_US + '!',
+     ['!', FLAG_US, ' ', WAVE_SKIN, ' ', 'e\u0301', 'f', 'a', 'c']),
+    # Two families
+    (FAMILY + FAMILY, [FAMILY, FAMILY]),
+    # Flag + solo RI + text
+    ('Hi' + FLAG_US + RI_A + '!', ['!', RI_A, FLAG_US, 'i', 'H']),
+])
+def test_iter_graphemes_reverse_unicode(input_str, expected):
+    """iter_graphemes_reverse with wide Unicode."""
+    assert list(iter_graphemes_reverse(input_str)) == expected
+
+
+@pytest.mark.skipif(NARROW_ONLY, reason="requires wide Unicode")
+@pytest.mark.parametrize(("input_str", "expected"), read_grapheme_break_test())
+def test_grapheme_roundtrip_consistency(input_str, expected):
+    """Forward and reverse iteration produce identical boundaries."""
+    forward = list(iter_graphemes(input_str))
+    reverse = list(iter_graphemes_reverse(input_str))[::-1]
+    assert forward == reverse
+
+
+def test_grapheme_boundary_before_edge_cases():
+    """Edge cases for grapheme_boundary_before."""
+    assert grapheme_boundary_before('abc', 0) == 0
+    assert grapheme_boundary_before('abc', 100) == 2  # pos > len clamps
+    assert grapheme_boundary_before('', 0) == 0
+
+
+def test_iter_graphemes_reverse_edge_cases():
+    """Edge cases for iter_graphemes_reverse."""
+    assert list(iter_graphemes_reverse('abcdef', start=2, end=5)) == ['e', 'd', 'c']
+    assert list(iter_graphemes_reverse('abc', start=0, end=100)) == ['c', 'b', 'a']
+    assert not list(iter_graphemes_reverse('abc', start=5))
+    assert not list(iter_graphemes_reverse('abc', start=2, end=2))
+    # PREPEND + char is one grapheme (GB9b), so start=1 yields nothing (won't split)
+    assert not list(iter_graphemes_reverse(PREPEND_CHAR + 'a', start=1))
+    # But start=0 yields the full grapheme
+    assert list(iter_graphemes_reverse(PREPEND_CHAR + 'a', start=0)) == [PREPEND_CHAR + 'a']
+    # Negative start is clamped to 0
+    assert list(iter_graphemes_reverse('abc', start=-5)) == ['c', 'b', 'a']
diff --git a/contrib/python/wcwidth/py3/tests/test_textwrap.py b/contrib/python/wcwidth/py3/tests/test_textwrap.py
index c2f28bffe1a..fc15f1917f9 100644
--- a/contrib/python/wcwidth/py3/tests/test_textwrap.py
+++ b/contrib/python/wcwidth/py3/tests/test_textwrap.py
@@ -12,6 +12,7 @@ from wcwidth import iter_sequences
 from wcwidth.textwrap import SequenceTextWrapper, wrap
 
 SGR_RED = '\x1b[31m'
+SGR_BLUE = '\x1b[34m'
 SGR_BOLD = '\x1b[1m'
 SGR_RESET = '\x1b[0m'
 ATTRS = ('\x1b[31m', '\x1b[34m', '\x1b[4m', '\x1b[7m', '\x1b[41m', '\x1b[37m', '\x1b[107m')
@@ -203,7 +204,7 @@ SEQUENCE_CASES = [
     # Empty/adjacent sequences
     (f'{SGR_RED}{SGR_RESET}', 10, [f'{SGR_RED}{SGR_RESET}']),
     (f'hello {SGR_RED}{SGR_RESET}world', 6, ['hello', f'{SGR_RED}{SGR_RESET}world']),
-    # OSC hyperlinks
+    # OSC hyperlinks (with space separator)
     (f'{OSC_HYPERLINK} text', 5, [OSC_HYPERLINK, 'text']),
     # CSI cursor sequences
     (f'{CSI_CURSOR}text here', 10, [f'{CSI_CURSOR}text', 'here']),
@@ -262,3 +263,69 @@ TABSIZE_WIDE_CASES = [
 def test_wrap_tabsize_wide_chars(text, w, tabsize, expected):
     """Verify tabsize respects wide character column positions."""
     assert wrap(text, w, tabsize=tabsize) == expected
+
+
+OSC_START_ST = '\x1b]8;;http://example.com\x1b\\'
+OSC_END_ST = '\x1b]8;;\x1b\\'
+OSC_START_BEL = '\x1b]8;;http://example.com\x07'
+OSC_END_BEL = '\x1b]8;;\x07'
+
+HYPERLINK_WORD_BOUNDARY_CASES = [
+    (   # standard, ST-variant,
+        f'{OSC_START_ST}link{OSC_END_ST}more',
+        5,
+        [f'{OSC_START_ST}link{OSC_END_ST}', 'more'],
+    ),
+    (   # BEL-variant,
+        f'{OSC_START_BEL}link{OSC_END_BEL}more',
+        5,
+        [f'{OSC_START_BEL}link{OSC_END_BEL}', 'more'],
+    ),
+    (   # hyperlink breaks after word, 'prefix',
+        f'prefix{OSC_START_ST}link{OSC_END_ST}',
+        6,
+        ['prefix', f'{OSC_START_ST}link{OSC_END_ST}'],
+    ),
+    (
+        f'prefix{OSC_START_BEL}link{OSC_END_BEL}',
+        6,
+        ['prefix', f'{OSC_START_BEL}link{OSC_END_BEL}'],
+    ),
+    (   # hyperlink breaks before following, 'suffix',
+        f'prefix{OSC_START_ST}link{OSC_END_ST}suffix',
+        6,
+        ['prefix', f'{OSC_START_ST}link{OSC_END_ST}', 'suffix'],
+    ),
+    (
+        f'prefix{OSC_START_BEL}link{OSC_END_BEL}suffix',
+        6,
+        ['prefix', f'{OSC_START_BEL}link{OSC_END_BEL}', 'suffix'],
+    ),
+    (   # hyperlink *surrounded* by SGR attributes
+        f'foo {SGR_RED}{OSC_START_ST}link{OSC_END_ST}{SGR_RESET} bar',
+        6,
+        ['foo', f'{SGR_RED}{OSC_START_ST}link{OSC_END_ST}{SGR_RESET}', 'bar'],
+    ),
+    (
+        f'foo {SGR_RED}{OSC_START_BEL}link{OSC_END_BEL}{SGR_RESET} bar',
+        6,
+        ['foo', f'{SGR_RED}{OSC_START_BEL}link{OSC_END_BEL}{SGR_RESET}', 'bar'],
+    ),
+    (   # hyperlink *containing* SGR attributes
+        f'foo {OSC_START_ST}{SGR_RED}link{SGR_RESET}{OSC_END_ST} bar',
+        6,
+        ['foo', f'{OSC_START_ST}{SGR_RED}link{SGR_RESET}{OSC_END_ST}', 'bar'],
+    ),
+    (
+        f'foo {OSC_START_BEL}{SGR_RED}link{SGR_RESET}{OSC_END_BEL} bar',
+        6,
+        ['foo', f'{OSC_START_BEL}{SGR_RED}link{SGR_RESET}{OSC_END_BEL}', 'bar'],
+    ),
+]
+
+
+@pytest.mark.parametrize('text,w,expected', HYPERLINK_WORD_BOUNDARY_CASES)
+def test_wrap_hyperlink_word_boundary(text, w, expected):
+    """OSC hyperlink sequences should act as word boundaries."""
+    result = wrap(text, w)
+    assert result == expected
author	robot-piglet <[email protected]>	2026-02-11 10:55:54 +0300
committer	robot-piglet <[email protected]>	2026-02-11 11:24:08 +0300
commit	9b5f29efa00bba424cd32471a95ececc583fe046 (patch)
tree	58936c6fc2147c49fc2a4aec657f63fb6f789336 /contrib/python/wcwidth/py3/tests
parent	df75a44af0e3c0cfce907e22f61d6c91fc3bbc39 (diff)