"""Tests for grapheme cluster segmentation."""

# std imports
import os

# 3rd party
import pytest

# local
from wcwidth import iter_graphemes, iter_graphemes_reverse, grapheme_boundary_before

# Probe whether this interpreter can represent code points beyond the Basic
# Multilingual Plane; chr() raises ValueError when the argument is out of range.
try:
    chr(0x2fffe)
except ValueError:
    NARROW_ONLY = True
else:
    NARROW_ONLY = False


def parse_grapheme_break_test_line(line):
    """
    Turn one GraphemeBreakTest.txt record into an input string and its clusters.

    Everything from the first ``#`` onward is discarded.  In the remaining
    text, ``÷`` marks a cluster boundary, ``×`` marks "no boundary", and every
    other whitespace-separated token is read as a hexadecimal code point.
    Tokens that are not valid hexadecimal are silently skipped.

    :param line: a single line of text from the test file.
    :returns: ``(input_str, expected_clusters)`` where ``input_str`` is the
        concatenation of all clusters, or ``(None, None)`` when the line
        carries no code points.
    """
    payload = line.partition('#')[0].strip()
    if not payload:
        return None, None

    # The last list is always the cluster currently being filled.
    groups = [[]]
    for tok in payload.split():
        if tok == '×':
            continue
        if tok == '÷':
            # Open a new cluster only if the current one holds something, so
            # leading or repeated break marks never produce empty clusters.
            if groups[-1]:
                groups.append([])
            continue
        try:
            codepoint = int(tok, 16)
        except ValueError:
            continue
        groups[-1].append(codepoint)

    expected_clusters = [''.join(map(chr, group)) for group in groups if group]
    if not expected_clusters:
        return None, None

    return ''.join(expected_clusters), expected_clusters


def read_grapheme_break_test():
    """
    Load GraphemeBreakTest.txt as a list of ``pytest.param`` cases.

    The data file is looked up next to this module's source path as resolved
    by ``yatest.common.source_path``.  Each usable line becomes
    ``pytest.param(input_str, expected, id="line<N>")`` where ``N`` is the
    1-based line number in the file.

    :returns: list of ``pytest.param`` objects; an empty list when the data
        file does not exist.
    """
    import yatest.common as yc
    source_dir = os.path.dirname(yc.source_path(__file__))
    test_file = os.path.join(source_dir, 'GraphemeBreakTest.txt')
    if not os.path.exists(test_file):
        return []

    params = []
    with open(test_file, encoding='utf-8') as handle:
        for lineno, raw in enumerate(handle, start=1):
            text = raw.strip()
            # Blank lines and whole-line comments carry no test data.
            if text and not text.startswith('#'):
                input_str, expected = parse_grapheme_break_test_line(text)
                if input_str is not None:
                    params.append(pytest.param(input_str, expected, id=f"line{lineno}"))
    return params


@pytest.mark.parametrize(("input_str", "expected"), [
    # empty and plain ASCII input
    ('', []),
    ('a', ['a']),
    ('abc', ['a', 'b', 'c']),
    # a combining acute stays attached to its base letter
    ('cafe\u0301', ['c', 'a', 'f', 'e\u0301']),
    # CR LF forms a single cluster
    ('\r\n', ['\r\n']),
    ('ok\r\nok', ['o', 'k', '\r\n', 'o', 'k']),
    # a lone CR or LF, and repeated CR, each stand alone
    ('\r', ['\r']),
    ('ok\rok', ['o', 'k', '\r', 'o', 'k']),
    ('\n', ['\n']),
    ('ok\nok', ['o', 'k', '\n', 'o', 'k']),
    ('\r\r', ['\r', '\r']),
    ('ok\r\rok', ['o', 'k', '\r', '\r', 'o', 'k']),
])
def test_core_grapheme(input_str, expected):
    """Check segmentation of ASCII, combining-mark and CR/LF input."""
    clusters = list(iter_graphemes(input_str))
    assert clusters == expected


@pytest.mark.parametrize(("input_str", "start", "end", "expected"), [
    # start only, end only, then both bounds
    ('abcdef', 2, None, ['c', 'd', 'e', 'f']),
    ('abcdef', 0, 4, ['a', 'b', 'c', 'd']),
    ('abcdef', 1, 4, ['b', 'c', 'd']),
    # a start beyond the text yields nothing
    ('abc', 10, None, []),
    # an end beyond the text yields everything available
    ('abc', 0, 10, ['a', 'b', 'c']),
])
def test_iter_graphemes_slice(input_str, start, end, expected):
    """Check that iter_graphemes honours its start/end bounds."""
    bounds = {'start': start, 'end': end}
    clusters = list(iter_graphemes(input_str, **bounds))
    assert clusters == expected


# Sample multi-code-point sequences shared by the parametrized cases in this
# module; each one is expected to segment as shown in those cases.
HANGUL_LV = '\u1100\u1161'  # Hangul leading-consonant jamo + vowel jamo
HANGUL_LVT = '\uAC00\u11A8'  # precomposed Hangul syllable + trailing-consonant jamo
FLAG_US = '\U0001F1FA\U0001F1F8'  # regional indicators U + S
FLAG_AU = '\U0001F1E6\U0001F1FA'  # regional indicators A + U
RI_A = '\U0001F1E6'  # a single, unpaired regional indicator A
FAMILY = '\U0001F468\u200D\U0001F469\u200D\U0001F467'  # three emoji joined by U+200D (ZWJ)
WAVE_SKIN = '\U0001F44B\U0001F3FB'  # waving hand + skin-tone modifier
HEART_EMOJI = '\u2764\uFE0F'  # heart + variation selector-16 (U+FE0F)


@pytest.mark.skipif(NARROW_ONLY, reason="requires wide Unicode")
@pytest.mark.parametrize(("input_str", "expected"), [
    # Each sample is tried alone and then embedded between 'ok' ... 'ok'.
    # Hangul jamo sequences
    (HANGUL_LV, [HANGUL_LV]),
    ('ok' + HANGUL_LV + 'ok', ['o', 'k', HANGUL_LV, 'o', 'k']),
    (HANGUL_LVT, [HANGUL_LVT]),
    ('ok' + HANGUL_LVT + 'ok', ['o', 'k', HANGUL_LVT, 'o', 'k']),
    # regional indicators pair up two at a time; an odd one is left alone
    (FLAG_US, [FLAG_US]),
    ('ok' + FLAG_US + 'ok', ['o', 'k', FLAG_US, 'o', 'k']),
    (FLAG_US + RI_A, [FLAG_US, RI_A]),
    ('ok' + FLAG_US + RI_A + 'ok', ['o', 'k', FLAG_US, RI_A, 'o', 'k']),
    (FLAG_US + FLAG_AU, [FLAG_US, FLAG_AU]),
    ('ok' + FLAG_US + FLAG_AU + 'ok', ['o', 'k', FLAG_US, FLAG_AU, 'o', 'k']),
    # ZWJ sequence, skin-tone modifier and variation selector
    (FAMILY, [FAMILY]),
    ('ok' + FAMILY + 'ok', ['o', 'k', FAMILY, 'o', 'k']),
    (WAVE_SKIN, [WAVE_SKIN]),
    ('ok' + WAVE_SKIN + 'ok', ['o', 'k', WAVE_SKIN, 'o', 'k']),
    (HEART_EMOJI, [HEART_EMOJI]),
    ('ok' + HEART_EMOJI + 'ok', ['o', 'k', HEART_EMOJI, 'o', 'k']),
])
def test_wide_unicode_graphemes(input_str, expected):
    """Check segmentation of Hangul, flag and emoji sequences."""
    clusters = list(iter_graphemes(input_str))
    assert clusters == expected


@pytest.mark.skipif(NARROW_ONLY, reason="requires wide Unicode")
# Ask the loader itself whether any data is available.  The loader resolves
# GraphemeBreakTest.txt through yatest's source_path(), so checking
# os.path.dirname(__file__) here could skip the test even though the
# parametrization below found the file.
@pytest.mark.skipif(not read_grapheme_break_test(),
                    reason="GraphemeBreakTest.txt is missing; run bin/update-tables.py")
@pytest.mark.parametrize(("input_str", "expected"), read_grapheme_break_test())
def test_unicode_grapheme_break_test(input_str, expected):
    """
    Validate iter_graphemes against the official Unicode GraphemeBreakTest.txt.

    :param input_str: the concatenated code points of one test-file line.
    :param expected: the clusters that line declares, in order.
    """
    assert list(iter_graphemes(input_str)) == expected


# Prepend: U+0600 ARABIC NUMBER SIGN; the cases in this module expect it to
# join the character that follows it (GB9b) unless that character is a control.
PREPEND_CHAR = '\u0600'
# Multiple combining marks on one base letter: e + acute + grave
MULTI_COMBINE = 'e\u0301\u0300'


# grapheme_boundary_before(text, pos) gives the index at which the grapheme
# cluster preceding pos begins.
# Each case is (text, pos, expected): search backward from pos, expect the
# cluster to start at expected.
@pytest.mark.parametrize(("text", "pos", "expected"), [
    # 'abc' has a=0, b=1, c=2
    ('abc', 3, 2),  # end of text -> 'c' starts at 2
    ('abc', 2, 1),  # at 'c' -> 'b' starts at 1
    ('abc', 1, 0),  # at 'b' -> 'a' starts at 0
    # 'a\r\nb': CR LF counts as one cluster (GB3)
    ('a\r\nb', 3, 1),  # at 'b' -> '\r\n' starts at 1
    # 'café': e followed by a combining acute is one cluster (GB9)
    ('cafe\u0301', 5, 3),  # end of text -> 'é' starts at 3
    ('cafe\u0301', 4, 3),  # at the acute -> still 'é' at 3
    # e + acute + grave: several combining marks on one base (GB9)
    ('a' + MULTI_COMBINE + 'b', 4, 1),  # at 'b' -> e+marks start at 1
    # Prepend followed by a character is one cluster (GB9b)
    (PREPEND_CHAR + 'a', 2, 0),  # the whole string is the cluster
    # Prepend followed by a control: the control breaks away (GB4)
    (PREPEND_CHAR + '\n', 2, 1),  # '\n' is its own cluster at 1
    # C1 control (NEL, 0x85) stops backward scan in _find_cluster_start (GB4)
    ('X\x85\u0301', 3, 2),
])
def test_grapheme_boundary_before_basic(text, pos, expected):
    """Check grapheme_boundary_before on ASCII, CRLF, combining and Prepend input."""
    boundary = grapheme_boundary_before(text, pos)
    assert boundary == expected


@pytest.mark.skipif(NARROW_ONLY, reason="requires wide Unicode")
@pytest.mark.parametrize(("text", "pos", "expected"), [
    # 'Hi 👋🏻!' has H=0, i=1, space=2, wave=3, skin=4, !=5; wave+skin is one cluster
    ('Hi \U0001F44B\U0001F3FB!', 6, 5),  # end of text -> '!' starts at 5
    ('Hi \U0001F44B\U0001F3FB!', 5, 3),  # at '!' -> wave+skin starts at 3
    ('Hi \U0001F44B\U0001F3FB!', 3, 2),  # at wave -> space starts at 2
    # 'a🇺🇸b' has a=0, flag=1-2, b=3; the flag is one cluster (GB12/13)
    ('a' + FLAG_US + 'b', 4, 3),  # end of text -> 'b' starts at 3
    ('a' + FLAG_US + 'b', 3, 1),  # at 'b' -> flag starts at 1
    # Three regional indicators (🇺🇸🇦): one flag followed by a solo RI
    (FLAG_US + RI_A, 3, 2),  # end of text -> solo RI starts at 2
    (FLAG_US + RI_A, 2, 0),  # at solo RI -> flag starts at 0
    # 'a👨‍👩‍👧b' has a=0, family=1-5, b=6; the ZWJ sequence is one cluster (GB11)
    ('a' + FAMILY + 'b', 7, 6),  # end of text -> 'b' starts at 6
    ('a' + FAMILY + 'b', 6, 1),  # at 'b' -> family starts at 1
])
def test_grapheme_boundary_before_unicode(text, pos, expected):
    """Check grapheme_boundary_before on emoji, flag and ZWJ sequences."""
    boundary = grapheme_boundary_before(text, pos)
    assert boundary == expected


@pytest.mark.parametrize(("input_str", "expected"), [
    ('', []),
    ('abc', ['c', 'b', 'a']),
    # a combining mark and a CRLF in the same string
    ('cafe\u0301\r\nok', ['k', 'o', '\r\n', 'e\u0301', 'f', 'a', 'c']),
])
def test_iter_graphemes_reverse_basic(input_str, expected):
    """Check that iter_graphemes_reverse yields clusters last-to-first."""
    clusters = list(iter_graphemes_reverse(input_str))
    assert clusters == expected


@pytest.mark.skipif(NARROW_ONLY, reason="requires wide Unicode")
@pytest.mark.parametrize(("input_str", "expected"), [
    # Combining mark, skin-tone emoji and flag in one string
    ('cafe\u0301 ' + WAVE_SKIN + ' ' + FLAG_US + '!',
     ['!', FLAG_US, ' ', WAVE_SKIN, ' ', 'e\u0301', 'f', 'a', 'c']),
    # Two ZWJ families back to back
    (FAMILY + FAMILY, [FAMILY, FAMILY]),
    # Text, then a flag, then a solo regional indicator, then text
    ('Hi' + FLAG_US + RI_A + '!', ['!', RI_A, FLAG_US, 'i', 'H']),
])
def test_iter_graphemes_reverse_unicode(input_str, expected):
    """Check reverse iteration over emoji, flag and ZWJ sequences."""
    clusters = list(iter_graphemes_reverse(input_str))
    assert clusters == expected


@pytest.mark.skipif(NARROW_ONLY, reason="requires wide Unicode")
# Ask the loader itself whether any data is available.  The loader resolves
# GraphemeBreakTest.txt through yatest's source_path(), so checking
# os.path.dirname(__file__) here could skip the test even though the
# parametrization below found the file.
@pytest.mark.skipif(not read_grapheme_break_test(),
                    reason="GraphemeBreakTest.txt is missing; run bin/update-tables.py")
@pytest.mark.parametrize(("input_str", "expected"), read_grapheme_break_test())
def test_grapheme_roundtrip_consistency(input_str, expected):
    """
    Forward and reverse iteration produce identical boundaries.

    :param input_str: the concatenated code points of one test-file line.
    :param expected: unused here; it is present only because the shared
        parametrization supplies it.
    """
    forward = list(iter_graphemes(input_str))
    # Reverse iteration yields the last cluster first; flip it for comparison.
    reverse = list(iter_graphemes_reverse(input_str))[::-1]
    assert forward == reverse


def test_grapheme_boundary_before_edge_cases():
    """Check grapheme_boundary_before at position 0, past the end, and on ''."""
    cases = (
        ('abc', 0, 0),
        ('abc', 100, 2),  # pos > len clamps
        ('', 0, 0),
    )
    for text, pos, want in cases:
        assert grapheme_boundary_before(text, pos) == want


def test_iter_graphemes_reverse_edge_cases():
    """Check iter_graphemes_reverse with unusual start/end bounds."""
    def rev(text, **bounds):
        # Collect the reverse iteration into a list for comparison.
        return list(iter_graphemes_reverse(text, **bounds))

    # Ordinary window, and an end beyond the text
    assert rev('abcdef', start=2, end=5) == ['e', 'd', 'c']
    assert rev('abc', start=0, end=100) == ['c', 'b', 'a']
    # A start beyond the text, or an empty window, yields nothing
    assert rev('abc', start=5) == []
    assert rev('abc', start=2, end=2) == []
    # PREPEND + char is a single grapheme (GB9b): start=1 would split it, so
    # nothing is yielded ...
    assert rev(PREPEND_CHAR + 'a', start=1) == []
    # ... whereas start=0 yields the whole grapheme
    assert rev(PREPEND_CHAR + 'a', start=0) == [PREPEND_CHAR + 'a']
    # A negative start is clamped to 0
    assert rev('abc', start=-5) == ['c', 'b', 'a']