diff options
author | armenqa <armenqa@yandex-team.com> | 2024-01-19 12:23:50 +0300 |
---|---|---|
committer | armenqa <armenqa@yandex-team.com> | 2024-01-19 13:10:03 +0300 |
commit | 2de0149d0151c514b22bca0760b95b26c9b0b578 (patch) | |
tree | 2bfed9f3bce7e643ddf048bb61ce3dc0a714bcc2 /library/python/strings | |
parent | a8c06d218f12b2406fbce24d194885c5d7b68503 (diff) | |
download | ydb-2de0149d0151c514b22bca0760b95b26c9b0b578.tar.gz |
feat contrib: aiogram 3
Relates: https://st.yandex-team.ru/, https://st.yandex-team.ru/
Diffstat (limited to 'library/python/strings')
-rw-r--r-- | library/python/strings/__init__.py | 3 | ||||
-rw-r--r-- | library/python/strings/strings.py | 117 | ||||
-rw-r--r-- | library/python/strings/ut/test_strings.py | 150 |
3 files changed, 270 insertions, 0 deletions
diff --git a/library/python/strings/__init__.py b/library/python/strings/__init__.py index ae27ddbdae..cd1084b0f0 100644 --- a/library/python/strings/__init__.py +++ b/library/python/strings/__init__.py @@ -12,10 +12,13 @@ from .strings import ( guess_default_encoding, left_strip, locale_encoding, + parse_qs_binary, + parse_qsl_binary, stringize_deep, to_basestring, to_str, to_unicode, truncate, unicodize_deep, + unquote_binary, ) diff --git a/library/python/strings/strings.py b/library/python/strings/strings.py index f5fa2d32c6..d068f30b76 100644 --- a/library/python/strings/strings.py +++ b/library/python/strings/strings.py @@ -176,3 +176,120 @@ def fix_utf8(data): # remove destroyed symbol code udata = six.ensure_text(data, 'utf-8', 'ignore') return six.ensure_str(udata, 'utf-8', errors='ignore') + + +_hexdig = "0123456789ABCDEFabcdef" +_hextobyte = { + (a + b).encode(): bytes.fromhex(a + b) if six.PY3 else (a + b).decode("hex") for a in _hexdig for b in _hexdig +} + + +def parse_qs_binary(qs, keep_blank_values=False, strict_parsing=False, max_num_fields=None, separator=b'&'): + """Parse a query like original `parse_qs` from `urlparse`, `urllib.parse`, but query given as a bytes argument. + + Arguments: + + qs: percent-encoded query string to be parsed + + keep_blank_values: flag indicating whether blank values in + percent-encoded queries should be treated as blank byte strings. + A true value indicates that blanks should be retained as + blank byte strings. The default false value indicates that + blank values are to be ignored and treated as if they were + not included. + + strict_parsing: flag indicating what to do with parsing errors. + If false (the default), errors are silently ignored. + If true, errors raise a ValueError exception. + + max_num_fields: int. If set, then throws a ValueError if there + are more than n fields read by parse_qsl_binary(). + + separator: bytes. The symbol to use for separating the query arguments. + Defaults to &. 
+ + Returns a dictionary. + """ + parsed_result = {} + pairs = parse_qsl_binary(qs, keep_blank_values, strict_parsing, max_num_fields=max_num_fields, separator=separator) + for name, value in pairs: + if name in parsed_result: + parsed_result[name].append(value) + else: + parsed_result[name] = [value] + return parsed_result + + +def parse_qsl_binary(qs, keep_blank_values=False, strict_parsing=False, max_num_fields=None, separator=b'&'): + """Parse a query like original `parse_qsl` from `urlparse`, `urllib.parse`, but query given as a bytes argument. + + Arguments: + + qs: percent-encoded query bytes to be parsed + + keep_blank_values: flag indicating whether blank values in + percent-encoded queries should be treated as blank byte strings. + A true value indicates that blanks should be retained as blank + byte strings. The default false value indicates that blank values + are to be ignored and treated as if they were not included. + + strict_parsing: flag indicating what to do with parsing errors. If + false (the default), errors are silently ignored. If true, + errors raise a ValueError exception. + + max_num_fields: int. If set, then throws a ValueError + if there are more than n fields read by parse_qsl_binary(). + + separator: bytes. The symbol to use for separating the query arguments. + Defaults to &. + + Returns a list of (name, value) pairs of byte strings. 
+ """ + + if max_num_fields is not None: + num_fields = 1 + qs.count(separator) if qs else 0 + if max_num_fields < num_fields: + raise ValueError('Max number of fields exceeded') + + r = [] + query_args = qs.split(separator) if qs else [] + for name_value in query_args: + if not name_value and not strict_parsing: + continue + nv = name_value.split(b'=', 1) + + if len(nv) != 2: + if strict_parsing: + raise ValueError("bad query field: %r" % (name_value,)) + # Handle case of a control-name with no equal sign + if keep_blank_values: + nv.append(b'') + else: + continue + if len(nv[1]) or keep_blank_values: + name = nv[0].replace(b'+', b' ') + name = unquote_binary(name) + value = nv[1].replace(b'+', b' ') + value = unquote_binary(value) + r.append((name, value)) + return r + + +def unquote_binary(string): + """Replace %xx escapes in a byte string by their single-byte equivalent. + Each valid percent-encoded sequence is replaced by the byte it encodes; + invalid sequences (a % not followed by two hex digits) are left unchanged. 
+ + unquote('abc%20def') -> 'abc def' + unquote('abc%FFdef') -> 'abc\xffdef' + unquote('%no') -> '%no' + """ + bits = string.split(b"%") + if len(bits) == 1: + return bits[0] + + res = [bits[0]] + for item in bits[1:]: + res.append(_hextobyte.get(item[:2], b"%")) + res.append(item if res[-1] == b"%" else item[2:]) + return b"".join(res) diff --git a/library/python/strings/ut/test_strings.py b/library/python/strings/ut/test_strings.py index 6177c10b25..d2bfe6ed8b 100644 --- a/library/python/strings/ut/test_strings.py +++ b/library/python/strings/ut/test_strings.py @@ -5,6 +5,11 @@ import six from library.python import strings +if six.PY3: + from urllib.parse import parse_qs, parse_qsl, unquote +else: + from urlparse import parse_qs, parse_qsl, unquote + class Convertible(object): text = u'текст' @@ -272,3 +277,148 @@ def test_truncate_utf_8_text_wrong_limit(): with pytest.raises(AssertionError): strings.truncate("hello", 4, msg="long msg") + + +@pytest.mark.parametrize( + "given,expected", + [ + ( + b"a=a", + [(b"a", b"a")], + ), + ( + b"a=a&a=b", + [(b"a", b"a"), (b"a", b"b")], + ), + ( + b"a=a+&b=b++", + [(b"a", b"a "), (b"b", b"b ")], + ), + ( + b"a=a&&b=b", + [(b"a", b"a"), (b"b", b"b")], + ), + ( + b"a=a&b=%%3C%2Fscript%3E", + [(b"a", b"a"), (b"b", b"%</script>")], + ), + ( + b"clid=%EF%BB%BF123", + [(b"clid", b"\xef\xbb\xbf123")], + ), + ], +) +def test_parse_qsl(given, expected): + assert strings.parse_qsl_binary(given) == expected + + +@pytest.mark.parametrize( + "given,expected,keep_blank_values", + [ + (b"a=", {}, False), + (b"a=", {b"a": [b""]}, True), + (b"a", {}, False), + (b"a", {b"a": [b""]}, True), + (b"a=a&a=b", {b"a": [b"a", b"b"]}, False), + ], +) +def test_parse_qs_with_keep_blank_values(given, expected, keep_blank_values): + assert strings.parse_qs_binary(given, keep_blank_values=keep_blank_values) == expected + + +@pytest.mark.parametrize( + "given,strict_parsing", + [(b"a", True)], +) +def test_parse_qs_with_strict_parsing(given, 
strict_parsing): + with pytest.raises(ValueError, match="bad query field.*"): + strings.parse_qs_binary(given, strict_parsing=strict_parsing) + + with pytest.raises(ValueError, match="bad query field.*"): + parse_qs(given, strict_parsing=strict_parsing) + + +@pytest.mark.parametrize( + "given,max_num_fields", + [(b"a=a&b=bb&c=c", 2)], +) +def test_parse_qs_with_max_num_fields(given, max_num_fields): + with pytest.raises(ValueError, match="Max number of fields exceeded"): + strings.parse_qs_binary(given, max_num_fields=max_num_fields) + + with pytest.raises(ValueError, match="Max number of fields exceeded"): + parse_qs(given, max_num_fields=max_num_fields) + + +@pytest.mark.parametrize( + "given,expected", + [ + ( + b"", + b"", + ), + ( + b"without percent", + b"without percent", + ), + ( + b"%61 and %62", + b"a and b", + ), + ( + b"%FF can't %unparse char%", + b"\xff can't %unparse char%", + ), + ], +) +def test_unquote(given, expected): + assert strings.unquote_binary(given) == expected + + +URL_PARAMS = [ + (b"a=", False, False, None), + (b"a=a&a=b", False, False, None), + (b"a=a&a=b&b=b", False, False, None), + (b"a=a&&b=b", False, False, None), + (b"a=a&b=%%3C%2Fscript%3E", False, False, None), + (b"a=", True, False, None), + (b"a", False, False, None), + (b"a", True, False, None), +] + + +@pytest.mark.parametrize( + "string,keep_blank_values,strict_parsing,max_num_fields", + URL_PARAMS if six.PY3 else URL_PARAMS + [(b"clid=%EF%BB%BF123", False, False, None)], +) +def test_parse_qs_compatibility(string, keep_blank_values, strict_parsing, max_num_fields): + for string_method, urlparse_method in (strings.parse_qsl_binary, parse_qsl), (strings.parse_qs_binary, parse_qs): + string_res = string_method( + string, + keep_blank_values=keep_blank_values, + strict_parsing=strict_parsing, + max_num_fields=max_num_fields, + ) + urlparse_res = urlparse_method( + string, + keep_blank_values=keep_blank_values, + strict_parsing=strict_parsing, + max_num_fields=max_num_fields, 
+ ) + assert string_res == urlparse_res + + +@pytest.mark.parametrize( + "string", + [ + (b""), + (b"without percent"), + (b"a and b"), + ((b"%FF " if six.PY2 else b"") + b"can't %unparse char%"), + ], +) +def test_unquote_compatibility(string): + unquote_res = unquote(string) + if six.PY3: + unquote_res = six.ensure_binary(unquote_res) + assert strings.unquote_binary(string) == unquote_res |