diff options
| author | uzhas <[email protected]> | 2023-11-18 13:15:37 +0300 |
|---|---|---|
| committer | uzhas <[email protected]> | 2023-11-18 13:33:08 +0300 |
| commit | 2c2b40cc0e738d6a07d3f713e50f5dc660ef7426 (patch) | |
| tree | 3f904d3b0409e924810fd80d4f8c30bdd0f9e089 /yt/python | |
| parent | e5f17f4337ead20a6fd1f4948e3ebe4e75c5b7f6 (diff) | |
to fix build in github
Diffstat (limited to 'yt/python')
32 files changed, 4140 insertions, 0 deletions
diff --git a/yt/python/yt/__init__.py b/yt/python/yt/__init__.py new file mode 100644 index 00000000000..b7cffee5e3c --- /dev/null +++ b/yt/python/yt/__init__.py @@ -0,0 +1,6 @@ +"""Python interfaces for YT system. + +YT is reliable and performant distributed storage and computation platform. +There are Python interfaces for low level YT client API: binding to C++ Driver API, wrapper for HTTP-interface and +some pretty "binaries" for interactive usage in command line. +""" diff --git a/yt/python/yt/common.py b/yt/python/yt/common.py new file mode 100644 index 00000000000..b2ee8fb8db0 --- /dev/null +++ b/yt/python/yt/common.py @@ -0,0 +1,846 @@ +try: + from yt.packages.six import iteritems, PY3, text_type, binary_type, string_types + from yt.packages.six.moves import map as imap +except ImportError: + from six import iteritems, PY3, text_type, binary_type, string_types + from six.moves import map as imap + +import yt.json_wrapper as json + +try: + from library.python.prctl import prctl +except ImportError: + prctl = None + +# Fix for thread unsafety of datetime module. +# See http://bugs.python.org/issue7980 for more details. +import _strptime # noqa + +# Python3 compatibility +try: + from collections.abc import Mapping +except ImportError: + from collections import Mapping +from datetime import datetime, timedelta +from itertools import chain +from functools import wraps + +import calendar +import copy +import ctypes +import errno +import functools +import inspect +import os +import re +import signal +import socket +import sys +import time +import types +import string +import warnings + +# Standard YT time representation + +YT_DATETIME_FORMAT_STRING = "%Y-%m-%dT%H:%M:%S.%fZ" + +YT_NULL_TRANSACTION_ID = "0-0-0-0" + + +# Deprecation stuff. +class YtDeprecationWarning(DeprecationWarning): + """Custom warnings category, because built-in category is ignored by default.""" + + +warnings.simplefilter("default", category=YtDeprecationWarning) + +DEFAULT_DEPRECATION_MESSAGE = "{0} is deprecated and will be removed in the next major release, " \ + "use {1} instead" + +ERROR_TEXT_MATCHING_DEPRECATION_MESSAGE = "Matching errors by their messages using string patterns is highly " \ + "discouraged. It is recommended to use contains_code(code) method instead. " \ + "If there is no suitable error code for your needs, ask yt@ for creating one." + + +def declare_deprecated(functional_name, alternative_name, condition=None, message=None): + if condition or condition is None: + message = get_value(message, DEFAULT_DEPRECATION_MESSAGE.format(functional_name, alternative_name)) + warnings.warn(message, YtDeprecationWarning) + + +def deprecated_with_message(message): + def function_decorator(func): + @wraps(func) + def deprecated_function(*args, **kwargs): + warnings.warn(message, YtDeprecationWarning) + return func(*args, **kwargs) + return deprecated_function + return function_decorator + + +def deprecated(alternative): + def function_decorator(func): + warn_message = DEFAULT_DEPRECATION_MESSAGE.format(func.__name__, alternative) + return deprecated_with_message(warn_message)(func) + return function_decorator + + +def get_fqdn(): + fqdn = socket.getfqdn() + if fqdn == "localhost.localdomain": + fqdn = "localhost" + return fqdn + + +class YtError(Exception): + """Base class for all YT errors.""" + def __init__(self, message="", code=1, inner_errors=None, attributes=None): + self.message = message + self.code = code + self.inner_errors = inner_errors if inner_errors is not None else [] + self.attributes = attributes if attributes else {} + if "host" not in self.attributes: + self.attributes["host"] = self._get_fqdn() + if "datetime" not in self.attributes: + self.attributes["datetime"] = datetime_to_string(datetime.utcnow()) + + def simplify(self): + """Transforms error (with inner errors) to standard python dict.""" + result = {"message": self.message, "code": self.code} + if self.attributes: + result["attributes"] = self.attributes + if self.inner_errors: + result["inner_errors"] = [] + for error in self.inner_errors: + result["inner_errors"].append( + error.simplify() if isinstance(error, YtError) else + error) + return result + + @classmethod + def from_dict(cls, dict_): + """Restores YtError instance from standard python dict. Reverses simplify().""" + inner_errors = [cls.from_dict(inner) for inner in dict_.get("inner_errors", [])] + return cls(message=dict_["message"], code=dict_["code"], attributes=dict_.get("attributes"), + inner_errors=inner_errors) + + def find_matching_error(self, code=None, predicate=None): + """ + Find a suberror contained in the error (possibly the error itself) which is either: + - having error code equal to `code'; + - or satisfying custom predicate `predicate'. + + Exactly one condition should be specified. + + Returns either first error matching the condition or None if no matching found. + """ + + if sum(argument is not None for argument in (code, predicate)) != 1: + raise ValueError("Exactly one condition should be specified") + + if code is not None: + predicate = lambda error: int(error.code) == code # noqa + + def find_recursive(error): + # error may be Python dict; if so, transform it to YtError. + if not isinstance(error, YtError): + error = YtError(**error) + + if predicate(error): + return error + for inner_error in error.inner_errors: + inner_result = find_recursive(inner_error) + if inner_result: + return inner_result + return None + + return find_recursive(self) + + def contains_code(self, code): + """Check if error or one of its inner errors contains specified error code.""" + return self.find_matching_error(code=code) is not None + + def _contains_text(self, text): + """Inner method, do not call explicitly.""" + return self.find_matching_error(predicate=lambda error: text in error.message) is not None + + @deprecated_with_message(ERROR_TEXT_MATCHING_DEPRECATION_MESSAGE) + def contains_text(self, text): + """ + Check if error or one of its inner errors contains specified substring in message. + + It is not recommended to use this helper; consider using contains_code instead. + If the error you are seeking is not distinguishable by code, please send a message to yt@ + and we will fix that. + """ + + return self._contains_text(text) + + def _matches_regexp(self, pattern): + """Inner method, do not call explicitly.""" + return self.find_matching_error(predicate=lambda error: re.match(pattern, error.message) is not None) is not None + + @deprecated_with_message(ERROR_TEXT_MATCHING_DEPRECATION_MESSAGE) + def matches_regexp(self, pattern): + """ + Check if error message or one of its inner error messages matches given regexp. + + It is not recommended to use this helper; consider using contains_code instead. + If the error you are seeking is not distinguishable by code, please send a message to yt@ + and we will fix that. + """ + + return self._matches_regexp(pattern) + + def __str__(self): + return format_error(self) + + def __repr__(self): + return "%s(%s)" % ( + self.__class__.__name__, + _pretty_format_messages_flat(self)) + + @staticmethod + def _get_fqdn(): + if not hasattr(YtError, "_cached_fqdn"): + YtError._cached_fqdn = get_fqdn() + return YtError._cached_fqdn + + # Error differentiation methods. + def is_retriable_archive_error(self): + """ + Operation progress in Cypress is outdated or some attributes are archive only + while archive request failed + """ + return self.contains_code(1911) + + def is_resolve_error(self): + """Resolution error.""" + return self.contains_code(500) + + def is_already_exists(self): + """Already exists.""" + return self.contains_code(501) + + def is_access_denied(self): + """Access denied.""" + return self.contains_code(901) + + def is_account_limit_exceeded(self): + """Access denied.""" + return self.contains_code(902) + + def is_concurrent_transaction_lock_conflict(self): + """Deprecated! Transaction lock conflict.""" + return self.contains_code(402) + + def is_cypress_transaction_lock_conflict(self): + """Transaction lock conflict.""" + return self.contains_code(402) + + def is_tablet_transaction_lock_conflict(self): + """Transaction lock conflict.""" + return self.contains_code(1700) + + @deprecated(alternative='use is_request_queue_size_limit_exceeded') + def is_request_rate_limit_exceeded(self): + """Request rate limit exceeded.""" + return self.contains_code(904) + + def is_safe_mode_enabled(self): + """Safe mode enabled.""" + return self.contains_code(906) + + def is_request_queue_size_limit_exceeded(self): + """Request rate limit exceeded.""" + return self.contains_code(108) or self.contains_code(904) + + def is_rpc_unavailable(self): + """Rpc unavailable.""" + return self.contains_code(105) + + def is_master_communication_error(self): + """Master communication error.""" + return self.contains_code(712) + + def is_chunk_unavailable(self): + """Chunk unavailable.""" + return self.contains_code(716) + + def is_request_timed_out(self): + """Request timed out.""" + return self.contains_code(3) + + def is_concurrent_operations_limit_reached(self): + """Too many concurrent operations.""" + return self.contains_code(202) + + def is_master_disconnected(self): + """Master disconnected error.""" + return self.contains_code(218) + + def is_no_such_transaction(self): + """No such transaction.""" + return self.contains_code(11000) + + def is_no_such_job(self): + """No such job.""" + return self.contains_code(203) + + def is_no_such_operation(self): + """No such operation.""" + return self.contains_code(1915) + + def is_shell_exited(self): + """Shell exited.""" + return self.contains_code(1800) or self.contains_code(1801) + + def is_no_such_service(self): + """No such service.""" + return self.contains_code(102) + + def is_transport_error(self): + """Transport error.""" + return self.contains_code(100) + + def is_tablet_in_intermediate_state(self): + """Tablet is in intermediate state.""" + # TODO(ifsmirnov) migrate to error code, YT-10993 + return self._matches_regexp("Tablet .* is in state .*") + + def is_no_such_tablet(self): + """No such tablet.""" + return self.contains_code(1701) + + def is_tablet_not_mounted(self): + """Tablet is not mounted.""" + return self.contains_code(1702) + + def is_no_such_cell(self): + """No such cell.""" + return self.contains_code(1721) + + def is_all_target_nodes_failed(self): + """Failed to write chunk since all target nodes have failed.""" + return self.contains_code(700) + + def is_no_such_attribute(self, attributes_list=None): + """Operation attribute is not supported.""" + if attributes_list is None: + pred_new = lambda err: err.code == 1920 # noqa + else: + pred_new = lambda err: (err.attributes.get("attribute_name") in attributes_list) and (err.code == 1920) # noqa + pred_old = lambda err: ("Attribute" in err.message) and ("is not allowed" in err.message) # noqa + # COMPAT: remove old version + return self.find_matching_error(predicate=pred_new) or self.find_matching_error(predicate=pred_old) + + def is_row_is_blocked(self): + """Row is blocked""" + return self.contains_code(1712) + + def is_blocked_row_wait_timeout(self): + """Timed out waiting on blocked row""" + return self.contains_code(1713) + + def is_chunk_not_preloaded(self): + """Chunk data is not preloaded yet""" + return self.contains_code(1735) + + def is_already_present_in_group(self): + """Member is already present in group""" + return self.contains_code(908) + + +class YtResponseError(YtError): + """Represents an error in YT response.""" + def __init__(self, underlying_error): + super(YtResponseError, self).__init__() + self.message = "Received response with error" + self._underlying_error = underlying_error + self.inner_errors = [self._underlying_error] + + # Common response error properties. + @property + def params(self): + return self.attributes.get("params") + + # HTTP response interface. + @property + def url(self): + """ Returns url for HTTP response error""" + return self.attributes.get("url") + + @property + def headers(self): + """ COMPAT: Returns request headers for HTTP response error""" + return self.attributes.get("request_headers") + + @property + def error(self): + """ COMPAT: Returns underlying error""" + return self._underlying_error + + @property + def request_headers(self): + """ Returns request headers for HTTP response error""" + return self.attributes.get("request_headers") + + @property + def response_headers(self): + """ Returns response headers for HTTP response error""" + return self.attributes.get("response_headers") + + def __reduce__(self): + return (_reconstruct_yt_response_error, (type(self), self.message, self.attributes, self._underlying_error, self.inner_errors)) + + +def _reconstruct_yt_response_error(error_class, message, attributes, underlying_error, inner_errors): + error = error_class(underlying_error) + error.message = message + error.inner_errors = inner_errors + error.attributes = attributes + return error + + +class PrettyPrintableDict(dict): + pass + + +def _pretty_format_escape(value): + def escape(char): + if char in string.printable: + return char + return "\\x{0:02x}".format(ord(char)) + value = value.replace("\n", "\\n").replace("\t", "\\t") + try: + value.encode("utf-8") + return value + except UnicodeDecodeError: + return "".join(imap(escape, value)) + + +def _pretty_format_attribute(name, value, attribute_length_limit): + name = to_native_str(name) + if isinstance(value, PrettyPrintableDict): + value = json.dumps(value, indent=2) + value = value.replace("\n", "\n" + " " * (15 + 1 + 4)) + else: + if isinstance(value, string_types): + value = to_native_str(value) + else: + value = str(value) + value = _pretty_format_escape(value) + if attribute_length_limit is not None and len(value) > attribute_length_limit: + value = value[:attribute_length_limit] + "...message truncated..." + return " " * 4 + "%-15s %s" % (name, value) + + +def _pretty_simplify_error(error): + if isinstance(error, YtError): + error = error.simplify() + elif isinstance(error, (Exception, KeyboardInterrupt)): + error = {"code": 1, "message": str(error)} + return error + + +def _pretty_extract_messages(error, depth=0): + """ + YtError -> [(depth: int, message: str), ...], in tree order. + """ + error = _pretty_simplify_error(error) + + if not error.get("attributes", {}).get("transparent", False): + yield (depth, to_native_str(error["message"])) + depth += 1 + + for inner_error in error.get("inner_errors", []): + for subitem in _pretty_extract_messages(inner_error, depth=depth): + yield subitem + + +def _pretty_format_messages_flat(error): + prev_depth = 0 + result = [] + for depth, message in _pretty_extract_messages(error): + if depth > prev_depth: + result.append(" ") + result.append("(" * (depth - prev_depth)) + elif prev_depth > depth: + result.append(")" * (prev_depth - depth)) + elif result: + result.append(", ") + result.append(repr(message)) + prev_depth = depth + + result.append(")" * prev_depth) + return "".join(result) + + +def _pretty_format_messages(error, indent=0, indent_step=4): + result = [] + for depth, message in _pretty_extract_messages(error): + result.append("{indent}{message}".format( + indent=" " * (indent + depth * indent_step), + message=message)) + + return "\n".join(result) + + +def _pretty_format_full_errors(error, attribute_length_limit): + error = _pretty_simplify_error(error) + + lines = [] + if "message" in error: + lines.append(to_native_str(error["message"])) + + if "code" in error and int(error["code"]) != 1: + lines.append(_pretty_format_attribute( + "code", error["code"], attribute_length_limit=attribute_length_limit)) + + attributes = error.get("attributes", {}) + + origin_keys = ["host", "datetime"] + origin_cpp_keys = ["pid", "tid", "fid"] + if all(key in attributes for key in origin_keys): + date = attributes["datetime"] + if isinstance(date, datetime): + date = date.strftime("%y-%m-%dT%H:%M:%S.%fZ") + value = "{0} on {1}".format(attributes["host"], date) + if all(key in attributes for key in origin_cpp_keys): + value += " (pid %(pid)d, tid %(tid)x, fid %(fid)x)" % attributes + lines.append(_pretty_format_attribute( + "origin", value, attribute_length_limit=attribute_length_limit)) + + location_keys = ["file", "line"] + if all(key in attributes for key in location_keys): + lines.append(_pretty_format_attribute( + "location", + "%(file)s:%(line)d" % attributes, + attribute_length_limit=attribute_length_limit)) + + for key, value in iteritems(attributes): + if key in origin_keys or key in location_keys or key in origin_cpp_keys: + continue + lines.append(_pretty_format_attribute( + key, value, attribute_length_limit=attribute_length_limit)) + + result = (" " * 4 + "\n").join(lines) + if "inner_errors" in error: + for inner_error in error["inner_errors"]: + result += "\n" + _pretty_format_full_errors( + inner_error, attribute_length_limit=attribute_length_limit) + + return result + + +def _pretty_format(error, attribute_length_limit=None): + return "{}\n\n***** Details:\n{}\n".format( + _pretty_format_messages(error), + _pretty_format_full_errors(error, attribute_length_limit=attribute_length_limit)) + + +def _pretty_format_fake(error, attribute_length_limit=None): + return _pretty_format(error, attribute_length_limit) + + +def _pretty_format_for_logging(error, attribute_length_limit=None): + return _pretty_format_full_errors(error, attribute_length_limit=attribute_length_limit).replace("\n", "\\n") + + +def format_error(error, attribute_length_limit=300): + return _pretty_format(error, attribute_length_limit) + + +def join_exceptions(*args): + result = [] + for exception in args: + if isinstance(exception, tuple): + result += exception + else: + result.append(exception) + return tuple(result) + + +def which(name, flags=os.X_OK, custom_paths=None): + """Return list of files in system paths with given name.""" + # TODO: check behavior when dealing with symlinks + result = [] + paths = os.environ.get("PATH", "").split(os.pathsep) + if custom_paths is not None: + paths = custom_paths + paths + for dir in paths: + path = os.path.join(dir, name) + if os.access(path, flags): + result.append(path) + return result + + +def unlist(list): + try: + return list[0] if len(list) == 1 else list + except TypeError: # cannot calculate len + return list + + +def require(condition, exception_func): + if not condition: + raise exception_func() + + +def update_inplace(object, patch): + """Apply patch to object inplace""" + if isinstance(patch, Mapping) and isinstance(object, Mapping): + for key, value in iteritems(patch): + if key in object: + object[key] = update_inplace(object[key], value) + else: + object[key] = value + elif isinstance(patch, list) and isinstance(object, list): + for index, value in enumerate(patch): + if index < len(object): + object[index] = update_inplace(object[index], value) + else: + object.append(value) + else: + object = patch + return object + + +def update(object, patch): + """Apply patch to object without modifying original object or patch""" + if patch is None: + return copy.deepcopy(object) + elif object is None: + return copy.deepcopy(patch) + else: + return update_inplace(copy.deepcopy(object), patch) + + +def flatten(obj, list_types=(list, tuple, set, frozenset, types.GeneratorType)): + """Create flat list from all elements.""" + if isinstance(obj, list_types): + return list(chain(*imap(flatten, obj))) + return [obj] + + +def update_from_env(variables): + """Update variables dict from environment.""" + for key, value in iteritems(os.environ): + prefix = "YT_" + if not key.startswith(prefix): + continue + + key = key[len(prefix):] + if key not in variables: + continue + + var_type = type(variables[key]) + # Using int we treat "0" as false, "1" as "true" + if var_type == bool: + try: + value = int(value) + except: # noqa + pass + # None type is treated as str + if isinstance(None, var_type): + var_type = str + + variables[key] = var_type(value) + + +def get_value(value, default): + if value is None: + return default + else: + return value + + +def filter_dict(predicate, dictionary): + return dict([(k, v) for (k, v) in iteritems(dictionary) if predicate(k, v)]) + + +def set_pdeathsig(signum=None): + if sys.platform.startswith("linux"): + if signum is None: + signum = signal.SIGTERM + if prctl: + prctl.set_pdeathsig(signum) + else: + ctypes.cdll.LoadLibrary("libc.so.6") + libc = ctypes.CDLL("libc.so.6") + PR_SET_PDEATHSIG = 1 + libc.prctl(PR_SET_PDEATHSIG, signum) + + +def remove_file(path, force=False): + try: + os.remove(path) + except OSError: + if not force: + raise + + +def makedirp(path): + try: + os.makedirs(path) + except OSError as err: + if err.errno != errno.EEXIST: + raise + + +def touch(path): + if not os.path.exists(path): + makedirp(os.path.dirname(path)) + with open(path, "w"): + pass + + +def date_string_to_datetime(date): + return datetime.strptime(date, YT_DATETIME_FORMAT_STRING) + + +def date_string_to_timestamp(date): + return calendar.timegm(date_string_to_datetime(date).timetuple()) + + +def date_string_to_timestamp_mcs(time_str): + dt = date_string_to_datetime(time_str) + return int(calendar.timegm(dt.timetuple()) * (10 ** 6) + dt.microsecond) + + +def datetime_to_string(date, is_local=False): + if is_local: + date = datetime.utcfromtimestamp(time.mktime(date.timetuple())) + return date.strftime(YT_DATETIME_FORMAT_STRING) + + +def make_non_blocking(fd): + # Use local import to support Windows. + import fcntl + flags = fcntl.fcntl(fd, fcntl.F_GETFL) + fcntl.fcntl(fd, fcntl.F_SETFL, flags | os.O_NONBLOCK) + + +def to_native_str(string, encoding="utf-8", errors="strict"): + if not PY3 and isinstance(string, text_type): + return string.encode(encoding) + if PY3 and isinstance(string, binary_type): + return string.decode(encoding, errors=errors) + return string + + +def copy_docstring_from(documented_function): + """Decorator that copies docstring from one function to another. + + :param documented_function: function that provides docstring. + + Usage:: + + def foo(...): + "USEFUL DOCUMENTATION" + ... + + @copy_docstring_from(foo) + def bar(...) + # docstring will be copied from `foo' function + ... + """ + return functools.wraps(documented_function, assigned=("__doc__",), updated=()) + + +def is_process_alive(pid): + try: + os.kill(pid, 0) + except OSError as err: + if err.errno == errno.ESRCH: + return False + elif err.errno == errno.EPERM: + return True + else: + # According to "man 2 kill" possible error values are + # (EINVAL, EPERM, ESRCH) + raise + return True + + +def uuid_to_parts(guid): + id_parts = guid.split("-") + id_hi = int(id_parts[2], 16) << 32 | int(id_parts[3], 16) + id_lo = int(id_parts[0], 16) << 32 | int(id_parts[1], 16) + return id_hi, id_lo + + +def parts_to_uuid(id_hi, id_lo): + guid = id_lo << 64 | id_hi + mask = 0xFFFFFFFF + + parts = [] + for i in range(4): + parts.append((guid & mask) >> (i * 32)) + mask <<= 32 + + return "-".join(reversed(["{:x}".format(part) for part in parts])) + + +# TODO(asaitgalin): Remove copy-paste from YP. +def underscore_case_to_camel_case(str): + result = [] + first = True + upper = True + for c in str: + if c == "_": + upper = True + else: + if upper: + if c not in string.ascii_letters and not first: + result.append("_") + c = c.upper() + result.append(c) + upper = False + first = False + return "".join(result) + + +class WaitFailed(Exception): + pass + + +def wait(predicate, error_message=None, iter=None, sleep_backoff=None, timeout=None, ignore_exceptions=False): + # 30 seconds by default + if sleep_backoff is None: + sleep_backoff = 0.3 + + if ignore_exceptions: + def check_predicate(): + try: + return predicate() + # Do not catch BaseException because pytest exceptions are inherited from it + # pytest.fail raises exception inherited from BaseException. + except Exception: + return False + else: + check_predicate = predicate + + if timeout is None: + if iter is None: + iter = 100 + index = 0 + while index < iter: + if check_predicate(): + return + index += 1 + time.sleep(sleep_backoff) + else: + start_time = datetime.now() + while datetime.now() - start_time < timedelta(seconds=timeout): + if check_predicate(): + return + time.sleep(sleep_backoff) + + if inspect.isfunction(error_message): + error_message = error_message() + if error_message is None: + error_message = "Wait failed" + error_message += " (timeout = {0})".format(timeout if timeout is not None else iter * sleep_backoff) + raise WaitFailed(error_message) diff --git a/yt/python/yt/json_wrapper.py b/yt/python/yt/json_wrapper.py new file mode 100644 index 00000000000..2e18c98ca9a --- /dev/null +++ b/yt/python/yt/json_wrapper.py @@ -0,0 +1,24 @@ +try: + from yt.packages.six import iteritems, text_type +except ImportError: + from six import iteritems, text_type + +try: + from simplejson import load, dump, loads, dumps, JSONDecodeError # noqa +except ImportError: + # This version of simplejson has no compiled speedup module. + from yt.packages.simplejson import load, dump, loads, dumps, JSONDecodeError # noqa + + +def loads_as_bytes(*args, **kwargs): + def encode(value): + if isinstance(value, dict): + return dict([(encode(k), encode(v)) for k, v in iteritems(value)]) + elif isinstance(value, list): + return [encode(item) for item in value] + elif isinstance(value, text_type): + return value.encode("utf-8") + else: + return value + + return encode(loads(*args, **kwargs)) diff --git a/yt/python/yt/logger.py b/yt/python/yt/logger.py new file mode 100644 index 00000000000..c885fa9f524 --- /dev/null +++ b/yt/python/yt/logger.py @@ -0,0 +1,72 @@ +from . import logger_config + +try: + import yatest.common as yatest_common +except ImportError: + yatest_common = None + +import logging + + +def set_log_level_from_config(logger): + if not logger_config.LOG_LEVEL: + logger.setLevel(level=logging.__dict__["INFO"]) + else: + if logger_config.LOG_LEVEL.upper() == "NOTSET": + raise Exception("LOG_LEVEL couldn't be defined as NOTSET") + # Intentionally override trace with debug for compatibility with C++ logging library. + if logger_config.LOG_LEVEL.upper() == "TRACE": + logger_config.LOG_LEVEL = "DEBUG" + logger.setLevel(level=logging.__dict__[logger_config.LOG_LEVEL.upper()]) + + +logging.getLogger("yt.packages.requests.packages.urllib3").setLevel(logging.WARNING) + +LOGGER = logging.getLogger("Yt") + +LOGGER.propagate = False + +set_log_level_from_config(LOGGER) + +if logger_config.LOG_PATH is None: + LOGGER.addHandler(logging.StreamHandler()) +else: + LOGGER.addHandler(logging.FileHandler(logger_config.LOG_PATH)) + +BASIC_FORMATTER = logging.Formatter(logger_config.LOG_PATTERN) + +formatter = None + + +def set_formatter(new_formatter): + global formatter + formatter = new_formatter + for handler in LOGGER.handlers: + handler.setFormatter(new_formatter) + + +set_formatter(BASIC_FORMATTER) + + +def debug(msg, *args, **kwargs): + LOGGER.debug(msg, *args, **kwargs) + + +def info(msg, *args, **kwargs): + LOGGER.info(msg, *args, **kwargs) + + +def warning(msg, *args, **kwargs): + LOGGER.warning(msg, *args, **kwargs) + + +def error(msg, *args, **kwargs): + LOGGER.error(msg, *args, **kwargs) + + +def exception(msg, *args, **kwargs): + LOGGER.exception(msg, *args, **kwargs) + + +def log(level, msg, *args, **kwargs): + LOGGER.log(level, msg, *args, **kwargs) diff --git a/yt/python/yt/logger_config.py b/yt/python/yt/logger_config.py new file mode 100644 index 00000000000..45d1bdd1a7c --- /dev/null +++ b/yt/python/yt/logger_config.py @@ -0,0 +1,7 @@ +from .common import update_from_env + +LOG_LEVEL = None +LOG_PATTERN = "%(asctime)-15s\t%(levelname)s\t%(message)s" +LOG_PATH = None + +update_from_env(globals()) diff --git a/yt/python/yt/subprocess_wrapper.py b/yt/python/yt/subprocess_wrapper.py new file mode 100644 index 00000000000..b766cb11fe5 --- /dev/null +++ b/yt/python/yt/subprocess_wrapper.py @@ -0,0 +1,19 @@ +from yt.common import to_native_str + +try: + import subprocess32 as subprocess +except ImportError: + import subprocess + +Popen = subprocess.Popen +PIPE = subprocess.PIPE +STDOUT = subprocess.STDOUT +CalledProcessError = subprocess.CalledProcessError + + +def check_call(*args, **kwargs): + return subprocess.check_call(*args, **kwargs) + + +def check_output(*args, **kwargs): + return to_native_str(subprocess.check_output(*args, **kwargs)) diff --git a/yt/python/yt/type_info/README.md b/yt/python/yt/type_info/README.md new file mode 100644 index 00000000000..65be5d6a5b5 --- /dev/null +++ b/yt/python/yt/type_info/README.md @@ -0,0 +1,2 @@ +# Description +Python library for representing types of Common Yandex Typesystem (complementary to C++ [library/type_info](https://a.yandex-team.ru/arc/trunk/arcadia/library/cpp/type_info)). diff --git a/yt/python/yt/type_info/__init__.py b/yt/python/yt/type_info/__init__.py new file mode 100644 index 00000000000..aa7268843c5 --- /dev/null +++ b/yt/python/yt/type_info/__init__.py @@ -0,0 +1,11 @@ +from .type_base import ( # noqa + Type, is_valid_type, validate_type, +) + +from .typing import ( # noqa + Bool, Int8, Uint8, Int16, Uint16, Int32, Uint32, Int64, Uint64, Float, + Double, String, Utf8, Yson, Json, Uuid, Date, Datetime, Timestamp, + Interval, TzDate, TzDatetime, TzTimestamp, Void, Null, Optional, List, + Tuple, Dict, Struct, Variant, Tagged, Decimal, EmptyTuple, EmptyStruct, + serialize_yson, deserialize_yson, +) diff --git a/yt/python/yt/type_info/test/lib/conftest.py b/yt/python/yt/type_info/test/lib/conftest.py new file mode 100644 index 00000000000..4fe86467afd --- /dev/null +++ b/yt/python/yt/type_info/test/lib/conftest.py @@ -0,0 +1,30 @@ +import cyson + +import six + +import sys + + +# Emulate module `yt.yson` in tests to get rid of the dependency on yt/python +class FakeYsonModule: + YsonError = ValueError + + @staticmethod + def loads(yson): + reader = cyson.Reader if six.text_type not in six.string_types else cyson.UnicodeReader + return cyson.loads(yson, Reader=reader) + + @staticmethod + def dumps(yson, yson_format="text"): + if hasattr(yson, "to_yson_type"): + yson = yson.to_yson_type() + + return cyson.dumps(yson, format=yson_format) + + +class FakeYtModule: + yson = FakeYsonModule + + +sys.modules["yt"] = FakeYtModule +sys.modules["yt.yson"] = FakeYtModule.yson diff --git a/yt/python/yt/type_info/test/lib/helpers.py b/yt/python/yt/type_info/test/lib/helpers.py new file mode 100644 index 00000000000..1f4c54a14fb --- /dev/null +++ b/yt/python/yt/type_info/test/lib/helpers.py @@ -0,0 +1,30 @@ +# -*- coding: utf-8 -*- + +NO_ARGUMENT_TYPES = [ + "Bool", + "Int8", + "Uint8", + "Int16", + "Uint16", + "Int32", + "Uint32", + "Int64", + "Uint64", + "Float", + "Double", + "String", + "Utf8", + "Yson", + "Json", + "Uuid", + "Date", + "Datetime", + "Timestamp", + "Interval", + "TzDate", + "TzDatetime", + "TzTimestamp", + + "Void", + "Null", +] diff --git a/yt/python/yt/type_info/test/lib/test_common.py b/yt/python/yt/type_info/test/lib/test_common.py new file mode 100644 index 00000000000..e4b7e7a585c --- /dev/null +++ b/yt/python/yt/type_info/test/lib/test_common.py @@ -0,0 +1,49 @@ +import yt.type_info as ti + +import pytest +import pkgutil + + +def load_and_parse(path): + testdata = pkgutil.get_data(__package__, path) + testlines = [line for line in testdata.split(b"\n") if not line.startswith(b"#")] + tests = (b"\n".join(testlines)).split(b";;") + + parsed_tests = [tuple(test_arg.strip() for test_arg in test.split(b"::")) + for test in tests] + + return [test for test in parsed_tests if any(test)] + + +def pytest_generate_tests(metafunc): + if metafunc.cls is not TestCommon: + return + + TEST_DATA_PATH_PREFIX = "library/cpp/type_info/ut/test-data" + + if "test_good" in metafunc.function.__name__: + tests = load_and_parse(TEST_DATA_PATH_PREFIX + "/good-types.txt") + argnames = ["yson_type", "string_type"] + else: + tests = load_and_parse(TEST_DATA_PATH_PREFIX + "/bad-types.txt") + argnames = ["yson_type", "error_text"] + + argvalues = [tuple(test[:len(argnames)]) for test in tests] + ids = [str(id) for id in range(1, len(tests) + 1)] + + metafunc.parametrize(argnames, argvalues, ids=ids) + + +class TestCommon: + def test_good(self, yson_type, string_type): + py_type = ti.deserialize_yson(yson_type) + + assert str(py_type) == string_type.decode("utf-8") + + yson_type2 = ti.serialize_yson(py_type, human_readable=True) + py_type2 = ti.deserialize_yson(yson_type2) + assert py_type == py_type2 + + def test_bad(self, yson_type, error_text): + with pytest.raises(ValueError, match=error_text.decode("utf-8")): + ti.deserialize_yson(yson_type) diff --git a/yt/python/yt/type_info/test/lib/test_helpers.py b/yt/python/yt/type_info/test/lib/test_helpers.py new file mode 100644 index 00000000000..86bb85cabdc --- /dev/null +++ b/yt/python/yt/type_info/test/lib/test_helpers.py @@ -0,0 +1,43 @@ +import pytest + +import yt.type_info.typing as typing + +from yt.type_info.type_base import ( + is_valid_type, make_primitive_type, validate_type, quote_string, Type +) + + +def test_valid_type(): + assert is_valid_type(Type) + assert not is_valid_type(int) + assert not is_valid_type(3) + assert is_valid_type(typing.List[typing.Int8]) + + +def test_check_type(): + validate_type(Type) + validate_type(typing.Int64) + with pytest.raises(ValueError): + validate_type(int) + with pytest.raises(ValueError): + validate_type(3) + validate_type(typing.List[typing.Int8]) + + +def test_quoute_string(): + assert quote_string("a'b'c'd") == "'a\\'b\\'c\\'d'" + assert quote_string("abc") == "'abc'" + assert quote_string("") == "''" + assert quote_string("\\bcd\\") == "'\\\\bcd\\\\'" + assert quote_string("\\b'c'd\\") == "'\\\\b\\'c\\'d\\\\'" + assert quote_string(u"хэ\\л'ло") == u"'хэ\\\\л\\'ло'" + + +def test_make_primitive_type(): + MyType = make_primitive_type("MyTypeName") + assert is_valid_type(MyType) + assert str(MyType) == "MyTypeName" + assert MyType.name == "MyTypeName" + # Check no exception is thrown + ListOfMyType = typing.List[MyType] + assert is_valid_type(ListOfMyType) diff --git a/yt/python/yt/type_info/test/lib/test_io.py b/yt/python/yt/type_info/test/lib/test_io.py new file mode 100644 index 00000000000..1b6cca89977 --- /dev/null +++ b/yt/python/yt/type_info/test/lib/test_io.py @@ -0,0 +1,167 @@ +# -*- coding: utf-8 -*- + +from . import helpers +import yt.type_info.typing as typing +from yt.type_info import is_valid_type, serialize_yson, deserialize_yson + +from yt import yson + +import pytest +import six + + +def to_snake_case(s): + res = [] + first = True + for c in s: + if c.isupper(): + if first: + res.append(c.lower()) + else: + res += ["_", c.lower()] + else: + res.append(c) + first = False + return "".join(res) + + [email protected]("human_readable", [True, False]) +def test_primitive_types(human_readable): + for type_name in helpers.NO_ARGUMENT_TYPES: + assert hasattr(typing, type_name) + type_ = getattr(typing, type_name) + serialized = serialize_yson(type_, human_readable=human_readable) + assert deserialize_yson(serialized) == type_ + obj = yson.loads(serialized) + assert obj == to_snake_case(type_name) + + [email protected]("human_readable", [True, False]) +def test_compound_types(human_readable): + def check(type_, expected_obj): + serialized = serialize_yson(type_, human_readable=human_readable) + deserialized = deserialize_yson(serialized) + assert is_valid_type(deserialized) + obj = yson.loads(serialized) + assert obj == expected_obj + assert deserialized == type_ + + check(typing.Optional[typing.Int32], { + "type_name": "optional", + "item": "int32", + }) + + check(typing.Dict[typing.Double, typing.Bool], { + "type_name": "dict", + "key": "double", + "value": "bool", + }) + + check(typing.Tuple[typing.Double, typing.Uuid, typing.Json], { + "type_name": "tuple", + "elements": [ + {"type": "double"}, + {"type": "uuid"}, + {"type": "json"}, + ], + }) + + check(typing.EmptyTuple, {"type_name": "tuple", "elements": []}) + + if six.PY2: + russian_name = u"ой".encode("utf-8") + else: + russian_name = u"ой" + + check( + typing.Struct[ + "a": typing.Uint8, + "b": typing.Yson, + u"ой": typing.Uint64, + ], + { + "type_name": "struct", + "members": [ + {"name": "a", "type": "uint8"}, + {"name": "b", "type": "yson"}, + {"name": russian_name, "type": "uint64"}, + ], + }, + ) + + check(typing.EmptyStruct, {"type_name": "struct", "members": []}) + + variant_struct = typing.Variant[ + "to_be": typing.Null, + "not_to_be": typing.Void, + ] + check(variant_struct, { + "type_name": "variant", + "members": [ + {"name": "to_be", "type": "null"}, + {"name": "not_to_be", "type": "void"}, + ], + }) + + check(typing.Variant[typing.Uint8, typing.Int8], { + "type_name": "variant", + "elements": [ + {"type": "uint8"}, + {"type": "int8"}, + ], + }) + + check(typing.Tagged[variant_struct, "my_tag"], { + "type_name": "tagged", + "tag": "my_tag", + "item": { + "type_name": "variant", + "members": [ + {"name": "to_be", "type": "null"}, + {"name": "not_to_be", "type": "void"}, + ], + }, + }) + + if six.PY2: + russian_tag = u"мой_тэг".encode("utf-8") + else: + russian_tag = u"мой_тэг" + check(typing.Tagged[typing.String, u"мой_тэг"], { + "type_name": "tagged", + "tag": russian_tag, + "item": "string", + }) + + check(typing.Decimal[5, 3], { + "type_name": "decimal", + "precision": 5, + "scale": 3, + }) + + check(typing.Decimal(5, 3), { + "type_name": "decimal", + "precision": 5, + "scale": 3, + }) + + +def test_errors(): + with pytest.raises(TypeError): + serialize_yson("I'm not a type") + + # Malformed YSON. + with pytest.raises(ValueError): + deserialize_yson("}}O_o{{") + + # Nonexistent type. + with pytest.raises(ValueError): + deserialize_yson("{type_name=\"I actually dont exist\"}") + + # More subtle problem with types. + with pytest.raises(ValueError): + deserialize_yson("""{ + type_name=decimal; + precision=10; + scale=10.0; + }""") diff --git a/yt/python/yt/type_info/test/lib/test_typing.py b/yt/python/yt/type_info/test/lib/test_typing.py new file mode 100644 index 00000000000..34d7ece2aac --- /dev/null +++ b/yt/python/yt/type_info/test/lib/test_typing.py @@ -0,0 +1,228 @@ +from . import helpers +import yt.type_info.typing as typing +from yt.type_info import is_valid_type + +import pytest +import six + + +def test_primitive_types(): + for type_name in helpers.NO_ARGUMENT_TYPES: + assert hasattr(typing, type_name) + type_ = getattr(typing, type_name) + assert type_.name == type_name + assert str(type_) == type_name + assert is_valid_type(type_) + + +def test_primitive_type_equality(): + for i, type_name in enumerate(helpers.NO_ARGUMENT_TYPES): + type_ = getattr(typing, type_name) + assert type_ == type_ + assert not(type_ != type_) + type_name_next = helpers.NO_ARGUMENT_TYPES[(i + 1) % len(helpers.NO_ARGUMENT_TYPES)] + type_next = getattr(typing, type_name_next) + assert type_ != type_next + assert not(type_ == type_next) + + +def test_compound_types(): + optional = typing.Optional[typing.Int32] + assert is_valid_type(optional) + assert str(optional) == "Optional<Int32>" + assert optional.name == "Optional" + assert optional.item == typing.Int32 + + list_ = typing.List[typing.String] + assert is_valid_type(list_) + assert str(list_) == "List<String>" + assert list_.name == "List" + assert list_.item == typing.String + + dict_ = typing.Dict[typing.Double, typing.Bool] + assert is_valid_type(dict_) + assert str(dict_) == "Dict<Double, Bool>" + assert dict_.name == "Dict" + assert dict_.key == typing.Double + assert dict_.value == typing.Bool + + tuple_ = typing.Tuple[typing.Double, typing.Uuid, typing.Json] + assert is_valid_type(tuple_) + assert str(tuple_) == "Tuple<Double, Uuid, Json>" + assert tuple_.name == "Tuple" + assert tuple_.items == (typing.Double, typing.Uuid, typing.Json) + + assert is_valid_type(typing.EmptyTuple) + assert str(typing.EmptyTuple) == "Tuple<>" + assert typing.EmptyTuple.name == "Tuple" + assert typing.EmptyTuple.items == tuple() + + struct = typing.Struct[ + "a": typing.Uint8, + "b": typing.Yson, + u"ой": typing.Uint64, + ] + assert is_valid_type(struct) + assert six.ensure_text(str(struct)) == u"Struct<'a': Uint8, 'b': Yson, 'ой': Uint64>" + assert struct.name == "Struct" + assert struct.items == (("a", typing.Uint8), ("b", typing.Yson), (u"ой", typing.Uint64)) + + assert is_valid_type(typing.EmptyStruct) + assert str(typing.EmptyStruct) == "Struct<>" + assert typing.EmptyStruct.name == "Struct" + assert typing.EmptyStruct.items == tuple() + + variant_struct = typing.Variant[ + "to_be": typing.Null, + "not_to_be": typing.Void, + ] + assert is_valid_type(variant_struct) + assert str(variant_struct) == "Variant<'to_be': Null, 'not_to_be': Void>" + assert variant_struct.name == "Variant" + assert variant_struct.items == (("to_be", typing.Null), ("not_to_be", typing.Void)) + + variant_tuple = typing.Variant[typing.Uint8, typing.Int8] + assert is_valid_type(variant_tuple) + assert str(variant_tuple) == "Variant<Uint8, Int8>" + assert variant_tuple.name == "Variant" + assert variant_tuple.items == (typing.Uint8, typing.Int8) + + tagged = typing.Tagged[typing.String, "my_tag"] + assert is_valid_type(tagged) + assert str(tagged) == "Tagged<String, 'my_tag'>" + assert tagged.name == "Tagged" + assert tagged.tag == "my_tag" + assert tagged.item == typing.String + + tagged_ru = typing.Tagged[typing.String, u"мой_тэг"] + assert is_valid_type(tagged_ru) + assert six.ensure_text(str(tagged_ru)) == u"Tagged<String, 'мой_тэг'>" + assert tagged_ru.name == "Tagged" + assert tagged_ru.tag == u"мой_тэг" + assert tagged_ru.item == typing.String + + decimal_getitem = typing.Decimal[5, 3] + assert is_valid_type(decimal_getitem) + assert str(decimal_getitem) == "Decimal(5, 3)" + assert decimal_getitem.name == "Decimal" + assert decimal_getitem.precision == 5 + assert decimal_getitem.scale == 3 + + decimal_call = typing.Decimal(5, 3) + assert is_valid_type(decimal_call) + assert str(decimal_call) == "Decimal(5, 3)" + assert decimal_call.name == "Decimal" + assert decimal_call.precision == 5 + assert decimal_call.scale == 3 + + +def test_compound_type_equality(): + optional1 = typing.Optional[typing.Int32] + optional2 = typing.Optional[typing.Int32] + optional3 = typing.Optional[typing.Int64] + assert optional1 == optional1 + assert optional1 == optional2 + assert optional1 != optional3 + + list1 = typing.List[typing.String] + list2 = typing.List[typing.String] + list3 = typing.List[typing.Optional[typing.String]] + assert list1 == list1 + assert list1 == list2 + assert list1 != list3 + + dict1 = typing.Dict[typing.Double, typing.Bool] + dict2 = typing.Dict[typing.Double, typing.Bool] + dict3 = typing.Dict[typing.Optional[typing.Double], typing.Bool] + assert dict1 == dict1 + assert dict1 == dict2 + assert dict1 != dict3 + + tuple1 = typing.Tuple[typing.Double, typing.Uuid, typing.Json] + tuple2 = typing.Tuple[typing.Double, typing.Uuid, typing.Json] + tuple3 = typing.Tuple[typing.Double, typing.Json, typing.Uuid] + assert tuple1 == tuple1 + assert tuple1 == tuple2 + assert tuple1 != tuple3 + + assert typing.EmptyTuple == typing.EmptyTuple + + struct1 = typing.Struct["a": typing.Uint8, "b": typing.Yson, u"ой": typing.Uint64] + struct2 = typing.Struct["a": typing.Uint8, "b": typing.Yson, u"ой": typing.Uint64] + struct3 = typing.Struct["a": typing.Uint8, "B": typing.Yson, u"ой": typing.Uint64] + assert struct1 == struct1 + assert struct1 == struct2 + assert struct1 != struct3 + + assert typing.EmptyStruct == typing.EmptyStruct + assert typing.EmptyStruct != typing.EmptyTuple + + variant_struct1 = typing.Variant["to_be": typing.Null, "not_to_be": typing.Void] + variant_struct2 = typing.Variant["to_be": typing.Null, "not_to_be": typing.Void] + variant_struct3 = typing.Variant["to_be": typing.Void, "not_to_be": typing.Void] + assert variant_struct1 == variant_struct1 + assert variant_struct1 == variant_struct2 + assert variant_struct1 != variant_struct3 + + variant_tuple1 = typing.Variant[typing.Uint8, typing.Int8] + variant_tuple2 = typing.Variant[typing.Uint8, typing.Int8] + variant_tuple3 = typing.Variant[typing.Uint8, typing.Int8, typing.Int16] + assert variant_tuple1 == variant_tuple1 + assert variant_tuple1 == variant_tuple2 + assert variant_tuple1 != variant_tuple3 + + tagged1 = typing.Tagged[typing.String, "my_tag"] + tagged2 = typing.Tagged[typing.String, "my_tag"] + tagged3 = typing.Tagged[typing.String, "my_tag1"] + assert tagged1 == tagged1 + assert tagged1 == tagged2 + assert tagged1 != tagged3 + + tagged_ru1 = typing.Tagged[typing.String, u"мой_тэг"] + tagged_ru2 = typing.Tagged[typing.String, u"мой_тэг"] + tagged_ru3 = typing.Tagged[typing.String, u"мой_тэг1"] + assert tagged_ru1 == tagged_ru1 + assert tagged_ru1 == tagged_ru2 + assert tagged_ru1 != tagged_ru3 + + decimal_getitem1 = typing.Decimal[5, 3] + decimal_getitem2 = typing.Decimal[5, 3] + decimal_getitem3 = typing.Decimal[5, 4] + assert decimal_getitem1 == decimal_getitem1 + assert decimal_getitem1 == decimal_getitem2 + assert decimal_getitem1 != decimal_getitem3 + + decimal_call1 = typing.Decimal(5, 3) + decimal_call2 = typing.Decimal(5, 3) + decimal_call3 = typing.Decimal(5, 4) + assert decimal_call1 == decimal_call1 + assert decimal_call1 == decimal_call2 + assert decimal_call1 != decimal_call3 + + assert decimal_call1 == decimal_getitem1 + assert decimal_call2 == decimal_getitem2 + assert decimal_call3 == decimal_getitem3 + + +def test_errors(): + for T in [typing.Struct, typing.Variant]: + with pytest.raises(ValueError): + T[b"\xff": typing.Int8] + + with pytest.raises(ValueError): + T[10: typing.Int8] + + with pytest.raises(ValueError): + T["a": typing.Int8, "a": typing.Int8] + + with pytest.raises(ValueError): + T["": typing.Int8] + + with pytest.raises(ValueError): + typing.Tagged[typing.Int8, 10] + + with pytest.raises(ValueError): + typing.Tagged[typing.Int8, b"\xff"] + + with pytest.raises(ValueError): + typing.Decimal(10, 10.0) diff --git a/yt/python/yt/type_info/test/lib/ya.make b/yt/python/yt/type_info/test/lib/ya.make new file mode 100644 index 00000000000..ffbccab8508 --- /dev/null +++ b/yt/python/yt/type_info/test/lib/ya.make @@ -0,0 +1,22 @@ +PY23_LIBRARY() + +TEST_SRCS( + conftest.py + test_common.py + test_helpers.py + test_typing.py + test_io.py + helpers.py +) + +PEERDIR( + library/python/cyson + yt/python/yt/type_info +) + +RESOURCE_FILES( + library/cpp/type_info/ut/test-data/good-types.txt + library/cpp/type_info/ut/test-data/bad-types.txt +) + +END() diff --git a/yt/python/yt/type_info/test/py2/ya.make b/yt/python/yt/type_info/test/py2/ya.make new file mode 100644 index 00000000000..0233403a735 --- /dev/null +++ b/yt/python/yt/type_info/test/py2/ya.make @@ -0,0 +1,7 @@ +PY2TEST() + +PEERDIR( + yt/python/yt/type_info/test/lib +) + +END() diff --git a/yt/python/yt/type_info/test/py3/ya.make b/yt/python/yt/type_info/test/py3/ya.make new file mode 100644 index 00000000000..e1210482c08 --- /dev/null +++ b/yt/python/yt/type_info/test/py3/ya.make @@ -0,0 +1,7 @@ +PY3TEST() + +PEERDIR( + yt/python/yt/type_info/test/lib +) + +END() diff --git a/yt/python/yt/type_info/test/ya.make b/yt/python/yt/type_info/test/ya.make new file mode 100644 index 00000000000..1363b76581c --- /dev/null +++ b/yt/python/yt/type_info/test/ya.make @@ -0,0 +1,5 @@ +RECURSE( + lib + py2 + py3 +) diff --git a/yt/python/yt/type_info/type_base.py b/yt/python/yt/type_info/type_base.py new file mode 100644 index 00000000000..57b5452e296 --- /dev/null +++ b/yt/python/yt/type_info/type_base.py @@ -0,0 +1,109 @@ +import six + +from abc import ABCMeta, abstractmethod + + +def _with_type(x): + return "<type: {!s}>: {!r}".format(type(x), x) + + +def _is_utf8(s): + if not isinstance(s, six.string_types): + return False + + if isinstance(s, six.binary_type): + try: + s.decode("utf-8", errors="strict") + return True + except UnicodeDecodeError: + return False + return isinstance(s, six.text_type) + + +def _as_utf8(s): + if isinstance(s, six.text_type): + return s + elif isinstance(s, six.binary_type): + return s.decode("utf-8", errors="strict") + else: + raise TypeError("expected string or binary type, but got {}".format(type(s))) + + +def is_valid_type(x): + return isinstance(x, Type) or x is Type + + +def validate_type(x): + if not is_valid_type(x): + raise ValueError("Expected type, but got {}".format(_with_type(x))) + + +def quote_string(s): + return u"'{}'".format(s.replace("\\", "\\\\").replace("'", "\\'")) + + +class Type(six.with_metaclass(ABCMeta)): + REQUIRED_ATTRS = ["name", "yt_type_name"] + + def __init__(self, attrs): + assert all(key in attrs for key in self.REQUIRED_ATTRS), \ + "One or more of required arguments not found in attributes" + + for name, value in attrs.items(): + setattr(self, name, value) + + def __eq__(self, other): + assert isinstance(self, Type) + + if not isinstance(other, Type): + return False + + return str(self) == str(other) + + def __ne__(self, other): + return not (self == other) + + def __hash__(self): + return hash(str(self)) + + @abstractmethod + def __str__(self): + pass + + @abstractmethod + def to_yson_type(self): + pass + + +class Primitive(Type): + def __str__(self): + return self.name + + def to_yson_type(self): + return self.yt_type_name + + +class Generic(six.with_metaclass(ABCMeta)): + def __init__(self, name, yt_type_name=None): + assert _is_utf8(name), "Name must be UTF-8, got {}".format(_with_type(name)) + self.name = name + self.yt_type_name = name.lower() + + @abstractmethod + def __getitem__(self, param): + pass + + @abstractmethod + def from_dict(self): + pass + + +def make_primitive_type(name, yt_type_name=None): + assert _is_utf8(name), "Name of primitive type must be UTF-8, got {}".format(_with_type(name)) + assert yt_type_name is None or _is_utf8(yt_type_name), \ + "YT type name of primitive type must be UTF-8, got {}".format(_with_type(name)) + + if yt_type_name is None: + yt_type_name = name.lower() + + return Primitive({"name": name, "yt_type_name": yt_type_name}) diff --git a/yt/python/yt/type_info/typing.py b/yt/python/yt/type_info/typing.py new file mode 100644 index 00000000000..07cd5c189cb --- /dev/null +++ b/yt/python/yt/type_info/typing.py @@ -0,0 +1,443 @@ +from . import type_base + +# Backward compatibility +from .type_base import ( # noqa + is_valid_type, validate_type, Type +) + +try: + import yt.yson + _TI_SERIALIZATION_AVAILABLE = True +except ImportError: + _TI_SERIALIZATION_AVAILABLE = False + +import six + + [email protected]_2_unicode_compatible +class _SingleArgumentGenericAlias(type_base.Type): + REQUIRED_ATTRS = type_base.Type.REQUIRED_ATTRS + ["item"] + + def __str__(self): + return "{}<{}>".format(self.name, self.item) + + def to_yson_type(self): + return {"type_name": self.name.lower(), "item": self.item.to_yson_type()} + + +class _SingleArgumentGeneric(type_base.Generic): + def __getitem__(self, param): + type_base.validate_type(param) + + attrs = { + "name": self.name, + "yt_type_name": self.yt_type_name, + "item": param, + } + + return _SingleArgumentGenericAlias(attrs) + + def from_dict(self, type_): + _validate_contains(type_, "item") + param = _parse_type(type_["item"]) + return self.__getitem__(param) + + +def _make_tuple_attrs(items, type_name, yt_type_name): + for item in items: + type_base.validate_type(item) + + attrs = { + "name": type_name, + "yt_type_name": yt_type_name, + "items": items, + } + + return attrs + + [email protected]_2_unicode_compatible +class _TupleAlias(type_base.Type): + REQUIRED_ATTRS = type_base.Type.REQUIRED_ATTRS + ["items"] + + def __str__(self): + return "{}<{}>".format(self.name, ", ".join(str(x) for x in self.items)) + + def to_yson_type(self): + yson_repr = { + "type_name": self.yt_type_name, + "elements": [{"type": item.to_yson_type()} for item in self.items] + } + return yson_repr + + +class _GenericTuple(type_base.Generic): + def __getitem__(self, params): + if not isinstance(params, tuple): + params = (params,) + return _TupleAlias(_make_tuple_attrs(params, self.name, self.yt_type_name)) + + def from_dict(self, type_): + _validate_contains(type_, "elements") + + elements = type_["elements"] + _validate(isinstance(elements, list), "\"elements\" must contain a list") + _validate(all(isinstance(elem, dict) for elem in elements), "\"elements\" must contain a list of maps") + + params = [] + for elem in elements: + _validate_contains(elem, "type") + params.append(_parse_type(elem["type"])) + + return self.__getitem__(tuple(params)) + + [email protected]_2_unicode_compatible +class _DictAlias(type_base.Type): + REQUIRED_ATTRS = type_base.Type.REQUIRED_ATTRS + ["key", "value"] + + def __str__(self): + return "{}<{}, {}>".format(self.name, self.key, self.value) + + def to_yson_type(self): + return {"type_name": self.yt_type_name, "key": self.key.to_yson_type(), "value": self.value.to_yson_type()} + + +class _GenericDict(type_base.Generic): + def __getitem__(self, params): + if not isinstance(params, tuple) or len(params) != 2: + raise ValueError("Expected two types in Dict, but got {}".format(type_base._with_type(params))) + key, value = params + type_base.validate_type(key) + type_base.validate_type(value) + + attrs = { + "name": self.name, + "yt_type_name": self.yt_type_name, + "key": key, + "value": value, + } + + return _DictAlias(attrs) + + def from_dict(self, type_): + _validate_contains(type_, "key") + _validate_contains(type_, "value") + key = _parse_type(type_["key"]) + value = _parse_type(type_["value"]) + return self.__getitem__((key, value)) + + +def _validate_struct_fields(items): + names = set() + for item_name, type_ in items: + if not type_base._is_utf8(item_name): + raise ValueError("Name of struct field must be UTF-8, got {}".format(type_base._with_type(item_name))) + + if not item_name: + raise ValueError("Empty field name is not allowed") + + if item_name in names: + raise ValueError("Duplicate fields are not allowed: {}".format(item_name)) + + names.add(item_name) + + +def _make_struct_attrs(params, type_name, yt_type_name): + for x in params: + if not ( + isinstance(x, slice) and + type_base._is_utf8(x.start) and + type_base.is_valid_type(x.stop) and + x.step is None + ): + raise ValueError("Expected slice in form of field:type, but got {}".format(type_base._with_type(x))) + + items = tuple((type_base._as_utf8(x.start), x.stop) for x in params) + _validate_struct_fields(items) + + attrs = { + "name": type_name, + "yt_type_name": yt_type_name, + "items": items, + } + + return attrs + + [email protected]_2_unicode_compatible +class _StructAlias(type_base.Type): + REQUIRED_ATTRS = type_base.Type.REQUIRED_ATTRS + ["items"] + + def __str__(self): + params_str = ", ".join( + u"{}: {}".format(type_base.quote_string(name), str(type_)) + for name, type_ in self.items + ) + return u"{}<{}>".format(self.name, params_str) + + def to_yson_type(self): + yson_repr = { + "type_name": self.yt_type_name, + "members": [ + {"name": name, "type": type_.to_yson_type()} + for name, type_ in self.items + ] + } + return yson_repr + + +class _GenericStruct(type_base.Generic): + def __getitem__(self, params): + if not isinstance(params, tuple): + params = (params,) + return _StructAlias(_make_struct_attrs(params, self.name, self.yt_type_name)) + + def from_dict(self, type_): + _validate_contains(type_, "members") + + members = type_["members"] + _validate(isinstance(members, list), "\"members\" must contain a list") + _validate(all(isinstance(member, dict) for member in members), "\"members\" must contain a list of maps") + + params = [] + for member in members: + _validate_contains(member, "name") + _validate_contains(member, "type") + + _validate(type_base._is_utf8(member["name"]), "\"name\" must contain a string") + params.append(slice(member["name"], _parse_type(member["type"]))) + + return self.__getitem__(tuple(params)) + + +class _GenericVariant(type_base.Generic): + def __getitem__(self, params): + if not isinstance(params, tuple): + params = (params,) + + if isinstance(params[0], slice): + attrs = _make_struct_attrs(params, self.name, self.yt_type_name) + attrs["underlying"] = "Struct" + return _StructAlias(attrs) + else: + attrs = _make_tuple_attrs(params, self.name, self.yt_type_name) + attrs["underlying"] = "Tuple" + return _TupleAlias(attrs) + + def from_dict(self, type_): + _validate("elements" in type_ or "members" in type_, "missing both keys \"members\" and \"elements\"") + _validate(not ("elements" in type_ and "members" in type_), "both keys \"members\" and \"elements\" are present") + + if "elements" in type_: + cls = Tuple.from_dict(type_) + cls.underlying = Tuple.name + else: + cls = Struct.from_dict(type_) + cls.underlying = Struct.name + + cls.name = self.name + cls.yt_type_name = self.yt_type_name + return cls + + [email protected]_2_unicode_compatible +class _TaggedAlias(type_base.Type): + REQUIRED_ATTRS = type_base.Type.REQUIRED_ATTRS + ["item", "tag"] + + def __str__(self): + return u"{}<{}, {}>".format(self.name, str(self.item), type_base.quote_string(self.tag)) + + def to_yson_type(self): + yson_repr = { + "type_name": self.yt_type_name, + "item": self.item.to_yson_type(), + "tag": self.tag, + } + return yson_repr + + +class _GenericTagged(type_base.Generic): + def __getitem__(self, params): + if not ( + isinstance(params, tuple) and + len(params) == 2 and + type_base.is_valid_type(params[0]) and + (isinstance(params[1], six.string_types) or isinstance(params[1], bytes)) + ): + raise ValueError("Expected type and tag, but got {}".format(type_base._with_type(params))) + item, tag = params + if not type_base._is_utf8(tag): + raise ValueError("Tag must be UTF-8, got {}".format(type_base._with_type(tag))) + tag = type_base._as_utf8(tag) + + attrs = { + "name": self.name, + "yt_type_name": self.yt_type_name, + "item": item, + "tag": tag, + } + + return _TaggedAlias(attrs) + + def from_dict(self, type_): + _validate_contains(type_, "tag") + _validate_contains(type_, "item") + + tag = type_["tag"] + _validate(type_base._is_utf8(tag), "\"tag\" must contain a string") + + item = _parse_type(type_["item"]) + return self.__getitem__((item, tag,)) + + [email protected]_2_unicode_compatible +class _DecimalAlias(type_base.Type): + REQUIRED_ATTRS = type_base.Type.REQUIRED_ATTRS + ["precision", "scale"] + + def __str__(self): + return "{}({}, {})".format(self.name, self.precision, self.scale) + + def to_yson_type(self): + yson_repr = { + "type_name": self.yt_type_name, + "precision": self.precision, + "scale": self.scale, + } + return yson_repr + + +class _GenericDecimal(type_base.Generic): + def _create_alias(self, precision, scale): + if not isinstance(precision, six.integer_types): + raise ValueError("Expected integer, but got {}".format(type_base._with_type(precision))) + + if not isinstance(scale, six.integer_types): + raise ValueError("Expected integer, but got {}".format(type_base._with_type(scale))) + + attrs = { + "name": self.name, + "yt_type_name": self.yt_type_name, + "precision": precision, + "scale": scale, + } + + return _DecimalAlias(attrs) + + def __call__(self, precision, scale): + return self._create_alias(precision, scale) + + def __getitem__(self, params): + if not isinstance(params, tuple) or len(params) != 2: + raise ValueError("Expected precision and scale integer parameters") + + precision, scale = params + return self._create_alias(precision, scale) + + def from_dict(self, type_): + _validate_contains(type_, "precision") + _validate_contains(type_, "scale") + + return self.__getitem__((type_["precision"], type_["scale"],)) + + +Bool = type_base.make_primitive_type("Bool") +Int8 = type_base.make_primitive_type("Int8") +Uint8 = type_base.make_primitive_type("Uint8") +Int16 = type_base.make_primitive_type("Int16") +Uint16 = type_base.make_primitive_type("Uint16") +Int32 = type_base.make_primitive_type("Int32") +Uint32 = type_base.make_primitive_type("Uint32") +Int64 = type_base.make_primitive_type("Int64") +Uint64 = type_base.make_primitive_type("Uint64") +Float = type_base.make_primitive_type("Float") +Double = type_base.make_primitive_type("Double") +String = type_base.make_primitive_type("String") +Utf8 = type_base.make_primitive_type("Utf8") +Yson = type_base.make_primitive_type("Yson") +Json = type_base.make_primitive_type("Json") +Uuid = type_base.make_primitive_type("Uuid") +Date = type_base.make_primitive_type("Date") +Datetime = type_base.make_primitive_type("Datetime") +Timestamp = type_base.make_primitive_type("Timestamp") +Interval = type_base.make_primitive_type("Interval") +TzDate = type_base.make_primitive_type("TzDate", yt_type_name="tz_date") +TzDatetime = type_base.make_primitive_type("TzDatetime", yt_type_name="tz_datetime") +TzTimestamp = type_base.make_primitive_type("TzTimestamp", yt_type_name="tz_timestamp") + +Void = type_base.make_primitive_type("Void") +Null = type_base.make_primitive_type("Null") + +Optional = _SingleArgumentGeneric("Optional") +List = _SingleArgumentGeneric("List") +Tuple = _GenericTuple("Tuple") +Dict = _GenericDict("Dict") +Struct = _GenericStruct("Struct") +Variant = _GenericVariant("Variant") +Tagged = _GenericTagged("Tagged") +Decimal = _GenericDecimal("Decimal") + + +EmptyTuple = Tuple.__getitem__(tuple()) +EmptyStruct = Struct.__getitem__(tuple()) + +PRIMITIVES = {type_.yt_type_name: type_ for type_ in locals().values() if isinstance(type_, type_base.Primitive)} +GENERICS = {type_.yt_type_name: type_ for type_ in locals().values() if isinstance(type_, type_base.Generic)} + + +def _validate(condition, *error_args): + if not condition: + raise ValueError(*error_args) + + +def _validate_contains(dict, key): + _validate(key in dict, "missing required key \"{}\"".format(key)) + + +def _parse_type(type_description): + if isinstance(type_description, six.string_types): + _validate(type_description in PRIMITIVES, "unknown type \"{}\"".format(type_description)) + return PRIMITIVES[type_description] + + _validate(isinstance(type_description, dict), + "type must be either a string or a map, got {}".format(type_base._with_type(type_description))) + + _validate_contains(type_description, "type_name") + + type_name = type_description["type_name"] + _validate(isinstance(type_name, six.string_types), "\"type_name\" must contain a string") + + _validate(type_name in PRIMITIVES or type_name in GENERICS, "unknown type \"{}\"".format(type_name)) + + if type_name in PRIMITIVES: + return PRIMITIVES[type_name] + + return GENERICS[type_name].from_dict(type_description) + + +def _check_serialization_available(): + if not _TI_SERIALIZATION_AVAILABLE: + raise ImportError("Module `yt.yson` is required to use type_info serialization. " + "Make sure you have yt/python/yt/yson in your PEERDIR") + + +def serialize_yson(type_, human_readable=False): + _check_serialization_available() + + if not type_base.is_valid_type(type_): + raise TypeError("serialize_yson can be called only for types") + + return yt.yson.dumps(type_, yson_format="pretty" if human_readable else "binary") + + +def deserialize_yson(yson): + _check_serialization_available() + + if type(yson) is six.text_type and six.PY3: + yson = yson.encode("utf-8") + + try: + type_description = yt.yson.loads(yson) + return _parse_type(type_description) + except (ValueError, yt.yson.YsonError) as e: + six.raise_from(ValueError("deserialization failed: {}".format(e)), e) diff --git a/yt/python/yt/type_info/ya.make b/yt/python/yt/type_info/ya.make new file mode 100644 index 00000000000..22777dca01d --- /dev/null +++ b/yt/python/yt/type_info/ya.make @@ -0,0 +1,19 @@ +PY23_LIBRARY() + +PY_SRCS( + NAMESPACE yt.type_info + + __init__.py + typing.py + type_base.py +) + +PEERDIR( + contrib/python/six +) + +END() + +RECURSE( + test +) diff --git a/yt/python/yt/ya.make b/yt/python/yt/ya.make new file mode 100644 index 00000000000..7d72655d65d --- /dev/null +++ b/yt/python/yt/ya.make @@ -0,0 +1,26 @@ +PY23_LIBRARY() + +PEERDIR( + contrib/python/simplejson + contrib/python/six + yt/python/yt/type_info +) + +IF(LINUX) + PEERDIR( + library/python/prctl + ) +ENDIF() + +PY_SRCS( + NAMESPACE yt + + __init__.py + common.py + json_wrapper.py + logger.py + logger_config.py + subprocess_wrapper.py +) + +END() diff --git a/yt/python/yt/yson/__init__.py b/yt/python/yt/yson/__init__.py new file mode 100644 index 00000000000..1bfcd8595e7 --- /dev/null +++ b/yt/python/yt/yson/__init__.py @@ -0,0 +1,86 @@ +""" +YSON library. + +Package supports `YT YSON format <https://ytsaurus.tech/docs/en/user-guide/storage/yson>`_. + +Package provides special classes for all yson types, see :mod:`yson_types <yt.yson.yson_types>` module. +Also it provides methods for serialization and deserialization yson data: +:func:`load <yt.yson.parser.load>`, :func:`loads <yt.yson.parser.loads>`, +:func:`dump <yt.yson.writer.dump>`, :func:`dumps <yt.yson.writer.dumps>`. +And finally it provides method :func:`to_yson_type <yt.yson.convert.to_yson_type>` for conversion +python objects to special yson types. + +In special variable `TYPE` you can find implementation type of the library. +In equals "BINARY" if c++ bindings found and "PYTHON" otherwise. + +Examples: + +>>> import yt.yson as yson +>>> yson.loads("{a=10}") +{'a': 10} + +>>> yson.dumps(True) +'"true"' + +>>> number = yson.YsonInteger(10) +>>> number.attributes["my_attr"] = "hello" +>>> yson.dumps(number) +'<"attr"="hello">10' + +>>> boolean = to_yson_type(False, attributes={"my_attr": "my_value"}) + +""" + +from __future__ import print_function + +from . import writer # noqa +from . import parser # noqa +from . import yson_types # noqa + +TYPE = None +try: + from yt_yson_bindings import load, loads, dump, dumps, dump_parquete # noqa + TYPE = "BINARY" +except ImportError as error: + # XXX(asaitgalin): Sometimes module can't be imported because + # it depends on missing dynamic libraries (e.g. libatomic). In this case + # diagnostic is printed to stderr. + message = str(error) + if "No module named" not in message: + import sys as _sys + print("Warning! Failed to import YSON bindings: " + message, file=_sys.stderr) + +if TYPE is None: + from .parser import load, loads # noqa + from .writer import dump, dumps # noqa + TYPE = "PYTHON" + +from .yson_types import ( # noqa + YsonString, YsonUnicode, YsonInt64, YsonUint64, YsonDouble, + YsonBoolean, YsonList, YsonMap, YsonEntity, YsonType, YsonStringProxy, + is_unicode, get_bytes, make_byte_key) + +from .convert import to_yson_type, yson_to_json, json_to_yson # noqa +from .common import YsonError # noqa + + +def _loads_from_native_str(string, encoding="utf-8", **kwargs): + import sys + + if sys.version_info[0] < 3: + return loads(string, **kwargs) + + if isinstance(string, str): + string = string.encode(encoding) + + return loads(string, encoding=encoding, **kwargs) + + +def _dumps_to_native_str(obj, encoding="utf-8", **kwargs): + import sys + + if sys.version_info[0] < 3: + return dumps(obj, **kwargs) + + s = dumps(obj, encoding=encoding, **kwargs) + return s.decode(encoding) diff --git a/yt/python/yt/yson/common.py b/yt/python/yt/yson/common.py new file mode 100644 index 00000000000..a50526c0557 --- /dev/null +++ b/yt/python/yt/yson/common.py @@ -0,0 +1,69 @@ +from yt.common import YtError + +try: + from yt.packages.six import int2byte, indexbytes +except ImportError: + from six import int2byte, indexbytes + + +class YsonError(YtError): + pass + + +def raise_yson_error(message, position_info): + line_index, position, offset = position_info + raise YsonError(message, attributes={"line": line_index, "position": position, "offset": offset}) + + +class StreamWrap(object): + def __init__(self, stream, header, footer): + self.stream = stream + self.header = header + self.footer = footer + + self.pos = 0 + self.state = 0 + + def read(self, n): + if n == 0: + return self.stream.read(0) + + assert n == 1 + + if self.state == 0: + if self.pos == len(self.header): + self.state += 1 + else: + res = int2byte(indexbytes(self.header, self.pos)) + self.pos += 1 + return res + + if self.state == 1: + sym = self.stream.read(1) + if sym: + return sym + else: + self.state += 1 + self.pos = 0 + + if self.state == 2: + if self.pos == len(self.footer): + self.state += 1 + else: + res = int2byte(indexbytes(self.footer, self.pos)) + self.pos += 1 + return res + + if self.state == 3: + return b"" + + +_ENCODING_SENTINEL = object() + +# Binary literals markers +STRING_MARKER = int2byte(1) +INT64_MARKER = int2byte(2) +DOUBLE_MARKER = int2byte(3) +FALSE_MARKER = int2byte(4) +TRUE_MARKER = int2byte(5) +UINT64_MARKER = int2byte(6) diff --git a/yt/python/yt/yson/convert.py b/yt/python/yt/yson/convert.py new file mode 100644 index 00000000000..3868fa42441 --- /dev/null +++ b/yt/python/yt/yson/convert.py @@ -0,0 +1,162 @@ +from .yson_types import ( + YsonType, YsonString, YsonUnicode, YsonBoolean, YsonInt64, YsonUint64, YsonDouble, + YsonList, YsonMap, YsonEntity) +from .common import YsonError + +try: + from yt.packages.six import text_type, binary_type, integer_types, iteritems, PY3 + from yt.packages.six.moves import map as imap +except ImportError: + from six import text_type, binary_type, integer_types, iteritems, PY3 + from six.moves import map as imap + +import copy + + +def to_yson_type(value, attributes=None, always_create_attributes=True, encoding="utf-8"): + """Wraps value with YSON type.""" + if not always_create_attributes and attributes is None: + if isinstance(value, text_type) and not PY3: + return value.encode("utf-8") + return value + + if isinstance(value, YsonType): + if attributes is not None: + value = copy.deepcopy(value) + value.attributes = attributes + return value + + if isinstance(value, text_type): + if PY3: + result = YsonUnicode(value) + else: # COMPAT + result = YsonString(value.encode("utf-8")) + elif isinstance(value, binary_type): + result = YsonString(value) + elif value is False or value is True: + result = YsonBoolean(value) + elif isinstance(value, integer_types): + if value < -2 ** 63 or value >= 2 ** 64: + raise TypeError("Integer {0} cannot be represented in YSON " + "since it is out of range [-2^63, 2^64 - 1])".format(value)) + greater_than_max_int64 = value >= 2 ** 63 + if greater_than_max_int64 or isinstance(value, YsonUint64): + result = YsonUint64(value) + else: + result = YsonInt64(value) + elif isinstance(value, float): + result = YsonDouble(value) + elif isinstance(value, list): + result = YsonList(value) + elif isinstance(value, dict): + result = YsonMap(value) + else: + result = YsonEntity() + + if attributes is not None: + result.attributes = attributes + else: + result.attributes = {} + + return result + + +# TODO(ignat): Should we make auto-detection for use_byte_strings? +def json_to_yson(json_tree, use_byte_strings=None): + """Converts json representation to YSON representation.""" + def to_literal(string): + if use_byte_strings: + return string.encode("ascii") + else: + return string + + def decode_key(string): + # In yt wrapper we expect here correct keys, but other usages in arcadia could not give this guarantee. + # TODO(ignat): fix this usages. + if use_byte_strings: + if not isinstance(string, binary_type): + string = string.encode("ascii") + else: + if not isinstance(string, text_type): + string = string.decode("ascii") + + if string.startswith(to_literal("$")): + if not string.startswith(to_literal("$$")): + raise YsonError("Keys should not start with single dollar sign") + string = string[1:] + return string + + if use_byte_strings is None: + use_byte_strings = not PY3 + + has_attrs = isinstance(json_tree, dict) and to_literal("$value") in json_tree + value = json_tree[to_literal("$value")] if has_attrs else json_tree + if isinstance(value, text_type): + result = YsonUnicode(value) + elif isinstance(value, binary_type): + result = YsonString(value) + elif value is False or value is True: + result = YsonBoolean(value) + elif isinstance(value, integer_types): + greater_than_max_int64 = value >= 2 ** 63 + if greater_than_max_int64: + result = YsonUint64(value) + else: + result = YsonInt64(value) + elif isinstance(value, float): + result = YsonDouble(value) + elif isinstance(value, list): + result = YsonList(imap(lambda item: json_to_yson(item, use_byte_strings=use_byte_strings), value)) + elif isinstance(value, dict): + result = YsonMap((decode_key(k), json_to_yson(v, use_byte_strings=use_byte_strings)) for k, v in iteritems(YsonMap(value))) + elif value is None: + result = YsonEntity() + else: + raise YsonError("Unknown type:", type(value)) + + if has_attrs and json_tree.get(to_literal("$attributes"), {}): + result.attributes = json_to_yson(json_tree[to_literal("$attributes")], use_byte_strings=use_byte_strings) + return result + + +def yson_to_json(yson_tree, print_attributes=True): + def encode_key(key): + if PY3 and isinstance(key, binary_type): + key = key.decode("ascii") + if key and key[0] == "$": + return "$" + key + return key + + def process_dict(d): + return dict((encode_key(k), yson_to_json(v)) for k, v in iteritems(d)) + + if hasattr(yson_tree, "attributes") and yson_tree.attributes and print_attributes: + return {"$attributes": process_dict(yson_tree.attributes), + "$value": yson_to_json(yson_tree, print_attributes=False)} + if isinstance(yson_tree, list): + return list(imap(yson_to_json, yson_tree)) + elif isinstance(yson_tree, dict): + return process_dict(yson_tree) + elif isinstance(yson_tree, YsonEntity): + return None + elif PY3 and (isinstance(yson_tree, YsonString) or isinstance(yson_tree, binary_type)): + return yson_tree.decode("utf-8") + elif isinstance(yson_tree, bool) or isinstance(yson_tree, YsonBoolean): + return True if yson_tree else False + else: + if type(yson_tree) is YsonEntity: + return None + + bases = type(yson_tree).__bases__ + iter = 0 + while len(bases) == 1 and YsonType not in bases: + bases = bases[0].__bases__ + iter += 1 + + if YsonType in bases: + other_types = list(set(bases) - set([YsonType])) + if not other_types: + raise RuntimeError("Failed to perform yson to json conversion of {!r}".format(yson_tree)) + other = other_types[0] + return other(yson_tree) + return yson_tree diff --git a/yt/python/yt/yson/lexer.py b/yt/python/yt/yson/lexer.py new file mode 100644 index 00000000000..a6b70cd41e2 --- /dev/null +++ b/yt/python/yt/yson/lexer.py @@ -0,0 +1,392 @@ +from . import yson_types +from .yson_token import ( + YsonToken, + TOKEN_STRING, + TOKEN_INT64, + TOKEN_UINT64, + TOKEN_DOUBLE, + TOKEN_BOOLEAN, + TOKEN_HASH, + TOKEN_LEFT_PARENTHESIS, + TOKEN_RIGHT_PARENTHESIS, + TOKEN_COMMA, + TOKEN_COLON, + TOKEN_SEMICOLON, + TOKEN_LEFT_ANGLE, + TOKEN_EQUALS, + TOKEN_RIGHT_ANGLE, + TOKEN_LEFT_BRACKET, + TOKEN_RIGHT_BRACKET, + TOKEN_LEFT_BRACE, + TOKEN_RIGHT_BRACE) + +from .common import ( + raise_yson_error, _ENCODING_SENTINEL, + STRING_MARKER, INT64_MARKER, DOUBLE_MARKER, + FALSE_MARKER, TRUE_MARKER, UINT64_MARKER) + +try: + from yt.packages.six.moves import xrange + from yt.packages.six import int2byte, iterbytes +except ImportError: + from six.moves import xrange + from six import int2byte, iterbytes + +import struct + +_SEEMS_INT64 = int2byte(0) +_SEEMS_UINT64 = int2byte(1) +_SEEMS_DOUBLE = int2byte(2) + +PERCENT_LITERALS = [b"true", b"false", b"nan", b"inf", b"-inf", b"+inf"] +PERCENT_LITERAL_LENGTH = dict((s[0:1], len(s)) for s in PERCENT_LITERALS) +assert len(PERCENT_LITERALS) == len(PERCENT_LITERAL_LENGTH) + + +def _get_numeric_type(string): + for code in iterbytes(string): + ch = int2byte(code) + if ch == b"E" or ch == b"e" or ch == b".": + return _SEEMS_DOUBLE + elif ch == b"u": + return _SEEMS_UINT64 + return _SEEMS_INT64 + + +def _zig_zag_decode(value): + return (value >> 1) ^ -(value & 1) + + +class YsonLexer(object): + def __init__(self, stream, encoding=None, output_buffer=None): + assert (encoding is _ENCODING_SENTINEL) != (output_buffer is None), \ + "Exactly one of encoding and output_buffer parameters must be specified" + + self._line_index = 1 + self._position = 1 + self._offset = 0 + self._stream = stream + self._lookahead = None + self._encoding = encoding + self._output_buffer = output_buffer + + def _get_start_state(self, ch): + tokens = { + b"#": TOKEN_HASH, + b"(": TOKEN_LEFT_PARENTHESIS, + b")": TOKEN_RIGHT_PARENTHESIS, + b",": TOKEN_COMMA, + b":": TOKEN_COLON, + b";": TOKEN_SEMICOLON, + b"<": TOKEN_LEFT_ANGLE, + b"=": TOKEN_EQUALS, + b">": TOKEN_RIGHT_ANGLE, + b"[": TOKEN_LEFT_BRACKET, + b"]": TOKEN_RIGHT_BRACKET, + b"{": TOKEN_LEFT_BRACE, + b"}": TOKEN_RIGHT_BRACE, + } + return tokens.get(ch) + + def get_next_token(self): + self._skip_whitespaces() + ch = self._peek_char() + if not ch: + return YsonToken() + + if ch == STRING_MARKER: + return YsonToken(value=self._parse_string(), type=TOKEN_STRING) + + elif ch == b"_" or ch == b'"' or ch.isalpha(): + return YsonToken(value=self._parse_string(), type=TOKEN_STRING) + elif ch == INT64_MARKER: + return YsonToken(value=self._parse_binary_int64(), type=TOKEN_INT64) + + elif ch == UINT64_MARKER: + return YsonToken(value=self._parse_binary_uint64(), type=TOKEN_UINT64) + + elif ch == DOUBLE_MARKER: + return YsonToken(value=self._parse_binary_double(), type=TOKEN_DOUBLE) + + elif ch == FALSE_MARKER: + self._expect_char(ch) + return YsonToken(value=self._maybe_value(False), type=TOKEN_BOOLEAN) + + elif ch == TRUE_MARKER: + self._expect_char(ch) + return YsonToken(value=self._maybe_value(True), type=TOKEN_BOOLEAN) + + elif ch == b"%": + value, token_type = self._parse_percent_literal() + return YsonToken(value=self._maybe_value(value), type=token_type) + + elif ch == b"#": + return YsonToken(value=self._parse_entity(), type=TOKEN_HASH) + + elif ch == b"+" or ch == b"-" or ch.isdigit(): + value, token_type = self._parse_numeric() + return YsonToken(value=self._maybe_value(value), type=token_type) + + state = self._get_start_state(ch) + self._read_char() + return YsonToken(value=self._maybe_value(ch), type=state) + + def get_position_info(self): + return self._line_index, self._position, self._offset + + def _maybe_value(self, value): + if self._output_buffer is None: + return value + return None + + def _read_char(self, binary_input=False): + if self._lookahead is None: + result = self._stream.read(1) + else: + result = self._lookahead + self._lookahead = None + + self._offset += 1 + if not binary_input and result == b"\n": + self._line_index += 1 + self._position = 1 + else: + self._position += 1 + + if self._output_buffer is not None: + self._output_buffer.append(ord(result)) + + return result + + def _peek_char(self): + if self._lookahead is not None: + return self._lookahead + self._lookahead = self._stream.read(1) + return self._lookahead + + def _read_binary_chars(self, char_count): + if self._output_buffer is not None: + string = self._stream.read(char_count) + self._position += len(string) + if len(string) != char_count: + raise_yson_error( + "Premature end-of-stream while reading byte {0} out of {1}".format(len(string) + 1, char_count), + self.get_position_info()) + self._output_buffer += string + return string + + result = [] + for i in xrange(char_count): + ch = self._read_char(True) + if not ch: + raise_yson_error( + "Premature end-of-stream while reading byte {0} out of {1}".format(i + 1, char_count), + self.get_position_info()) + result.append(ch) + return b"".join(result) + + def _expect_char(self, expected_ch): + read_ch = self._read_char() + if not read_ch: + raise_yson_error( + 'Premature end-of-stream expecting "{0}" in Yson'.format(expected_ch), + self.get_position_info()) + if read_ch != expected_ch: + raise_yson_error( + 'Found "{0}" while expecting "{1}" in Yson'.format(read_ch, expected_ch), + self.get_position_info()) + + def _skip_whitespaces(self): + while self._peek_char().isspace(): + self._read_char() + + def _read_string(self): + ch = self._peek_char() + if not ch: + raise_yson_error( + "Premature end-of-stream while expecting string literal in Yson", + self.get_position_info()) + if ch == STRING_MARKER: + return self._read_binary_string() + if ch == b'"': + return self._read_quoted_string() + if not ch.isalpha() and not ch == b"_" and not ch == b"%": + raise_yson_error( + "Expecting string literal but found {0} in Yson".format(ch), + self.get_position_info()) + return self._read_unquoted_string() + + def _read_binary_string(self): + self._expect_char(STRING_MARKER) + length = _zig_zag_decode(self._read_varint()) + string = self._read_binary_chars(length) + if self._output_buffer is None: + return self._decode_string(string) + + def _read_varint(self): + count = 0 + result = 0 + read_next = True + while read_next: + ch = self._read_char() + if not ch: + raise_yson_error( + "Premature end-of-stream while reading varinteger in Yson", + self.get_position_info()) + byte = ord(ch) + result |= (byte & 0x7F) << (7 * count) + if result > 2 ** 64 - 1: + raise_yson_error( + "Varinteger is too large for Int64 in Yson", + self.get_position_info()) + count += 1 + read_next = byte & 0x80 != 0 + + return yson_types._YsonIntegerBase(result) + + def _read_quoted_string(self): + self._expect_char(b'"') + if self._output_buffer is None: + result = [] + pending_next_char = False + while True: + ch = self._read_char() + if not ch: + raise_yson_error( + "Premature end-of-stream while reading string literal in Yson", + self.get_position_info()) + if ch == b'"' and not pending_next_char: + break + if self._output_buffer is None: + result.append(ch) + if pending_next_char: + pending_next_char = False + elif ch == b"\\": + pending_next_char = True + if self._output_buffer is None: + return self._decode_string(self._unescape(b"".join(result))) + + def _unescape(self, string): + return string.decode("unicode_escape").encode("latin1") + + def _decode_string(self, string): + assert self._encoding is not _ENCODING_SENTINEL + if self._encoding is not None: + try: + return string.decode(self._encoding) + except UnicodeDecodeError: + proxy = yson_types.YsonStringProxy() + proxy._bytes = string + return proxy + else: + return string + + def _read_unquoted_string(self): + if self._output_buffer is None: + result = [] + while True: + ch = self._peek_char() + if ch and (ch.isalpha() or ch.isdigit() or ch in b"_%-."): + self._read_char() + if self._output_buffer is None: + result.append(ch) + else: + break + if self._output_buffer is None: + return self._decode_string(b"".join(result)) + + def _read_numeric(self): + result = [] + while True: + ch = self._peek_char() + if not ch or not (ch.isdigit() or ch in b"+-.eEu"): + break + self._read_char() + result.append(ch) + if not result: + raise_yson_error( + "Premature end-of-stream while parsing numeric literal in Yson", + self.get_position_info()) + return b"".join(result) + + def _parse_percent_literal(self): + def raise_unexpected(string): + expected = [b"%" + literal for literal in PERCENT_LITERALS] + raise_yson_error( + "Incorrect percent-preceded literal %s, expected one of %s" % (b"%" + string, expected), + self.get_position_info()) + self._expect_char(b"%") + ch = self._peek_char() + if ch not in PERCENT_LITERAL_LENGTH: + raise_unexpected(ch) + string = self._read_binary_chars(PERCENT_LITERAL_LENGTH[ch]) + if string == b"true": + return True, TOKEN_BOOLEAN + elif string == b"false": + return False, TOKEN_BOOLEAN + elif string in PERCENT_LITERALS: + return float(string), TOKEN_DOUBLE + else: + raise_unexpected(string) + + def _parse_entity(self): + self._expect_char(b"#") + return None + + def _parse_string(self): + return self._read_string() + + def _parse_binary_int64(self): + self._expect_char(INT64_MARKER) + varint = self._read_varint() + if self._output_buffer is None: + return _zig_zag_decode(varint) + + def _parse_binary_uint64(self): + self._expect_char(UINT64_MARKER) + varint = self._read_varint() + if self._output_buffer is None: + return yson_types.YsonUint64(varint) + + def _parse_binary_double(self): + self._expect_char(DOUBLE_MARKER) + bytes_ = self._read_binary_chars(struct.calcsize(b"<d")) + if self._output_buffer is None: + return struct.unpack(b"<d", bytes_)[0] + + def _parse_numeric(self): + string = self._read_numeric() + numeric_type = _get_numeric_type(string) + if numeric_type == _SEEMS_INT64: + try: + result = yson_types._YsonIntegerBase(string) + token_type = TOKEN_INT64 + if result > 2 ** 63 - 1 or result < -(2 ** 63): + raise ValueError() + except ValueError: + raise_yson_error( + "Failed to parse Int64 literal {0} in Yson".format(string), + self.get_position_info()) + elif numeric_type == _SEEMS_UINT64: + try: + if string.endswith(b"u"): + string = string[:-1] + else: + raise ValueError() + result = yson_types.YsonUint64(int(string)) + token_type = TOKEN_UINT64 + if result > 2 ** 64 - 1: + raise ValueError() + except ValueError: + raise_yson_error( + "Failed to parse Uint64 literal {0} in Yson".format(string), + self.get_position_info()) + else: + try: + result = float(string) + token_type = TOKEN_DOUBLE + except ValueError: + raise_yson_error( + "Failed to parse Double literal {0} in Yson".format(string), + self.get_position_info()) + return result, token_type diff --git a/yt/python/yt/yson/parser.py b/yt/python/yt/yson/parser.py new file mode 100644 index 00000000000..746ee6806bf --- /dev/null +++ b/yt/python/yt/yson/parser.py @@ -0,0 +1,287 @@ +from . import convert +from .common import raise_yson_error, YsonError, StreamWrap, _ENCODING_SENTINEL +from .tokenizer import YsonTokenizer +from .yson_token import ( + TOKEN_STRING, + TOKEN_INT64, + TOKEN_UINT64, + TOKEN_DOUBLE, + TOKEN_BOOLEAN, + TOKEN_HASH, + TOKEN_SEMICOLON, + TOKEN_LEFT_ANGLE, + TOKEN_EQUALS, + TOKEN_RIGHT_ANGLE, + TOKEN_LEFT_BRACKET, + TOKEN_RIGHT_BRACKET, + TOKEN_LEFT_BRACE, + TOKEN_RIGHT_BRACE, + TOKEN_START_OF_STREAM, + TOKEN_END_OF_STREAM, +) + +try: + from yt.packages.six import PY3, BytesIO, text_type +except ImportError: + from six import PY3, BytesIO, text_type + + +def _is_text_reader(stream): + return type(stream.read(0)) is text_type + + +class YsonParser(object): + def __init__(self, stream, encoding, always_create_attributes): + # COMPAT: Before porting YSON to Python 3 it supported parsing from + # unicode strings. + if _is_text_reader(stream) and PY3: + raise TypeError("Only binary streams are supported by YSON parser") + self._tokenizer = YsonTokenizer(stream, encoding) + self._always_create_attributes = always_create_attributes + self._encoding = encoding + + def _has_attributes(self): + try: + self._tokenizer.parse_next() + except YsonError: + return False + return self._tokenizer.get_current_type() == TOKEN_LEFT_ANGLE + + def _parse_attributes(self): + self._tokenizer.get_current_token().expect_type(TOKEN_LEFT_ANGLE) + result = {} + while True: + self._tokenizer.parse_next() + if self._tokenizer.get_current_type() == TOKEN_RIGHT_ANGLE: + break + self._tokenizer.get_current_token().expect_type(TOKEN_STRING) + key = self._tokenizer.get_current_token().get_value() + if not key: + raise_yson_error( + "Empty attribute name in Yson", + self._tokenizer.get_position_info()) + self._tokenizer.parse_next() + self._tokenizer.get_current_token().expect_type(TOKEN_EQUALS) + self._tokenizer.parse_next() + value = self._parse_any() + if key in result: + raise_yson_error( + 'Repeated attribute "{0}" in Yson'.format(key), + self._tokenizer.get_position_info()) + result[key] = value + self._tokenizer.parse_next() + if self._tokenizer.get_current_type() == TOKEN_RIGHT_ANGLE: + break + self._tokenizer.get_current_token().expect_type(TOKEN_SEMICOLON) + self._tokenizer.get_current_token().expect_type(TOKEN_RIGHT_ANGLE) + return result + + def _parse_list(self): + self._tokenizer.get_current_token().expect_type(TOKEN_LEFT_BRACKET) + result = [] + while True: + self._tokenizer.parse_next() + if self._tokenizer.get_current_type() == TOKEN_RIGHT_BRACKET: + break + value = self._parse_any() + result.append(value) + self._tokenizer.parse_next() + if self._tokenizer.get_current_type() == TOKEN_RIGHT_BRACKET: + break + self._tokenizer.get_current_token().expect_type(TOKEN_SEMICOLON) + self._tokenizer.get_current_token().expect_type(TOKEN_RIGHT_BRACKET) + return result + + def _parse_map(self): + self._tokenizer.get_current_token().expect_type(TOKEN_LEFT_BRACE) + result = {} + while True: + self._tokenizer.parse_next() + if self._tokenizer.get_current_type() == TOKEN_RIGHT_BRACE: + break + self._tokenizer.get_current_token().expect_type(TOKEN_STRING) + key = self._tokenizer.get_current_token().get_value() + self._tokenizer.parse_next() + self._tokenizer.get_current_token().expect_type(TOKEN_EQUALS) + self._tokenizer.parse_next() + value = self._parse_any() + if key in result: + raise_yson_error( + 'Duplicate map key "{0}" in YSON'.format(key), + self._tokenizer.get_position_info()) + result[key] = value + self._tokenizer.parse_next() + if self._tokenizer.get_current_type() == TOKEN_RIGHT_BRACE: + break + self._tokenizer.get_current_token().expect_type(TOKEN_SEMICOLON) + self._tokenizer.get_current_token().expect_type(TOKEN_RIGHT_BRACE) + return result + + def _parse_any(self): + if self._tokenizer.get_current_type() == TOKEN_START_OF_STREAM: + self._tokenizer.parse_next() + attributes = None + if self._tokenizer.get_current_type() == TOKEN_LEFT_ANGLE: + attributes = self._parse_attributes() + self._tokenizer.parse_next() + + if self._tokenizer.get_current_type() == TOKEN_END_OF_STREAM: + raise_yson_error( + "Premature end-of-stream in Yson", + self._tokenizer.get_position_info()) + + if self._tokenizer.get_current_type() == TOKEN_LEFT_BRACKET: + result = self._parse_list() + + elif self._tokenizer.get_current_type() == TOKEN_LEFT_BRACE: + result = self._parse_map() + + elif self._tokenizer.get_current_type() == TOKEN_HASH: + result = None + + else: + self._tokenizer.get_current_token().expect_type((TOKEN_BOOLEAN, TOKEN_INT64, TOKEN_UINT64, + TOKEN_STRING, TOKEN_DOUBLE)) + result = self._tokenizer.get_current_token().get_value() + + return convert.to_yson_type( + result, + attributes=attributes, + always_create_attributes=self._always_create_attributes, + encoding=self._encoding, + ) + + def parse(self): + result = self._parse_any() + self._tokenizer.parse_next() + self._tokenizer.get_current_token().expect_type(TOKEN_END_OF_STREAM) + return result + + +class RawYsonParser(object): + def __init__(self, stream): + if _is_text_reader(stream) and PY3: + raise TypeError("Only binary streams are supported by YSON parser") + self._buffer = bytearray() + self._tokenizer = YsonTokenizer(stream, output_buffer=self._buffer) + + def _parse_mapping(self, end_token): + while True: + self._tokenizer.parse_next() + if self._tokenizer.get_current_type() == end_token: + break + self._tokenizer.get_current_token().expect_type(TOKEN_STRING) + self._tokenizer.parse_next() + self._tokenizer.get_current_token().expect_type(TOKEN_EQUALS) + self._tokenizer.parse_next() + self._parse_any() + self._tokenizer.parse_next() + if self._tokenizer.get_current_type() == end_token: + break + self._tokenizer.get_current_token().expect_type(TOKEN_SEMICOLON) + self._tokenizer.get_current_token().expect_type(end_token) + + def _parse_attributes(self): + self._tokenizer.get_current_token().expect_type(TOKEN_LEFT_ANGLE) + self._parse_mapping(TOKEN_RIGHT_ANGLE) + + def _parse_map(self): + self._tokenizer.get_current_token().expect_type(TOKEN_LEFT_BRACE) + self._parse_mapping(TOKEN_RIGHT_BRACE) + + def _parse_list(self): + self._tokenizer.get_current_token().expect_type(TOKEN_LEFT_BRACKET) + while True: + self._tokenizer.parse_next() + if self._tokenizer.get_current_type() == TOKEN_RIGHT_BRACKET: + break + self._parse_any() + self._tokenizer.parse_next() + if self._tokenizer.get_current_type() == TOKEN_RIGHT_BRACKET: + break + self._tokenizer.get_current_token().expect_type(TOKEN_SEMICOLON) + self._tokenizer.get_current_token().expect_type(TOKEN_RIGHT_BRACKET) + + def _parse_any(self): + if self._tokenizer.get_current_type() == TOKEN_START_OF_STREAM: + self._tokenizer.parse_next() + + if self._tokenizer.get_current_type() == TOKEN_LEFT_ANGLE: + self._parse_attributes() + self._tokenizer.parse_next() + + if self._tokenizer.get_current_type() == TOKEN_END_OF_STREAM: + raise_yson_error( + "Premature end-of-stream in Yson", + self._tokenizer.get_position_info()) + + if self._tokenizer.get_current_type() == TOKEN_LEFT_BRACKET: + self._parse_list() + + elif self._tokenizer.get_current_type() == TOKEN_LEFT_BRACE: + self._parse_map() + + elif self._tokenizer.get_current_type() == TOKEN_HASH: + pass + + else: + self._tokenizer.get_current_token().expect_type((TOKEN_BOOLEAN, TOKEN_INT64, TOKEN_UINT64, + TOKEN_STRING, TOKEN_DOUBLE)) + + def _flush_buffer(self): + res = bytes(self._buffer) + self._buffer[:] = b'' + return res + + def parse(self): + while self._tokenizer.get_current_type() != TOKEN_END_OF_STREAM: + self._parse_any() + self._tokenizer.parse_next() + self._tokenizer.get_current_token().expect_type(TOKEN_SEMICOLON) + yield self._flush_buffer() + self._tokenizer.parse_next() + + +def load(stream, yson_type=None, always_create_attributes=True, raw=None, + encoding=_ENCODING_SENTINEL, lazy=False): + """Deserializes object from YSON formatted stream `stream`. + + :param str yson_type: type of YSON, one of ["node", "list_fragment", "map_fragment"]. + """ + if lazy: + raise YsonError("Lazy parsing is not supported in python parser") + + if raw: + if yson_type != "list_fragment": + raise YsonError("Raw mode is only supported for list fragments") + return RawYsonParser(stream).parse() + + if not PY3 and encoding is not _ENCODING_SENTINEL and encoding is not None: + raise YsonError("Encoding parameter is not supported for Python 2") + + if encoding is _ENCODING_SENTINEL: + if PY3: + encoding = "utf-8" + else: + encoding = None + + if yson_type == "list_fragment": + stream = StreamWrap(stream, b"[", b"]") + elif yson_type == "map_fragment": + stream = StreamWrap(stream, b"{", b"}") + else: + if yson_type is not None: + raise YsonError("Unexpected yson type: {0!r}".format(yson_type)) + + parser = YsonParser(stream, encoding, always_create_attributes) + return parser.parse() + + +def loads(string, yson_type=None, always_create_attributes=True, raw=None, + encoding=_ENCODING_SENTINEL, lazy=False): + """Deserializes object from YSON formatted string `string`. See :func:`load <.load>`.""" + if type(string) is text_type and PY3: + raise TypeError("Only binary streams are supported by YSON parser") + return load(BytesIO(string), yson_type=yson_type, + always_create_attributes=always_create_attributes, + raw=raw, encoding=encoding, lazy=lazy) diff --git a/yt/python/yt/yson/tokenizer.py b/yt/python/yt/yson/tokenizer.py new file mode 100644 index 00000000000..eb4d5a77a92 --- /dev/null +++ b/yt/python/yt/yson/tokenizer.py @@ -0,0 +1,21 @@ +from .yson_token import YsonToken, TOKEN_START_OF_STREAM +from .lexer import YsonLexer +from .common import _ENCODING_SENTINEL + + +class YsonTokenizer(object): + def __init__(self, input_str, encoding=_ENCODING_SENTINEL, output_buffer=None): + self._token = YsonToken(type=TOKEN_START_OF_STREAM) + self._lexer = YsonLexer(input_str, encoding=encoding, output_buffer=output_buffer) + + def parse_next(self): + self._token = self._lexer.get_next_token() + + def get_current_token(self): + return self._token + + def get_current_type(self): + return self.get_current_token().get_type() + + def get_position_info(self): + return self._lexer.get_position_info() diff --git a/yt/python/yt/yson/writer.py b/yt/python/yt/yson/writer.py new file mode 100644 index 00000000000..5b0511a0236 --- /dev/null +++ b/yt/python/yt/yson/writer.py @@ -0,0 +1,442 @@ +# -*- coding: utf-8 -*- + +"""``yson`` exposes an API familiar to users of the standard library +:mod:`marshal` and :mod:`pickle` modules. + +Serializable types (rules applied top to bottom): +int, long -> yson int +str -> yson string +unicode -> yson string (using specified encoding) +any Mapping -> yson dict +any Iterable -> yson list + +Simple examples:: + + >>> import yson + >>> b = [4, 5, 6] + >>> print yson.dumps({"a" : b, "b" : b}, indent=" ") + { + "a" : [ + 4; + 5; + 6; + ]; + "b" : [ + 4; + 5; + 6; + ]; + } + >>> print yson.dumps(("a", "a"), indent=" ") + [ + "a"; + "a"; + ] + >>> print yson.dumps(123456) + 123456 + >>> print yson.dumps(u'"Hello world!" -- "Превед, медвед!"') + "\"Hello world!\" -- \"\xd0\x9f\xd1\x80\xd0\xb5\xd0\xb2\xd0\xb5\xd0\xb4, \xd0\xbc\xd0\xb5\xd0\xb4\xd0\xb2\xd0\xb5\xd0\xb4!\"" +""" + +from .common import (YsonError, + STRING_MARKER, INT64_MARKER, DOUBLE_MARKER, + FALSE_MARKER, TRUE_MARKER, UINT64_MARKER) +from . import yson_types + +try: + from yt.packages.six.moves import map as imap + from yt.packages.six import (integer_types, text_type, binary_type, + iteritems, iterkeys, iterbytes, PY3) +except ImportError: + from six.moves import map as imap + from six import (integer_types, text_type, binary_type, + iteritems, iterkeys, iterbytes, PY3) + +import math +import struct +# Python3 compatibility +try: + from collections.abc import Iterable, Mapping +except ImportError: + from collections import Iterable, Mapping + +__all__ = ["dump", "dumps"] + + +def _is_hex_digit(c): + return ord(b'0') <= c <= ord(b'9') or ord(b'A') <= c <= ord(b'F') or ord(b'a') <= c <= ord(b'f') + + +def _is_oct_digit(c): + return ord(b'0') <= c <= ord(b'7') + + +def _escape_byte(c, nxt, r): + if c == ord(b"\""): + r += b"\\\"" + elif c == ord(b"\\"): + r += b"\\\\" + elif 32 <= c <= 126: + r.append(c) + elif c == ord(b"\r"): + r += b"\\r" + elif c == ord(b"\n"): + r += b"\\n" + elif c == ord(b"\t"): + r += b"\\t" + elif c < 8 and not _is_oct_digit(nxt): + r += '\\{}'.format(c).encode("ascii") + elif not _is_hex_digit(nxt): + r += '\\x{:02X}'.format(c).encode("ascii") + else: + r += '\\{:03o}'.format(c).encode("ascii") + + +def _escape_bytes(obj): + if len(obj) == 0: + return b"" + + res = bytearray() + iterator = iterbytes(obj) + cur = next(iterator) + for nxt in iterator: + _escape_byte(cur, nxt, res) + cur = nxt + _escape_byte(cur, ord(b" "), res) + return bytes(res) + + +def dump(object, stream, yson_format=None, yson_type=None, indent=None, + ignore_inner_attributes=False, encoding="utf-8", sort_keys=False, + check_circular=True): + """Serializes `object` as a YSON formatted stream to `stream`. + + :param str yson_format: format of YSON, one of ["binary", "text", "pretty"]. + :param str yson_type: type of YSON, one of ["node", "list_fragment", "map_fragment"]. + :param int indent: number of indentation spaces in pretty format. + :param bool ignore_inner_attributes: skip attributes of non-top-level values. + :param str encoding: encoding that uses to encode unicode strings. + :param bool sort_keys: if True, mapping items are printed in sorted order. + :param bool check_circular: prevent the attempt to serialize an object with reference loop. + """ + + stream.write(dumps(object, yson_format=yson_format, yson_type=yson_type, indent=indent, + ignore_inner_attributes=ignore_inner_attributes, + encoding=encoding, sort_keys=sort_keys, + check_circular=check_circular)) + + +class YsonContext(object): + def __init__(self): + self.path_parts = [] + self.row_index = None + + def push(self, key_or_index): + self.path_parts.append(key_or_index) + + def pop(self): + self.path_parts.pop() + + +def _raise_error_with_context(message, context): + attributes = {} + if context.row_index is not None: + attributes["row_index"] = context.row_index + + path_parts = imap(str, context.path_parts) + if context.path_parts: + attributes["row_key_path"] = "/" + "/".join(path_parts) + raise YsonError(message, attributes=attributes) + + +def _zig_zag_encode(value): + return (value >> 63) ^ (value << 1) + + +def _dump_varint(value): + assert 0 <= value <= 2 ** 64 - 1 + result = bytearray() + while value >= 0x80: + result.append(0x80 | (value & 0x7F)) + value >>= 7 + result.append(value) + return bytes(result) + + +def dumps(object, yson_format=None, yson_type=None, indent=None, + ignore_inner_attributes=False, encoding="utf-8", sort_keys=False, + check_circular=True): + """Serializes `object` as a YSON formatted stream to string and returns it. See :func:`dump <.dump>`.""" + if indent is None: + indent = 4 + if isinstance(indent, int): + indent = b" " * indent + if yson_format is None: + yson_format = "text" + if yson_format not in ("pretty", "text", "binary"): + raise YsonError("{0} format is not supported".format(yson_format)) + if yson_format in ("text", "binary"): + indent = None + if yson_type is not None: + if yson_type not in ["list_fragment", "map_fragment", "node"]: + raise YsonError("YSON type {0} is not supported".format(yson_type)) + else: + yson_type = "node" + + is_text = yson_format in ("text", "pretty") + d = Dumper(check_circular, encoding, indent, yson_type, sort_keys, + is_text=is_text, ignore_inner_attributes=ignore_inner_attributes) + return d.dumps(object, YsonContext()) + + +class Dumper(object): + def __init__(self, check_circular, encoding, indent, yson_type, sort_keys, + is_text, ignore_inner_attributes): + self.yson_type = yson_type + + self._seen_objects = None + if check_circular: + self._seen_objects = {} + + self._encoding = encoding + self._format = FormatDetails(indent, sort_keys) + if yson_type == "node": + self._level = -1 + else: + self._level = -2 # Stream elements are one level deep, but need not be indented + + self._is_text = is_text + self._ignore_inner_attributes = ignore_inner_attributes + + def _has_attributes(self, obj): + if hasattr(obj, "has_attributes"): + return obj.has_attributes() + return hasattr(obj, "attributes") + + def dumps(self, obj, context): + if hasattr(obj, "to_yson_type") and callable(obj.to_yson_type): + return self.dumps(obj.to_yson_type(), context) + self._level += 1 + attributes = b"" + if self._has_attributes(obj) and (not self._ignore_inner_attributes or self._level == 0): + if not isinstance(obj.attributes, dict): + _raise_error_with_context('Invalid field "attributes": it must be string or None', context) + if obj.attributes: + attributes = self._dump_attributes(obj.attributes, context) + + result = None + if obj is False or (isinstance(obj, yson_types.YsonBoolean) and not obj): + if self._is_text: + result = b"%false" + else: + result = FALSE_MARKER + elif obj is True or (isinstance(obj, yson_types.YsonBoolean) and obj): + if self._is_text: + result = b"%true" + else: + result = TRUE_MARKER + elif isinstance(obj, integer_types): + if obj < -2 ** 63 or obj >= 2 ** 64: + _raise_error_with_context("Integer {0} cannot be represented in YSON " + "since it is out of range [-2^63, 2^64 - 1])".format(obj), context) + + greater_than_max_int64 = obj >= 2 ** 63 + if isinstance(obj, yson_types.YsonUint64) and obj < 0: + _raise_error_with_context("Can not dump negative integer as YSON uint64", context) + if isinstance(obj, yson_types.YsonInt64) and greater_than_max_int64: + _raise_error_with_context("Can not dump integer greater than 2^63-1 as YSON int64", context) + + result = self._dump_integer(obj, greater_than_max_int64) + elif isinstance(obj, float): + result = self._dump_float(obj) + elif isinstance(obj, (text_type, binary_type, yson_types.YsonStringProxy)): + result = self._dump_string(obj, context) + elif isinstance(obj, Mapping): + result = self._dump_map(obj, context) + elif isinstance(obj, Iterable): + result = self._dump_list(obj, context) + elif isinstance(obj, yson_types.YsonEntity) or obj is None: + result = b"#" + else: + _raise_error_with_context("{0!r} is not Yson serializable".format(obj), context) + self._level -= 1 + return attributes + result + + def _dump_integer(self, obj, force_uint64): + if self._is_text: + if isinstance(obj, (yson_types.YsonInt64, yson_types.YsonUint64)): + obj_str = str(yson_types._YsonIntegerBase(obj)) + else: + obj_str = str(obj) + + result = obj_str.encode("ascii") + if not PY3: + result = result.rstrip(b"L") + if force_uint64 or isinstance(obj, yson_types.YsonUint64): + result += b"u" + else: + if force_uint64 or isinstance(obj, yson_types.YsonUint64): + result = UINT64_MARKER + _dump_varint(obj) + else: + result = INT64_MARKER + _dump_varint(_zig_zag_encode(obj)) + return result + + def _dump_float(self, obj): + if self._is_text: + if math.isnan(obj): + result = b"%nan" + elif math.isinf(obj): + if obj > 0: + result = b"%inf" + else: + result = b"%-inf" + else: + if type(obj) == yson_types.YsonDouble: + obj_str = str(float(obj)) + else: + obj_str = str(obj) + result = obj_str.encode("ascii") + else: + result = DOUBLE_MARKER + struct.pack("<d", obj) + return result + + def _dump_string(self, obj, context): + if isinstance(obj, binary_type): + result = obj + elif isinstance(obj, text_type): + if self._encoding is None: + _raise_error_with_context('Cannot encode unicode object {0!r} to bytes since "encoding" ' + 'parameter is None. Consider using byte strings ' + 'instead or specify encoding'.format(obj), + context) + result = obj.encode(self._encoding) + elif isinstance(obj, yson_types.YsonStringProxy): + result = obj._bytes + else: + assert False + if self._is_text: + return b"".join([b'"', _escape_bytes(result), b'"']) + else: + encoded_len = _zig_zag_encode(len(result)) + return b"".join([STRING_MARKER, _dump_varint(encoded_len), result]) + + def _dump_map(self, obj, context): + is_stream = self.yson_type == "map_fragment" and self._level == -1 + result = [] + if not is_stream: + result += [b"{", self._format.nextline()] + + for k, v in self._format.mapping_iter(obj): + if not isinstance(k, (text_type, binary_type, yson_types.YsonStringProxy)): + _raise_error_with_context("Only string can be Yson map key. Key: {0!r}".format(k), context) + + @self._circular_check(v) + def process_item(): + context.push(k) + item = [self._format.prefix(self._level + 1), + self._dump_string(k, context), self._format.space(), b"=", + self._format.space(), self.dumps(v, context), b";", self._format.nextline(is_stream)] + context.pop() + return item + + result += process_item() + + if not is_stream: + result += [self._format.prefix(self._level), b"}"] + + return b"".join(result) + + def _dump_list(self, obj, context): + is_stream = self.yson_type == "list_fragment" and self._level == -1 + result = [] + if not is_stream: + result += [b"[", self._format.nextline()] + + for index, v in enumerate(obj): + @self._circular_check(v) + def process_item(): + if is_stream: + context.row_index = index + else: + context.push(index) + item = [self._format.prefix(self._level + 1), + self.dumps(v, context), b";", self._format.nextline(is_stream)] + if not is_stream: + context.pop() + return item + + result += process_item() + + if not is_stream: + result += [self._format.prefix(self._level), b"]"] + + return b"".join(result) + + def _dump_attributes(self, obj, context): + result = [b"<", self._format.nextline()] + for k, v in obj.items(): + if not isinstance(k, (text_type, binary_type)): + _raise_error_with_context("Only string can be Yson map key. Key: {0!r}".format(obj), context) + + @self._circular_check(v) + def process_item(): + context.push("@" + k) + item = [self._format.prefix(self._level + 1), + self._dump_string(k, context), self._format.space(), b"=", + self._format.space(), self.dumps(v, context), b";", self._format.nextline()] + context.pop() + return item + + result += process_item() + result += [self._format.prefix(self._level), b">"] + return b"".join(result) + + def _circular_check(self, obj): + def decorator(fn): + def wrapper(*args, **kwargs): + obj_id = None + if self._seen_objects is not None: + obj_id = id(obj) + if obj_id in self._seen_objects: + raise YsonError("Circular reference detected. Object: {0!r}".format(obj)) + else: + self._seen_objects[obj_id] = obj + + result = fn(*args, **kwargs) + + if self._seen_objects: + del self._seen_objects[obj_id] + return result + + return wrapper + return decorator + + +class FormatDetails(object): + def __init__(self, indent, sort_keys=False): + self._indent = indent + self._sort_keys = sort_keys + + def prefix(self, level): + if self._indent: + return b"".join([self._indent] * level) + else: + return b"" + + def nextline(self, force=False): + if force or self._indent: + return b"\n" + else: + return b"" + + def space(self): + if self._indent: + return b" " + else: + return b"" + + def mapping_iter(self, mapping): + if self._sort_keys: + return ((key, mapping[key]) for key in sorted(iterkeys(mapping))) + else: + return iteritems(mapping) diff --git a/yt/python/yt/yson/ya.make b/yt/python/yt/yson/ya.make new file mode 100644 index 00000000000..37bf8a4046a --- /dev/null +++ b/yt/python/yt/yson/ya.make @@ -0,0 +1,23 @@ +PY23_LIBRARY() + +PEERDIR( + yt/python/yt + + contrib/python/six +) + +PY_SRCS( + NAMESPACE yt.yson + + __init__.py + common.py + convert.py + lexer.py + parser.py + tokenizer.py + writer.py + yson_token.py + yson_types.py +) + +END() diff --git a/yt/python/yt/yson/yson_token.py b/yt/python/yt/yson/yson_token.py new file mode 100644 index 00000000000..ef230f72031 --- /dev/null +++ b/yt/python/yt/yson/yson_token.py @@ -0,0 +1,155 @@ +from .common import YsonError + +from yt.common import flatten + +try: + from yt.packages.six.moves import map as imap + from yt.packages.six import PY3 +except ImportError: + from six.moves import map as imap + from six import PY3 + +import string + + +TOKEN_LITERAL = 0 +TOKEN_SLASH = 1 +TOKEN_AMPERSAND = 2 +TOKEN_AT = 3 +TOKEN_ASTERISK = 4 +TOKEN_START_OF_STREAM = 5 +TOKEN_END_OF_STREAM = 6 +TOKEN_RANGE = 7 +TOKEN_SEMICOLON = 8 +TOKEN_EQUALS = 9 +TOKEN_LEFT_BRACE = 10 +TOKEN_RIGHT_BRACE = 11 +TOKEN_HASH = 12 +TOKEN_LEFT_BRACKET = 13 +TOKEN_RIGHT_BRACKET = 14 +TOKEN_LEFT_ANGLE = 15 +TOKEN_RIGHT_ANGLE = 16 +TOKEN_LEFT_PARENTHESIS = 17 +TOKEN_RIGHT_PARENTHESIS = 18 +TOKEN_COLON = 19 +TOKEN_COMMA = 20 +TOKEN_STRING = 21 +TOKEN_INT64 = 22 +TOKEN_UINT64 = 23 +TOKEN_DOUBLE = 24 +TOKEN_BOOLEAN = 25 +TOKEN_SPECIAL = 26 + +CHAR_TO_TOKEN_TYPE = { + ";": TOKEN_SEMICOLON, + "=": TOKEN_EQUALS, + "{": TOKEN_LEFT_BRACE, + "}": TOKEN_RIGHT_BRACE, + "#": TOKEN_HASH, + "[": TOKEN_LEFT_BRACKET, + "]": TOKEN_RIGHT_BRACKET, + "<": TOKEN_LEFT_ANGLE, + ">": TOKEN_RIGHT_ANGLE, + "(": TOKEN_LEFT_PARENTHESIS, + ")": TOKEN_RIGHT_PARENTHESIS, + ":": TOKEN_COLON, + ",": TOKEN_COMMA, + "/": TOKEN_SLASH, + "@": TOKEN_AT, + "&": TOKEN_AMPERSAND, + "*": TOKEN_ASTERISK +} + + +def char_to_token_type(char_or_byte): + if PY3: + char_or_byte = chr(char_or_byte) + if char_or_byte not in CHAR_TO_TOKEN_TYPE: + return TOKEN_END_OF_STREAM + return CHAR_TO_TOKEN_TYPE[char_or_byte] + + +def token_type_to_string(token): + names = { + TOKEN_LITERAL: "Literal", + TOKEN_SLASH: "Slash", + TOKEN_AMPERSAND: "Ampersand", + TOKEN_AT: "At", + TOKEN_ASTERISK: "Asterisk", + TOKEN_START_OF_STREAM: "Start-of-stream", + TOKEN_END_OF_STREAM: "End-of-stream", + TOKEN_RANGE: "Range", + TOKEN_SEMICOLON: "Semicolon", + TOKEN_EQUALS: "Equals", + TOKEN_LEFT_BRACE: "Left-brace", + TOKEN_RIGHT_BRACE: "Right-brace", + TOKEN_HASH: "Hash", + TOKEN_LEFT_BRACKET: "Left-bracket", + TOKEN_RIGHT_BRACKET: "Right-bracket", + TOKEN_LEFT_ANGLE: "Left-angle", + TOKEN_RIGHT_ANGLE: "Right-angle", + TOKEN_LEFT_PARENTHESIS: "Left-parenthesis", + TOKEN_RIGHT_PARENTHESIS: "Right-parenthesis", + TOKEN_COLON: "Colon", + TOKEN_COMMA: "Comma", + TOKEN_STRING: "String", + TOKEN_INT64: "Int64", + TOKEN_UINT64: "Uint64", + TOKEN_DOUBLE: "Double", + TOKEN_BOOLEAN: "Boolean", + TOKEN_SPECIAL: "Special", + } + if token is None: + return "Unknown" + return names[token] + + +def decode_token_value(value): + if not PY3 or not isinstance(value, bytes): + return value + + chars = [] + for byte in value: + char = chr(byte) + if char in string.printable: # whitespaces cannot present in token + chars.append(char) + else: + chars.append("\\x" + hex(byte)[2:]) + + return "".join(chars) + + +class YsonToken(object): + def __init__(self, value="", type=TOKEN_END_OF_STREAM): + self._value = value + self._type = type + + def get_value(self): + return self._value + + def get_type(self): + return self._type + + def _raise_error(self, message_end_of_stream, message_unexpected_token, token_type, value, expected_type): + if token_type == TOKEN_END_OF_STREAM: + raise YsonError(message_end_of_stream.format(expected_type)) + else: + raise YsonError(message_unexpected_token.format(value, token_type_to_string(token_type), expected_type)) + + def expect_type(self, type_or_types): + token_type = self.get_type() + expected_types = flatten(type_or_types) + if token_type is None: + raise YsonError('Unexpected "{0}" while parsing node'.format(decode_token_value(self.get_value()))) + + if token_type not in expected_types: + if token_type == TOKEN_END_OF_STREAM: + raise YsonError("Unexpected end of stream; expected types are {0}".format(expected_types)) + else: + raise YsonError('Unexpected token "{0}" of type {1}; ' + 'expected types are {2}'.format(decode_token_value(self.get_value()), + token_type_to_string(token_type), + list(imap(token_type_to_string, expected_types)))) + + def __str__(self): + return str(self._value) diff --git a/yt/python/yt/yson/yson_types.py b/yt/python/yt/yson/yson_types.py new file mode 100644 index 00000000000..00dbcc55477 --- /dev/null +++ b/yt/python/yt/yson/yson_types.py @@ -0,0 +1,331 @@ +try: + from yt.packages.six import PY3, integer_types, binary_type, text_type +except ImportError: + from six import PY3, integer_types, binary_type, text_type + +from yt.common import YtError + + +class YsonType(object): + def __getattr__(self, attribute): + if attribute == "attributes": + self.__dict__[attribute] = {} + return self.__dict__[attribute] + raise AttributeError('Attribute "{0}" not found'.format(attribute)) + + def has_attributes(self): + try: + return "attributes" in self.__dict__ and self.attributes is not None and self.attributes != {} + except: # noqa + return False + + def __eq__(self, other): + try: + has_attributes = other.has_attributes() + except AttributeError: + has_attributes = False + if has_attributes: + return self.attributes == other.attributes + return not self.has_attributes() + + def __ne__(self, other): + return not (self == other) + + def to_str(self, base_type, str_func): + if self.has_attributes(): + return str_func({"value": base_type(self), "attributes": self.attributes}) + return str_func(base_type(self)) + + def base_hash(self, type_): + if self.has_attributes(): + raise TypeError("unhashable type: YSON has non-trivial attributes") + return hash(type_(self)) + + +class YsonString(binary_type, YsonType): + def __eq__(self, other): + # COMPAT: With implicit promotion of str to unicode it can make sense + # to compare binary YsonString to unicode string. + if not isinstance(other, (binary_type, text_type)): + return NotImplemented + return binary_type(self) == binary_type(other) and YsonType.__eq__(self, other) + + def __ne__(self, other): + return not (self == other) + + def __hash__(self): + return self.base_hash(binary_type) + + def __repr__(self): + return self.to_str(binary_type, repr) + + +class YsonUnicode(text_type, YsonType): + def __eq__(self, other): + if not isinstance(other, text_type): + return NotImplemented + return text_type(self) == text_type(other) and YsonType.__eq__(self, other) + + def __ne__(self, other): + return not (self == other) + + def __hash__(self): + return self.base_hash(text_type) + + def __repr__(self): + return self.to_str(text_type, repr) + + +class NotUnicodeError(YtError, TypeError): + pass + + +def _truncate(s, length=50): + assert isinstance(s, bytes) + if len(s) < length: + return s + return s[:length] + b"..." + + +def _make_raise_not_unicode_error(name): + def fun(self, *args, **kwargs): + raise NotUnicodeError('Method "{}" is not allowed: YSON string "{}" ' + "could not be decoded to Unicode, " + "see https://ytsaurus.tech/docs/en/api/python/userdoc#python3_strings" + .format(name, _truncate(self._bytes))) + return fun + + +def proxy(cls): + ALLOWED_METHODS = [ + "get_bytes", + "is_unicode", + "__hash__", + "__eq__", + "__ne__", + "__repr__", + "__format__", + "__dict__", + "__qualname__", + "__class__", + "__mro__", + "__new__", + "__init__", + "__getattr__", + "__setattr__", + "__getattribute__", + "__copy__", + "__deepcopy__", + ] + + ADDITIONAL_METHODS = [ + "__radd__", + ] + + for name in dir(text_type): + attr = getattr(text_type, name) + if callable(attr) and name not in ALLOWED_METHODS: + setattr(cls, name, _make_raise_not_unicode_error(name)) + for name in ADDITIONAL_METHODS: + setattr(cls, name, _make_raise_not_unicode_error(name)) + return cls + + +# NB: This class is never returned by library in Python2. +# NB: Don't create this class by hand, it should only be returned +# from the library. +@proxy +class YsonStringProxy(YsonType): + def __repr__(self): + value = "<YsonStringProxy>{!r}".format(self._bytes) + if self.has_attributes(): + return repr({"attributes": self.attributes, "value": value}) + return value + + def __format__(self, format_spec): + return repr(self) + + def __copy__(self): + return self + + def __deepcopy__(self, memo): + return self + + def __hash__(self): + return hash(self._bytes) + + def __eq__(self, other): + if isinstance(other, bytes): + return self._bytes == bytes(other) and YsonType.__eq__(self, other) + elif isinstance(other, YsonStringProxy): + return self._bytes == other._bytes and YsonType.__eq__(self, other) + else: + return NotImplemented + + def __ne__(self, other): + return not (self == other) + + +def is_unicode(x): + return isinstance(x, text_type) + + +def get_bytes(x, encoding="utf8"): + if isinstance(x, text_type): + return x.encode(encoding) + elif isinstance(x, YsonStringProxy): + return x._bytes + elif isinstance(x, binary_type): + return x + else: + raise TypeError("get_bytes() expected str, bytes or YsonStringProxy, got <{}>{!r}" + .format(type(x), x)) + + +def make_byte_key(s): + proxy = YsonStringProxy() + proxy._bytes = s + return proxy + + +if PY3: + _YsonIntegerBase = int +else: + _YsonIntegerBase = long # noqa + + +class YsonIntegerBase(_YsonIntegerBase, YsonType): + def __eq__(self, other): + if not isinstance(other, integer_types): + return NotImplemented + return _YsonIntegerBase(self) == _YsonIntegerBase(other) and YsonType.__eq__(self, other) + + def __ne__(self, other): + return not (self == other) + + def __hash__(self): + return self.base_hash(_YsonIntegerBase) + + def __repr__(self): + return self.to_str(_YsonIntegerBase, repr) + + def __str__(self): + return self.to_str(_YsonIntegerBase, str) + + +class YsonInt64(YsonIntegerBase): + pass + + +class YsonUint64(YsonIntegerBase): + pass + + +class YsonDouble(float, YsonType): + def __eq__(self, other): + if not isinstance(other, float): + return NotImplemented + return float(self) == float(other) and YsonType.__eq__(self, other) + + def __ne__(self, other): + return not (self == other) + + def __hash__(self): + return self.base_hash(float) + + def __repr__(self): + return self.to_str(float, repr) + + def __str__(self): + return self.to_str(float, str) + + +class YsonBoolean(int, YsonType): + def __eq__(self, other): + if not isinstance(other, int): + return NotImplemented + return (int(self) == 0) == (int(other) == 0) and YsonType.__eq__(self, other) + + def __ne__(self, other): + return not (self == other) + + def __hash__(self): + return self.base_hash(bool) + + # NB: do not change this representation, because + # this type required to be JSON serializable. + # JSON encoder thinks that it is integer and calls str. + def __repr__(self): + return "true" if self else "false" + + def __str__(self): + return self.__repr__() + + +class YsonList(list, YsonType): + def __eq__(self, other): + if not isinstance(other, list): + return NotImplemented + return list(self) == list(other) and YsonType.__eq__(self, other) + + def __ne__(self, other): + return not (self == other) + + def __hash__(self): + raise TypeError('unhashable type "YsonList"') + + def __repr__(self): + return self.to_str(list, repr) + + def __str__(self): + return self.to_str(list, str) + + +class YsonMap(dict, YsonType): + def __eq__(self, other): + if not isinstance(other, dict): + return NotImplemented + return dict(self) == dict(other) and YsonType.__eq__(self, other) + + def __ne__(self, other): + return not (self == other) + + def __hash__(self): + raise TypeError('unhashable type "YsonMap"') + + def __repr__(self): + return self.to_str(dict, repr) + + def __str__(self): + return self.to_str(dict, str) + + +class YsonEntity(YsonType): + def __init__(self, value=None): + if value is not None: + assert isinstance(value, YsonEntity) + self.attributes = value.attributes + + def __eq__(self, other): + if other is None and not self.attributes: + return True + if not isinstance(other, YsonEntity): + return NotImplemented + return YsonType.__eq__(self, other) + + def __ne__(self, other): + return not (self == other) + + def __bool__(self): + return False + + def __repr__(self): + if self.attributes: + return repr({"value": "YsonEntity", "attributes": self.attributes}) + else: + return "YsonEntity" + + def __str__(self): + return self.__repr__() + + __nonzero__ = __bool__ |
