diff options
| author | YDBot <[email protected]> | 2025-10-02 00:52:12 +0000 |
|---|---|---|
| committer | YDBot <[email protected]> | 2025-10-02 00:52:12 +0000 |
| commit | 197d32948a067cc65bac500c343f548bb2d5855d (patch) | |
| tree | a144cab09544e9191308fed88135c189392b2da5 /contrib/python | |
| parent | c2fb1c1f0da6856bd3569a4828abdaf44d60aabf (diff) | |
| parent | df4ccba082c7c4afe8a610a94c168c533d11f80a (diff) | |
Sync branches 251002-0050
Diffstat (limited to 'contrib/python')
41 files changed, 1736 insertions, 149 deletions
diff --git a/contrib/python/clickhouse-connect/.dist-info/METADATA b/contrib/python/clickhouse-connect/.dist-info/METADATA index 39a36b2075e..c2d9217601d 100644 --- a/contrib/python/clickhouse-connect/.dist-info/METADATA +++ b/contrib/python/clickhouse-connect/.dist-info/METADATA @@ -1,6 +1,6 @@ Metadata-Version: 2.4 Name: clickhouse-connect -Version: 0.8.18 +Version: 0.9.2 Summary: ClickHouse Database Core Driver for Python, Pandas, and Superset Home-page: https://github.com/ClickHouse/clickhouse-connect Author: ClickHouse Inc. @@ -10,13 +10,12 @@ Keywords: clickhouse,superset,sqlalchemy,http,driver Classifier: Development Status :: 4 - Beta Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: Apache Software License -Classifier: Programming Language :: Python :: 3.8 Classifier: Programming Language :: Python :: 3.9 Classifier: Programming Language :: Python :: 3.10 Classifier: Programming Language :: Python :: 3.11 Classifier: Programming Language :: Python :: 3.12 Classifier: Programming Language :: Python :: 3.13 -Requires-Python: ~=3.8 +Requires-Python: >=3.9,<3.14 Description-Content-Type: text/markdown License-File: LICENSE Requires-Dist: certifi @@ -25,11 +24,13 @@ Requires-Dist: pytz Requires-Dist: zstandard Requires-Dist: lz4 Provides-Extra: sqlalchemy -Requires-Dist: sqlalchemy<2.0,>1.3.21; extra == "sqlalchemy" +Requires-Dist: sqlalchemy<3.0,>=1.4.40; extra == "sqlalchemy" Provides-Extra: numpy Requires-Dist: numpy; extra == "numpy" Provides-Extra: pandas Requires-Dist: pandas; extra == "pandas" +Provides-Extra: polars +Requires-Dist: polars>=1.0; extra == "polars" Provides-Extra: arrow Requires-Dist: pyarrow; extra == "arrow" Provides-Extra: orjson @@ -54,11 +55,12 @@ Dynamic: summary A high performance core database driver for connecting ClickHouse to Python, Pandas, and Superset -* Pandas DataFrames +* Pandas DataFrames (numpy and arrow-backed) * Numpy Arrays * PyArrow Tables +* Polars DataFrames * Superset Connector -* SQLAlchemy 1.3 and 1.4 (limited feature set) +* SQLAlchemy Core (select, joins, lightweight deletes; limited feature set) ClickHouse Connect currently uses the ClickHouse HTTP interface for maximum compatibility. @@ -68,7 +70,7 @@ ClickHouse Connect currently uses the ClickHouse HTTP interface for maximum comp pip install clickhouse-connect ``` -ClickHouse Connect requires Python 3.8 or higher. +ClickHouse Connect requires Python 3.9 or higher. We officially test against Python 3.9 through 3.13. ### Superset Connectivity @@ -82,9 +84,17 @@ When creating a Superset Data Source, either use the provided connection dialog, ### SQLAlchemy Implementation -ClickHouse Connect incorporates a minimal SQLAlchemy implementation (without any ORM features) for compatibility with -Superset. It has only been tested against SQLAlchemy versions 1.3.x and 1.4.x, and is unlikely to work with more -complex SQLAlchemy applications. +ClickHouse Connect includes a lightweight SQLAlchemy dialect implementation focused on compatibility with **Superset** +and **SQLAlchemy Core**. + +Supported features include: +- Basic query execution via SQLAlchemy Core +- `SELECT` queries with `JOIN`s +- Lightweight `DELETE` statements + +The implementation does not include ORM support and is not intended as a full SQLAlchemy dialect. While it can support +a range of Core-based applications beyond Superset, it may not be suitable for more complex SQLAlchemy applications +that rely on full ORM or advanced dialect functionality. ### Asyncio Support @@ -94,4 +104,4 @@ See the [run_async example](./examples/run_async.py) for more details. ### Complete Documentation The documentation for ClickHouse Connect has moved to -[ClickHouse Docs](https://clickhouse.com/docs/integrations/python) +[ClickHouse Docs](https://clickhouse.com/docs/integrations/python) diff --git a/contrib/python/clickhouse-connect/README.md b/contrib/python/clickhouse-connect/README.md index 83fbbf583ac..8d9e4b8b845 100644 --- a/contrib/python/clickhouse-connect/README.md +++ b/contrib/python/clickhouse-connect/README.md @@ -2,11 +2,12 @@ A high performance core database driver for connecting ClickHouse to Python, Pandas, and Superset -* Pandas DataFrames +* Pandas DataFrames (numpy and arrow-backed) * Numpy Arrays * PyArrow Tables +* Polars DataFrames * Superset Connector -* SQLAlchemy 1.3 and 1.4 (limited feature set) +* SQLAlchemy Core (select, joins, lightweight deletes; limited feature set) ClickHouse Connect currently uses the ClickHouse HTTP interface for maximum compatibility. @@ -16,7 +17,7 @@ ClickHouse Connect currently uses the ClickHouse HTTP interface for maximum comp pip install clickhouse-connect ``` -ClickHouse Connect requires Python 3.8 or higher. +ClickHouse Connect requires Python 3.9 or higher. We officially test against Python 3.9 through 3.13. ### Superset Connectivity @@ -30,9 +31,17 @@ When creating a Superset Data Source, either use the provided connection dialog, ### SQLAlchemy Implementation -ClickHouse Connect incorporates a minimal SQLAlchemy implementation (without any ORM features) for compatibility with -Superset. It has only been tested against SQLAlchemy versions 1.3.x and 1.4.x, and is unlikely to work with more -complex SQLAlchemy applications. +ClickHouse Connect includes a lightweight SQLAlchemy dialect implementation focused on compatibility with **Superset** +and **SQLAlchemy Core**. + +Supported features include: +- Basic query execution via SQLAlchemy Core +- `SELECT` queries with `JOIN`s +- Lightweight `DELETE` statements + +The implementation does not include ORM support and is not intended as a full SQLAlchemy dialect. While it can support +a range of Core-based applications beyond Superset, it may not be suitable for more complex SQLAlchemy applications +that rely on full ORM or advanced dialect functionality. ### Asyncio Support @@ -42,4 +51,4 @@ See the [run_async example](./examples/run_async.py) for more details. ### Complete Documentation The documentation for ClickHouse Connect has moved to -[ClickHouse Docs](https://clickhouse.com/docs/integrations/python) +[ClickHouse Docs](https://clickhouse.com/docs/integrations/python) diff --git a/contrib/python/clickhouse-connect/clickhouse_connect/__version__.py b/contrib/python/clickhouse-connect/clickhouse_connect/__version__.py index 07f89fe72be..b2b90876495 100644 --- a/contrib/python/clickhouse-connect/clickhouse_connect/__version__.py +++ b/contrib/python/clickhouse-connect/clickhouse_connect/__version__.py @@ -1 +1 @@ -version = '0.8.18' +version = "0.9.2" diff --git a/contrib/python/clickhouse-connect/clickhouse_connect/cc_sqlalchemy/datatypes/base.py b/contrib/python/clickhouse-connect/clickhouse_connect/cc_sqlalchemy/datatypes/base.py index ce73da14815..d79b4e52a3c 100644 --- a/contrib/python/clickhouse-connect/clickhouse_connect/cc_sqlalchemy/datatypes/base.py +++ b/contrib/python/clickhouse-connect/clickhouse_connect/cc_sqlalchemy/datatypes/base.py @@ -1,5 +1,5 @@ import logging -from typing import Dict, Type +from typing import Dict, Type, Optional from sqlalchemy.exc import CompileError @@ -106,6 +106,15 @@ class ChSqlaType: """ return self.name + # pylint: disable=unused-argument + def _with_collation(self, collation: Optional[str]) -> "ChSqlaType": + """ + SQLAlchemy 2.x compatibility: TypeEngine declares this abstract to support + text types that can carry a collation. ClickHouse types in this dialect + do not vary by collation, so this is a no-op that returns self. + """ + return self + class CaseInsensitiveDict(dict): def __setitem__(self, key, value): diff --git a/contrib/python/clickhouse-connect/clickhouse_connect/cc_sqlalchemy/datatypes/sqltypes.py b/contrib/python/clickhouse-connect/clickhouse_connect/cc_sqlalchemy/datatypes/sqltypes.py index cc5e8300f69..e42a6755afc 100644 --- a/contrib/python/clickhouse-connect/clickhouse_connect/cc_sqlalchemy/datatypes/sqltypes.py +++ b/contrib/python/clickhouse-connect/clickhouse_connect/cc_sqlalchemy/datatypes/sqltypes.py @@ -3,7 +3,7 @@ from enum import Enum as PyEnum from typing import Type, Union, Sequence from sqlalchemy.types import Integer, Float, Numeric, Boolean as SqlaBoolean, \ - UserDefinedType, String as SqlaString, DateTime as SqlaDateTime, Date as SqlaDate + UserDefinedType, String as SqlaString, DateTime as SqlaDateTime, Date as SqlaDate, Interval from sqlalchemy.exc import ArgumentError from clickhouse_connect.cc_sqlalchemy.datatypes.base import ChSqlaType @@ -78,6 +78,7 @@ class Bool(ChSqlaType, SqlaBoolean): SqlaBoolean.__init__(self) +# pylint: disable=too-many-ancestors class Boolean(Bool): pass @@ -272,6 +273,71 @@ class DateTime64(ChSqlaType, SqlaDateTime): SqlaDateTime.__init__(self) +# pylint: disable=too-many-ancestors +class Time(ChSqlaType, Interval): + """ + Represents the ClickHouse Time type, which corresponds to a timedelta. + + Represents time durations in the range -999:59:59 to 999:59:59 with + second precision. Maps to Python timedelta objects. + """ + + def __init__(self, type_def: TypeDef = EMPTY_TYPE_DEF): + ChSqlaType.__init__(self, type_def) + Interval.__init__(self) + + def process_bind_param(self, value, dialect): + return value + + def process_result_value(self, value, dialect): + return value + + def process_literal_param(self, value, dialect): + return None + + +# pylint: disable=too-many-ancestors +class Time64(ChSqlaType, Interval): + """ + Represents the ClickHouse Time64 type with configurable precision. + + Represents time durations in the range -999:59:59.999999999 to + 999:59:59.999999999 configurable precision. Maps to Python timedelta objects. + If no precision is defined it default to 3. + """ + + def __init__(self, precision: int = None, type_def: TypeDef = None): + """ + Time64 constructor with precision if not constructed with TypeDef. + :param precision: 3 (ms), 6 (us), or 9 (ns) for sub-second precision. + :param type_def: TypeDef from parse_name function. + """ + if not type_def: + if precision is None: + precision = 3 + + if precision not in (3, 6, 9): + raise ArgumentError( + f"Invalid precision value {precision} for ClickHouse Time64. Must be 3, 6, or 9." + ) + type_def = TypeDef(values=(precision,)) + else: + precision = type_def.values[0] if len(type_def.values) > 0 else 3 + + ChSqlaType.__init__(self, type_def) + + Interval.__init__(self, second_precision=precision) + + def process_bind_param(self, value, dialect): + return value + + def process_result_value(self, value, dialect): + return value + + def process_literal_param(self, value, dialect): + return None + + class Nullable: """ Class "wrapper" to use in DDL construction. It is never actually initialized but instead creates the "wrapped" diff --git a/contrib/python/clickhouse-connect/clickhouse_connect/cc_sqlalchemy/ddl/custom.py b/contrib/python/clickhouse-connect/clickhouse_connect/cc_sqlalchemy/ddl/custom.py index 9f7c1e6401a..3b45b85641c 100644 --- a/contrib/python/clickhouse-connect/clickhouse_connect/cc_sqlalchemy/ddl/custom.py +++ b/contrib/python/clickhouse-connect/clickhouse_connect/cc_sqlalchemy/ddl/custom.py @@ -11,7 +11,7 @@ class CreateDatabase(DDL): """ # pylint: disable-msg=too-many-arguments def __init__(self, name: str, engine: str = None, zoo_path: str = None, shard_name: str = '{shard}', - replica_name: str = '{replica}'): + replica_name: str = '{replica}', exists_ok: bool = False): """ :param name: Database name :param engine: Database ClickHouse engine type @@ -21,7 +21,7 @@ class CreateDatabase(DDL): """ if engine and engine not in ('Ordinary', 'Atomic', 'Lazy', 'Replicated'): raise ArgumentError(f'Unrecognized engine type {engine}') - stmt = f'CREATE DATABASE {quote_identifier(name)}' + stmt = f"CREATE DATABASE {'IF NOT EXISTS ' if exists_ok else ''}{quote_identifier(name)}" if engine: stmt += f' Engine {engine}' if engine == 'Replicated': @@ -36,5 +36,5 @@ class DropDatabase(DDL): """ Alternative DDL statement for built in SqlAlchemy DropSchema DDL class """ - def __init__(self, name: str): - super().__init__(f'DROP DATABASE {quote_identifier(name)}') + def __init__(self, name: str, missing_ok: bool = False): + super().__init__(f"DROP DATABASE {'IF EXISTS ' if missing_ok else ''}{quote_identifier(name)}") diff --git a/contrib/python/clickhouse-connect/clickhouse_connect/cc_sqlalchemy/dialect.py b/contrib/python/clickhouse-connect/clickhouse_connect/cc_sqlalchemy/dialect.py index 0c1d7d79fe2..4a415914714 100644 --- a/contrib/python/clickhouse-connect/clickhouse_connect/cc_sqlalchemy/dialect.py +++ b/contrib/python/clickhouse-connect/clickhouse_connect/cc_sqlalchemy/dialect.py @@ -1,4 +1,3 @@ - from sqlalchemy import text from sqlalchemy.engine.default import DefaultDialect @@ -7,6 +6,7 @@ from clickhouse_connect import dbapi from clickhouse_connect.cc_sqlalchemy.inspector import ChInspector from clickhouse_connect.cc_sqlalchemy.sql import full_table from clickhouse_connect.cc_sqlalchemy.sql.ddlcompiler import ChDDLCompiler +from clickhouse_connect.cc_sqlalchemy.sql.compiler import ChStatementCompiler from clickhouse_connect.cc_sqlalchemy import ischema_names, dialect_name from clickhouse_connect.cc_sqlalchemy.sql.preparer import ChIdentifierPreparer from clickhouse_connect.driver.binding import quote_identifier, format_str @@ -27,22 +27,29 @@ class ClickHouseDialect(DefaultDialect): returns_unicode_strings = True postfetch_lastrowid = False ddl_compiler = ChDDLCompiler + statement_compiler = ChStatementCompiler preparer = ChIdentifierPreparer description_encoding = None max_identifier_length = 127 ischema_names = ischema_names inspector = ChInspector + # SQA 1 compatibility # pylint: disable=method-hidden @classmethod def dbapi(cls): return dbapi + # SQA 2 compatibility + # pylint: disable=method-hidden + @classmethod + def import_dbapi(cls): + return dbapi + def initialize(self, connection): pass - @staticmethod - def get_schema_names(connection, **_): + def get_schema_names(self, connection, **_): return [row.name for row in connection.execute('SHOW DATABASES')] @staticmethod @@ -95,6 +102,7 @@ class ClickHouseDialect(DefaultDialect): def has_sequence(self, connection, sequence_name, schema=None, **_kw): return False + # pylint: disable=duplicate-code def do_begin_twophase(self, connection, xid): raise NotImplementedError diff --git a/contrib/python/clickhouse-connect/clickhouse_connect/cc_sqlalchemy/inspector.py b/contrib/python/clickhouse-connect/clickhouse_connect/cc_sqlalchemy/inspector.py index d8936e67c23..53616d00f2a 100644 --- a/contrib/python/clickhouse-connect/clickhouse_connect/cc_sqlalchemy/inspector.py +++ b/contrib/python/clickhouse-connect/clickhouse_connect/cc_sqlalchemy/inspector.py @@ -2,6 +2,7 @@ import sqlalchemy.schema as sa_schema from sqlalchemy.engine.reflection import Inspector from sqlalchemy.orm.exc import NoResultFound +from sqlalchemy import text from clickhouse_connect.cc_sqlalchemy.datatypes.base import sqla_type_from_name from clickhouse_connect.cc_sqlalchemy.ddl.tableengine import build_engine @@ -12,8 +13,8 @@ ch_col_args = ('default_type', 'codec_expression', 'ttl_expression') def get_engine(connection, table_name, schema=None): - result_set = connection.execute( - f"SELECT engine_full FROM system.tables WHERE database = '{schema}' and name = '{table_name}'") + result_set = connection.execute(text( + f"SELECT engine_full FROM system.tables WHERE database = '{schema}' and name = '{table_name}'")) row = next(result_set, None) if not row: raise NoResultFound(f'Table {schema}.{table_name} does not exist') @@ -35,7 +36,7 @@ class ChInspector(Inspector): def get_columns(self, table_name, schema=None, **_kwargs): table_id = full_table(table_name, schema) - result_set = self.bind.execute(f'DESCRIBE TABLE {table_id}') + result_set = self.bind.execute(text(f'DESCRIBE TABLE {table_id}')) if not result_set: raise NoResultFound(f'Table {full_table} does not exist') columns = [] @@ -52,6 +53,3 @@ class ChInspector(Inspector): 'ttl_expression': row.ttl_expression} columns.append(col) return columns - - -ChInspector.reflecttable = ChInspector.reflect_table # Hack to provide backward compatibility for SQLAlchemy 1.3 diff --git a/contrib/python/clickhouse-connect/clickhouse_connect/cc_sqlalchemy/sql/compiler.py b/contrib/python/clickhouse-connect/clickhouse_connect/cc_sqlalchemy/sql/compiler.py new file mode 100644 index 00000000000..dab2099ef24 --- /dev/null +++ b/contrib/python/clickhouse-connect/clickhouse_connect/cc_sqlalchemy/sql/compiler.py @@ -0,0 +1,74 @@ +from sqlalchemy.exc import CompileError +from sqlalchemy.sql.compiler import SQLCompiler + +from clickhouse_connect.cc_sqlalchemy.sql import format_table + + +# pylint: disable=arguments-differ +class ChStatementCompiler(SQLCompiler): + + # pylint: disable=attribute-defined-outside-init + def visit_delete(self, delete_stmt, visiting_cte=None, **kw): + table = delete_stmt.table + text = f"DELETE FROM {format_table(table)}" + + if delete_stmt.whereclause is not None: + self._in_delete_where = True + try: + text += " WHERE " + self.process(delete_stmt.whereclause, **kw) + finally: + self._in_delete_where = False + else: + raise CompileError("ClickHouse DELETE statements require a WHERE clause. To delete all rows, use 'TRUNCATE TABLE' instead.") + + return text + + def visit_select(self, select_stmt, **kw): + return super().visit_select(select_stmt, **kw) + + def visit_join(self, join, **kw): + left = self.process(join.left, **kw) + right = self.process(join.right, **kw) + onclause = join.onclause + + if getattr(join, "full", False): + join_kw = " FULL OUTER JOIN " + elif onclause is None: + join_kw = " CROSS JOIN " + elif join.isouter: + join_kw = " LEFT OUTER JOIN " + else: + join_kw = " INNER JOIN " + + text = left + join_kw + right + + if onclause is not None: + text += " ON " + self.process(onclause, **kw) + + return text + + def visit_column(self, column, add_to_result_map=None, include_table=True, result_map_targets=(), ambiguous_table_name_map=None, **kw): + if getattr(self, "_in_delete_where", False): + return self.preparer.quote(column.name) + + return super().visit_column( + column, + add_to_result_map=add_to_result_map, + include_table=include_table, + result_map_targets=result_map_targets, + **kw, + ) + + # Abstract methods required by SQLCompiler + def delete_extra_from_clause(self, delete_stmt, from_table, extra_froms, from_hints, **kw): + raise NotImplementedError("ClickHouse doesn't support DELETE with extra FROM clause") + + def update_from_clause(self, update_stmt, from_table, extra_froms, from_hints, **kw): + raise NotImplementedError("ClickHouse doesn't support UPDATE with FROM clause") + + # pylint: disable=unused-argument + def visit_empty_set_expr(self, element_types, **kw): + return "SELECT 1 WHERE 1=0" + + def visit_sequence(self, sequence, **kw): + raise NotImplementedError("ClickHouse doesn't support sequences") diff --git a/contrib/python/clickhouse-connect/clickhouse_connect/common.py b/contrib/python/clickhouse-connect/clickhouse_connect/common.py index dd0f319dce9..fd56a26616b 100644 --- a/contrib/python/clickhouse-connect/clickhouse_connect/common.py +++ b/contrib/python/clickhouse-connect/clickhouse_connect/common.py @@ -77,6 +77,11 @@ _init_common('product_name', (), '') # Product name used as part of client iden _init_common('readonly', (0, 1), 0) # Implied "read_only" ClickHouse settings for versions prior to 19.17 _init_common('send_os_user', (True, False), True) +# Include integration tags (library name/version) in the User-Agent, e.g.: +# pandas/2.2.5; polars/0.20.x; sqlalchemy/2.0.x. These tags are only included +# when using relevant API methods. +_init_common('send_integration_tags', (True, False), True) + # Use the client protocol version This is needed for DateTime timezone columns but breaks with current version of # chproxy _init_common('use_protocol_version', (True, False), True) @@ -85,3 +90,8 @@ _init_common('max_error_size', (), 1024) # HTTP raw data buffer for streaming queries. This should not be reduced below 64KB to ensure compatibility with LZ4 compression _init_common('http_buffer_size', (), 10 * 1024 * 1024) + +# If True and using pandas 2.x, preserves the datetime64/timedelta64 +# dtype resolution (e.g., 's', 'ms', 'us', 'ns'). If False (or on +# pandas <2.x), coerces to nanosecond ('ns') resolution for compatibility. +_init_common('preserve_pandas_datetime_resolution', (True, False), False) diff --git a/contrib/python/clickhouse-connect/clickhouse_connect/datatypes/base.py b/contrib/python/clickhouse-connect/clickhouse_connect/datatypes/base.py index c41694b71a7..d40eefe5a55 100644 --- a/contrib/python/clickhouse-connect/clickhouse_connect/datatypes/base.py +++ b/contrib/python/clickhouse-connect/clickhouse_connect/datatypes/base.py @@ -41,6 +41,7 @@ class ClickHouseType(ABC): encoding = 'utf8' np_type = 'O' # Default to Numpy Object type nano_divisor = 0 # Only relevant for date like objects + pd_datetime_res = "ns" # Default date-like resolution for pd byte_size = 0 valid_formats = 'native' diff --git a/contrib/python/clickhouse-connect/clickhouse_connect/datatypes/dynamic.py b/contrib/python/clickhouse-connect/clickhouse_connect/datatypes/dynamic.py index c5057b2b1b1..d7a9b21a540 100644 --- a/contrib/python/clickhouse-connect/clickhouse_connect/datatypes/dynamic.py +++ b/contrib/python/clickhouse-connect/clickhouse_connect/datatypes/dynamic.py @@ -14,6 +14,8 @@ from clickhouse_connect.json_impl import any_to_json SHARED_DATA_TYPE: ClickHouseType STRING_DATA_TYPE: ClickHouseType +_JSON_NULL = b'null' +_JSON_NULL_STR = 'null' json_serialization_format = 0x1 @@ -129,13 +131,22 @@ def json_sample_size(_, sample: Collection) -> int: def write_json(ch_type: ClickHouseType, column: Sequence, dest: bytearray, ctx: InsertContext): + if ch_type.nullable: + dest += bytearray(1 if v is None else 0 for v in column) + first = first_value(column, ch_type.nullable) write_col = column encoding = ctx.encoding or ch_type.encoding if not isinstance(first, str) and ch_type.write_format(ctx) != 'string': to_json = any_to_json - write_col = [to_json(v) for v in column] + if ch_type.nullable: + write_col = [_JSON_NULL if v is None else to_json(v) for v in column] + else: + write_col = [to_json(v) for v in column] encoding = None + else: + write_col = [_JSON_NULL_STR if v is None else v for v in column] + handle_error(data_conv.write_str_col(write_col, ch_type.nullable, encoding, dest), ctx) diff --git a/contrib/python/clickhouse-connect/clickhouse_connect/datatypes/network.py b/contrib/python/clickhouse-connect/clickhouse_connect/datatypes/network.py index 064b612feb2..7f13820f5b2 100644 --- a/contrib/python/clickhouse-connect/clickhouse_connect/datatypes/network.py +++ b/contrib/python/clickhouse-connect/clickhouse_connect/datatypes/network.py @@ -1,5 +1,5 @@ import socket -from ipaddress import IPv4Address, IPv6Address +from ipaddress import ip_address, IPv4Address, IPv6Address from typing import Union, MutableSequence, Sequence, Any from clickhouse_connect.datatypes.base import ClickHouseType @@ -64,64 +64,60 @@ class IPv6(ClickHouseType): return self._read_binary_ip(source, num_rows) @staticmethod - def _read_binary_ip(source: ByteSource, num_rows: int): + def _read_binary_ip(source: ByteSource, num_rows: int) -> list[IPv6Address]: + """Read IPv6 addresses in native format, always returning IPv6Address objects.""" fast_ip_v6 = IPv6Address.__new__ - fast_ip_v4 = IPv4Address.__new__ with_scope_id = '_scope_id' in IPv6Address.__slots__ new_col = [] app = new_col.append ifb = int.from_bytes for _ in range(num_rows): int_value = ifb(source.read_bytes(16), 'big') - if int_value >> 32 == 0xFFFF: - ipv4 = fast_ip_v4(IPv4Address) - ipv4._ip = int_value & 0xFFFFFFFF - app(ipv4) - else: - ipv6 = fast_ip_v6(IPv6Address) - ipv6._ip = int_value - if with_scope_id: - ipv6._scope_id = None - app(ipv6) + ipv6 = fast_ip_v6(IPv6Address) + ipv6._ip = int_value + if with_scope_id: + ipv6._scope_id = None + app(ipv6) return new_col @staticmethod - def _read_binary_str(source: ByteSource, num_rows: int): + def _read_binary_str(source: ByteSource, num_rows: int) -> list[str]: + """Read IPv6 addresses in string format, always returning IPv6Address strings.""" new_col = [] app = new_col.append - v4mask = IPV4_V6_MASK - tov4 = socket.inet_ntoa tov6 = socket.inet_ntop af6 = socket.AF_INET6 for _ in range(num_rows): x = source.read_bytes(16) - if x[:12] == v4mask: - app(tov4(x[12:])) - else: - app(tov6(af6, x)) + # Always use IPv6 string representation, even for IPv4-mapped addresses + app(tov6(af6, x)) return new_col - def _write_column_binary(self, column: Union[Sequence, MutableSequence], dest: bytearray, ctx: InsertContext): - v = V6_NULL - first = first_value(column, self.nullable) - v4mask = IPV4_V6_MASK - af6 = socket.AF_INET6 - tov6 = socket.inet_pton - if isinstance(first, str): - for x in column: - if x is None: - dest += v - elif '.' in x: - dest += v4mask + bytes(int(b) for b in x.split('.')) - else: - dest += tov6(af6, x) - else: - for x in column: - if x is None: - dest += v - else: - b = x.packed - dest += b if len(b) == 16 else (v4mask + b) + def _write_column_binary( + self, + column: Union[Sequence, MutableSequence], + dest: bytearray, + ctx: InsertContext, + ): + """Write IPv6 addresses, promoting IPv4 addresses to IPv4-mapped IPv6 addresses.""" + for value in column: + if value is None: + dest += V6_NULL + continue + + try: + addr = ip_address(value) + except ValueError as e: + raise ValueError( + f"Failed to parse '{value}' as a valid IP address for column '{ctx.column_name}'" + ) from e + + # Now handle parsed object + if isinstance(addr, IPv6Address): + dest += addr.packed + elif isinstance(addr, IPv4Address): + # We have an IPv4, but the column is IPv6 so convert to IPv4-mapped. + dest += IPV4_V6_MASK + addr.packed def _active_null(self, ctx): if ctx.use_none: diff --git a/contrib/python/clickhouse-connect/clickhouse_connect/datatypes/numeric.py b/contrib/python/clickhouse-connect/clickhouse_connect/datatypes/numeric.py index 7e86b623e3f..e074c96f6fc 100644 --- a/contrib/python/clickhouse-connect/clickhouse_connect/datatypes/numeric.py +++ b/contrib/python/clickhouse-connect/clickhouse_connect/datatypes/numeric.py @@ -1,5 +1,7 @@ import decimal from typing import Union, Type, Sequence, MutableSequence, Any +import struct +import array from math import nan, isnan, isinf @@ -208,6 +210,80 @@ class Float64(Float): np_type = '<f8' +class BFloat16(ArrayType): + _array_type = "H" + python_type = float + np_type = "<f4" + + def _write_column_binary( + self, + column: Sequence[Any], + dest: bytearray, + ctx: InsertContext, + ): + if not column: + return + + if self.nullable: + first = next((x for x in column if x is not None), None) + if isinstance(first, float): + column = [0 if (x is None or isnan(x) or isinf(x)) else x for x in column] + else: + column = [0 if x is None else float(x) for x in column] + elif not isinstance(column[0], float): + column = [float(x) for x in column] + + vals = array.array("H") + extend = vals.extend + for x in column: + bits32 = struct.unpack("<I", struct.pack("<f", x))[0] + extend([bits32 >> 16]) + + write_array(self._array_type, vals, dest, ctx.column_name) + + def _read_column_binary( + self, source: ByteSource, num_rows: int, ctx: QueryContext, _read_state: Any + ): + if ctx.use_numpy: + arr16 = numpy_conv.read_numpy_array(source, "<u2", num_rows) + return (arr16.astype(np.uint32) << np.uint32(16)).view(np.float32) + + raw = source.read_array(self._array_type, num_rows) + return [struct.unpack("<f", struct.pack("<I", v << 16))[0] for v in raw] + + def _read_nullable_column( + self, source: ByteSource, num_rows: int, ctx: QueryContext, _read_state: Any + ): + null_map = source.read_bytes(num_rows) + + if ctx.use_numpy: + arr16 = numpy_conv.read_numpy_array(source, "<u2", num_rows) + floats = (arr16.astype(np.uint32) << np.uint32(16)).view(np.float32) + return data_conv.build_nullable_column( + floats, null_map, self._active_null(ctx) + ) + + raw = source.read_array(self._array_type, num_rows) + floats = [struct.unpack("<f", struct.pack("<I", v << 16))[0] for v in raw] + return data_conv.build_nullable_column(floats, null_map, self._active_null(ctx)) + + def _finalize_column(self, column, ctx: QueryContext): + if ctx.use_extended_dtypes and self.nullable: + return pd.array(column, dtype="Float32") + if ctx.use_numpy and not isinstance(column, np.ndarray): + return np.array(column, dtype=self.np_type) + return column + + def _active_null(self, ctx: QueryContext): + if ctx.use_extended_dtypes: + return nan + if ctx.use_none: + return None + if ctx.use_numpy: + return nan + return 0.0 + + class Bool(ClickHouseType): np_type = '?' python_type = bool @@ -388,3 +464,47 @@ class Decimal128(BigDecimal): class Decimal256(BigDecimal): dec_size = 256 + + +class IntervalNanosecond(Int32): + pass + + +class IntervalMicrosecond(Int32): + pass + + +class IntervalMillisecond(Int32): + pass + + +class IntervalSecond(Int32): + pass + + +class IntervalMinute(Int32): + pass + + +class IntervalHour(Int32): + pass + + +class IntervalDay(Int32): + pass + + +class IntervalWeek(Int32): + pass + + +class IntervalMonth(Int32): + pass + + +class IntervalQuarter(Int32): + pass + + +class IntervalYear(Int32): + pass diff --git a/contrib/python/clickhouse-connect/clickhouse_connect/datatypes/temporal.py b/contrib/python/clickhouse-connect/clickhouse_connect/datatypes/temporal.py index 4b416a9159f..a51b6584f8f 100644 --- a/contrib/python/clickhouse-connect/clickhouse_connect/datatypes/temporal.py +++ b/contrib/python/clickhouse-connect/clickhouse_connect/datatypes/temporal.py @@ -1,16 +1,22 @@ import pytz -from datetime import date, datetime, tzinfo -from typing import Union, Sequence, MutableSequence, Any +import array +from datetime import date, datetime, tzinfo, timedelta, time + +from typing import Union, Sequence, MutableSequence, Any, NamedTuple, Optional +from abc import abstractmethod +import re from clickhouse_connect.datatypes.base import TypeDef, ClickHouseType +from clickhouse_connect.common import get_setting +from clickhouse_connect.driver import tzutil from clickhouse_connect.driver.common import write_array, np_date_types, int_size, first_value from clickhouse_connect.driver.exceptions import ProgrammingError from clickhouse_connect.driver.ctypes import data_conv, numpy_conv from clickhouse_connect.driver.insert import InsertContext from clickhouse_connect.driver.query import QueryContext from clickhouse_connect.driver.types import ByteSource -from clickhouse_connect.driver.options import np, pd +from clickhouse_connect.driver.options import np, pd, IS_PANDAS_2 epoch_start_date = date(1970, 1, 1) epoch_start_datetime = datetime(1970, 1, 1) @@ -24,6 +30,12 @@ class Date(ClickHouseType): python_type = date byte_size = 2 + @property + def pandas_dtype(self): + if IS_PANDAS_2 and get_setting("preserve_pandas_datetime_resolution"): + return "datetime64[s]" + return f"datetime64[{self.pd_datetime_res}]" + def _read_column_binary(self, source: ByteSource, num_rows: int, ctx: QueryContext, _read_state:Any): if self.read_format(ctx) == 'int': return source.read_array(self._array_type, num_rows) @@ -56,14 +68,39 @@ class Date(ClickHouseType): if fmt == 'int': return 0 if ctx.use_numpy: - return np.datetime64(0) + return np.datetime64(0, self.pd_datetime_res) return epoch_start_date + # pylint: disable=too-many-return-statements def _finalize_column(self, column: Sequence, ctx: QueryContext) -> Sequence: if self.read_format(ctx) == 'int': return column + if ctx.use_numpy and self.nullable and not ctx.use_none: return np.array(column, dtype=self.np_type) + + if ctx.use_extended_dtypes: + if isinstance(column, np.ndarray) and np.issubdtype( + column.dtype, np.datetime64 + ): + return column.astype(self.pandas_dtype) + + if isinstance(column, pd.DatetimeIndex): + if column.tz is None: + return column.astype(self.pandas_dtype) + + naive = column.tz_localize(None).astype(self.pandas_dtype) + return pd.DatetimeIndex(naive, tz=column.tz) + + if self.nullable and isinstance(column, list): + return np.array([None if pd.isna(s) else s for s in column]).astype( + self.pandas_dtype + ) + + return pd.to_datetime(column, errors="coerce").to_numpy( + dtype=self.pandas_dtype, copy=False + ) + return column @@ -84,6 +121,13 @@ class DateTimeBase(ClickHouseType, registered=False): valid_formats = 'native', 'int' python_type = datetime + @property + def pandas_dtype(self): + """Sets dtype for pandas datetime objects""" + if IS_PANDAS_2 and get_setting("preserve_pandas_datetime_resolution"): + return "datetime64[s]" + return f"datetime64[{self.pd_datetime_res}]" + def _active_null(self, ctx: QueryContext): fmt = self.read_format(ctx) if ctx.use_extended_dtypes: @@ -93,9 +137,40 @@ class DateTimeBase(ClickHouseType, registered=False): if self.read_format(ctx) == 'int': return 0 if ctx.use_numpy: - return np.datetime64(0) + return np.datetime64(0, self.pd_datetime_res) return epoch_start_datetime + def _finalize_column(self, column: Sequence, ctx: QueryContext) -> Sequence: + """Ensure every datetime-like column is at nanosecond resolution, preserving any tz.""" + if ctx.use_extended_dtypes: + if isinstance(column, np.ndarray) and np.issubdtype( + column.dtype, np.datetime64 + ): + return column.astype(self.pandas_dtype) + + if isinstance(column, pd.DatetimeIndex) or ( + isinstance(column, list) + and hasattr(next((s for s in column if not pd.isna(s)), None), "tz") + ): + if isinstance(column, list): + column = pd.DatetimeIndex(column) + + if column.tz is None: + result = column.astype(self.pandas_dtype) + return pd.array(result) if self.nullable else result + + naive_ns = column.tz_localize(None).astype(self.pandas_dtype) + tz_aware_result = pd.DatetimeIndex(naive_ns, tz=column.tz) + return ( + pd.array(tz_aware_result) if self.nullable else tz_aware_result + ) + + if self.nullable: + return pd.array( + [None if pd.isna(s) else s for s in column], dtype=self.pandas_dtype + ) + return column + class DateTime(DateTimeBase): _array_type = 'L' if int_size == 2 else 'I' @@ -151,6 +226,13 @@ class DateTime64(DateTimeBase): self.tzinfo = None @property + def pandas_dtype(self): + """Sets dtype for pandas datetime objects""" + if IS_PANDAS_2 and get_setting("preserve_pandas_datetime_resolution"): + return f"datetime64{self.unit}" + return f"datetime64[{self.pd_datetime_res}]" + + @property def np_type(self): if self.unit: return f'datetime64{self.unit}' @@ -189,7 +271,7 @@ class DateTime64(DateTimeBase): def _read_binary_naive(self, column: Sequence): new_col = [] app = new_col.append - dt_from = datetime.utcfromtimestamp + dt_from = tzutil.utcfromtimestamp prec = self.prec for ticks in column: seconds = ticks // prec @@ -222,3 +304,465 @@ class DateTime64(DateTimeBase): else: column = [((int(x.timestamp()) * 1000000 + x.microsecond) * prec) // 1000000 for x in column] write_array('q', column, dest, ctx.column_name) + + +class _HMSParts(NamedTuple): + """Internal structure for parsed HMS time components.""" + + hours: int + minutes: int + seconds: int + frac: Optional[str] + is_negative: bool + + +class TimeBase(ClickHouseType, registered=False): + """ + Abstract base for ClickHouse Time and Time64 types. + + Subclasses must define: + - _array_type: Array type specifier (e.g. 'i' or 'q') + - byte_size: Size in bytes for binary representation + - np_type: NumPy array type (e.g. 'timedelta64[s]' or 'timedelta64[ns]') + + And implement these abstract methods: + - _string_to_ticks(self, str) -> int + - _timedelta_to_ticks(self, timedelta) -> int + - _ticks_to_timedelta(self, int) -> timedelta + - _ticks_to_string(self, int) -> str + - max_ticks and min_ticks properties + """ + + _HMS_RE = re.compile( + r"""^\s* + (?P<sign>-?) + (?P<hours>\d+): + (?P<minutes>\d+): + (?P<seconds>\d+) + (?:\.(?P<frac>\d+))? + \s*$""", + re.VERBOSE, + ) + + MAX_TIME_SECONDS = 999 * 3600 + 59 * 60 + 59 # 999:59:59 + MIN_TIME_SECONDS = -MAX_TIME_SECONDS # -999:59:59 + _MICROS_PER_SECOND = 1_000_000 + _NANOS_PER_SECOND = 1_000_000_000 + _SECONDS_PER_DAY = 86_400 + + _array_type: str + byte_size: int + np_type: str + valid_formats = ("native", "string", "int", "time") + python_type = timedelta + + def _read_column_binary( + self, + source: ByteSource, + num_rows: int, + ctx: QueryContext, + _read_state: Any, + ) -> Sequence: + """Read binary column data and convert to requested format.""" + ticks = source.read_array(self._array_type, num_rows) + fmt = self.read_format(ctx) + + if ctx.use_numpy: + return np.array( + [self._ticks_to_np_timedelta(t) for t in ticks], dtype=self.np_type + ) + + if fmt == "int": + return ticks + + if fmt == "string": + return [self._ticks_to_string(t) for t in ticks] + + if fmt == "time": + return [self._ticks_to_time(t) for t in ticks] + + return [self._ticks_to_timedelta(t) for t in ticks] + + def _write_column_binary( + self, + column: Sequence, + dest: bytearray, + ctx: InsertContext, + ): + """Write column data in binary format.""" + ticks = self._to_ticks_array(column) + write_array(self._array_type, ticks, dest, ctx.column_name) + + def _parse_core(self, time_str: str) -> _HMSParts: + """Parse an hhh:mm:ss[.fff] time literal.""" + match = self._HMS_RE.match(time_str) + if not match: + raise ValueError(f"Invalid time literal {time_str}") + + hours = int(match["hours"]) + minutes = int(match["minutes"]) + seconds = int(match["seconds"]) + + if hours > 999: + raise ValueError( + f"Hours out of range; cannot exceed 999: got {hours} in '{time_str}'" + ) + if not 0 <= minutes < 60: + raise ValueError( + f"Minutes out of range; must be 0-59: got {minutes} in '{time_str}'" + ) + if not 0 <= seconds < 60: + raise ValueError( + f"Seconds out of range; must be 0-59: got {seconds} in '{time_str}'" + ) + + return _HMSParts( + hours=hours, + minutes=minutes, + seconds=seconds, + frac=match["frac"], + is_negative=bool(match["sign"]), + ) + + def _to_ticks_array(self, column: Sequence) -> Sequence[int]: + """Convert column data to internal tick representation.""" + first = first_value(column, self.nullable) + expected_type = type(first) if first is not None else None + + if expected_type is None: + if self.nullable: + return [0] * len(column) + return [] + + converter_map = { + timedelta: self._timedelta_to_ticks, + time: self._time_to_ticks, + float: self._numerical_to_ticks, + int: self._numerical_to_ticks, + str: self._string_to_ticks, + } + if np is not None: + converter_map[np.timedelta64] = self._timedelta_to_ticks + converter_map[np.int64] = self._numerical_to_ticks + converter = converter_map.get(expected_type, None) + + if converter is None: + raise TypeError( + f"Unsupported column type '{expected_type.__name__}' for {self.__class__.__name__}. " + "Expected 'int', 'str', 'time', or 'timedelta'." + ) + + if self.nullable: + return [converter(x) if x is not None else 0 for x in column] + + return [converter(x) for x in column] + + def _validate_standard_range(self, ticks: int, original: Any) -> None: + """Validate that ticks is within valid ClickHouse range.""" + if not self.min_ticks <= ticks <= self.max_ticks: + raise ValueError(f"{original} out of range for {self.__class__.__name__}") + + def _validate_time_obj_range(self, ticks: int) -> None: + """Ensure ticks can form a valid datetime.time object.""" + if not self.min_time_ticks <= ticks <= self.max_time_ticks: + raise ValueError( + f"Ticks value {ticks} is outside valid range for datetime.time object." + ) + + def _numerical_to_ticks(self, value: Union[int, float, "np.int64"]) -> int: + """Convert numerical value to ticks, with range validation.""" + value = int(value) + self._validate_standard_range(value, value) + return value + + def _active_null(self, ctx: QueryContext): + """Return appropriate null value based on context.""" + fmt = self.read_format(ctx) + if ctx.use_extended_dtypes: + return pd.NA if fmt == "int" else pd.NaT + if ctx.use_none: + return None + if fmt == "int": + return 0 + if fmt == "string": + return "00:00:00" + if ctx.use_numpy: + return np.timedelta64("NaT") + + return timedelta(0) + + @property + def pandas_dtype(self): + """Sets dtype for pandas datetime objects""" + if IS_PANDAS_2 and get_setting("preserve_pandas_datetime_resolution"): + return "timedelta64[s]" + return f"timedelta64[{self.pd_datetime_res}]" + + def _finalize_column(self, column: Sequence, ctx: QueryContext) -> Sequence: + """Finalize column data based on context requirements.""" + if ctx.use_extended_dtypes: + if isinstance(column, np.ndarray) and np.issubdtype( + column.dtype, np.timedelta64 + ): + return column.astype(self.pandas_dtype) + + if isinstance(column, pd.TimedeltaIndex): + return column.astype(self.pandas_dtype) + + if self.nullable: + return np.array([None if pd.isna(s) else s for s in column]).astype( + self.pandas_dtype + ) + return column + + def _build_lc_column(self, index: Sequence, keys: array.array, ctx: QueryContext): + """Build low-cardinality column from index and keys.""" + if ctx.use_numpy: + return np.array([index[k] for k in keys], dtype=self.np_type) + + return super()._build_lc_column(index, keys, ctx) + + @abstractmethod + def _string_to_ticks(self, time_str: str) -> int: + """Parse a string into integer ticks.""" + raise NotImplementedError + + @abstractmethod + def _timedelta_to_ticks(self, td: Union[timedelta, "np.timedelta64"]) -> int: + """Convert a timedelta into integer ticks.""" + raise NotImplementedError + + @abstractmethod + def _ticks_to_time(self, ticks: int) -> time: + """Convert integer ticks into a time.""" + raise NotImplementedError + + @abstractmethod + def _time_to_ticks(self, t: time) -> int: + """Convert a time into integer ticks.""" + raise NotImplementedError + + @abstractmethod + def _ticks_to_timedelta(self, ticks: int) -> timedelta: + """Convert integer ticks into a timedelta.""" + raise NotImplementedError + + @abstractmethod + def _ticks_to_np_timedelta(self, ticks: int) -> timedelta: + """Convert integer ticks into an np.timedelta.""" + raise NotImplementedError + + @abstractmethod + def _ticks_to_string(self, ticks: int) -> str: + """Format integer ticks as a string.""" + raise NotImplementedError + + @property + def min_time_ticks(self) -> int: + """Minimum tick value representable by datetime.time type.""" + return 0 + + @property + @abstractmethod + def max_time_ticks(self) -> int: + """Maximum tick value representable by datetime.time type.""" + raise NotImplementedError + + @property + @abstractmethod + def max_ticks(self) -> int: + """Maximum tick value representable by this type.""" + raise NotImplementedError + + @property + @abstractmethod + def min_ticks(self) -> int: + """Minimum tick value representable by this type.""" + raise NotImplementedError + + +class Time(TimeBase): + """ClickHouse Time type with second precision.""" + + _array_type = "i" + byte_size = 4 + np_type = "timedelta64[s]" + + @property + def max_ticks(self) -> int: + return self.MAX_TIME_SECONDS + + @property + def min_ticks(self) -> int: + return self.MIN_TIME_SECONDS + + @property + def max_time_ticks(self) -> int: + return self._SECONDS_PER_DAY - 1 + + def _string_to_ticks(self, time_str: str) -> int: + """Parse string format 'HHH:MM:SS[.fff]' to ticks (seconds), flooring fractional seconds.""" + parts = self._parse_core(time_str) + ticks = parts.hours * 3600 + parts.minutes * 60 + parts.seconds + + if parts.is_negative: + ticks = -ticks + self._validate_standard_range(ticks, time_str) + + return ticks + + def _ticks_to_string(self, ticks: int) -> str: + """Format ticks (seconds) as 'HHH:MM:SS' string.""" + sign = "-" if ticks < 0 else "" + t = abs(ticks) + h, rem = divmod(t, 3600) + m, s = divmod(rem, 60) + + return f"{sign}{h:03d}:{m:02d}:{s:02d}" + + def _timedelta_to_ticks(self, td: Union[timedelta, "np.timedelta64"]) -> int: + """Convert timedelta to ticks (seconds), flooring fractional seconds.""" + if isinstance(td, timedelta): + total = int(td.total_seconds()) + else: + total = td.astype("timedelta64[s]").astype(int) + self._validate_standard_range(total, td) + + return total + + def _ticks_to_timedelta(self, ticks: int) -> timedelta: + """Convert ticks (seconds) to timedelta.""" + return timedelta(seconds=ticks) + + def _ticks_to_np_timedelta(self, ticks: int) -> timedelta: + """Convert ticks (seconds) to np.timedelta.""" + return np.timedelta64(ticks, "s") + + def _time_to_ticks(self, t: time) -> int: + """Converts time to ticks (seconds), flooring fraction seconds.""" + return t.hour * 3600 + t.minute * 60 + t.second + + def _ticks_to_time(self, ticks: int) -> time: + """Converts ticks (seconds) to time.""" + self._validate_time_obj_range(ticks) + h, rem = divmod(ticks, 3600) + m, s = divmod(rem, 60) + + return time(hour=h, minute=m, second=s) + + +class Time64(TimeBase): + """ClickHouse Time64 type with configurable sub-second precision.""" + + __slots__ = ("scale", "precision", "unit") + _array_type = "q" + byte_size = 8 + + def __init__(self, type_def): + super().__init__(type_def) + self._name_suffix = type_def.arg_str + self.scale = type_def.values[0] + if self.scale not in (3, 6, 9): + raise ProgrammingError( + f"Unsupported Time64 scale {self.scale}; " + "only 3, 6, or 9 are allowed for NumPy." + ) + self.precision = 10**self.scale + self.unit = np_date_types.get(self.scale) + + @property + def pandas_dtype(self): + """Sets dtype for pandas datetime objects""" + if IS_PANDAS_2 and get_setting("preserve_pandas_datetime_resolution"): + return f"timedelta64{self.unit}" + return f"timedelta64[{self.pd_datetime_res}]" + + @property + def max_time_ticks(self) -> int: + return self._SECONDS_PER_DAY * self.precision - 1 + + @property + def np_type(self) -> str: + return f"timedelta64{self.unit}" + + @property + def max_ticks(self) -> int: + return self.MAX_TIME_SECONDS * self.precision + (self.precision - 1) + + @property + def min_ticks(self) -> int: + return -self.max_ticks + + def _string_to_ticks(self, time_str: str) -> int: + """Parse string format 'HHH:MM:SS[.fff]' to ticks with sub-second precision.""" + parts = self._parse_core(time_str) + frac_ticks = int((parts.frac or "").ljust(self.scale, "0")[: self.scale]) + ticks = ( + parts.hours * 3600 + parts.minutes * 60 + parts.seconds + ) * self.precision + frac_ticks + if parts.is_negative: + ticks = -ticks + self._validate_standard_range(ticks, time_str) + + return ticks + + def _ticks_to_string(self, ticks: int) -> str: + """Format ticks as 'HHH:MM:SS[.fff]' string with sub-second precision.""" + sign = "-" if ticks < 0 else "" + t = abs(ticks) + sec_part, frac_part = divmod(t, self.precision) + h, rem = divmod(sec_part, 3600) + m, s = divmod(rem, 60) + frac_str = f".{frac_part:0{self.scale}d}" if self.scale else "" + + return f"{sign}{h:03d}:{m:02d}:{s:02d}{frac_str}" + + def _timedelta_to_ticks(self, td: Union[timedelta, "np.timedelta64"]) -> int: + """Convert timedelta to ticks with sub-second precision.""" + if isinstance(td, timedelta): + total_us = ( + int(td.total_seconds()) * self._MICROS_PER_SECOND + td.microseconds + ) + ticks = (total_us * self.precision) // self._MICROS_PER_SECOND + else: + ticks = td.astype("timedelta64[s]").astype(int) + self._validate_standard_range(ticks, td) + + return ticks + + def _ticks_to_timedelta(self, ticks: int) -> timedelta: + """Convert ticks to timedelta with microsecond precision.""" + neg = ticks < 0 + t = abs(ticks) + sec_part = t // self.precision + frac_part = t - sec_part * self.precision + micros = (frac_part * self._MICROS_PER_SECOND) // self.precision + td = timedelta(seconds=sec_part, microseconds=micros) + + return -td if neg else td + + def _ticks_to_np_timedelta(self, ticks: int) -> "np.timedelta64": + """Convert ticks to numpy timedelta64 with nanosecond precision.""" + res_map = {3: "ms", 6: "us", 9: "ns"} + + return np.timedelta64(ticks, res_map.get(self.scale)) + + def _time_to_ticks(self, t: time) -> int: + """Convert time to ticks with sub-second precision.""" + total_us = ( + t.hour * 3600 + t.minute * 60 + t.second + ) * self._MICROS_PER_SECOND + t.microsecond + ticks = (total_us * self.precision) // self._MICROS_PER_SECOND + self._validate_time_obj_range(ticks) + + return ticks + + def _ticks_to_time(self, ticks: int) -> time: + """Convert ticks to time with microsecond precision.""" + self._validate_time_obj_range(ticks) + sec_part, frac_part = divmod(ticks, self.precision) + h, rem = divmod(sec_part, 3600) + m, s = divmod(rem, 60) + micros = (frac_part * self._MICROS_PER_SECOND) // self.precision + + return time(hour=h, minute=m, second=s, microsecond=micros) diff --git a/contrib/python/clickhouse-connect/clickhouse_connect/dbapi/connection.py b/contrib/python/clickhouse-connect/clickhouse_connect/dbapi/connection.py index d1b3cb7f3cd..2c5bff6741e 100644 --- a/contrib/python/clickhouse-connect/clickhouse_connect/dbapi/connection.py +++ b/contrib/python/clickhouse-connect/clickhouse_connect/dbapi/connection.py @@ -29,6 +29,8 @@ class Connection: secure=secure, dsn=dsn, generic_args=kwargs) + + self.client._add_integration_tag("sqlalchemy") self.timezone = self.client.server_tz def close(self): diff --git a/contrib/python/clickhouse-connect/clickhouse_connect/driver/__init__.py b/contrib/python/clickhouse-connect/clickhouse_connect/driver/__init__.py index 644bac7b96a..be72481ee56 100644 --- a/contrib/python/clickhouse-connect/clickhouse_connect/driver/__init__.py +++ b/contrib/python/clickhouse-connect/clickhouse_connect/driver/__init__.py @@ -76,6 +76,9 @@ def create_client(*, validity. This option can be used if using an ssh_tunnel or other indirect means to an ClickHouse server where the `host` argument refers to the tunnel or proxy and not the actual ClickHouse server :param autogenerate_session_id If set, this will override the 'autogenerate_session_id' common setting. + :param form_encode_query_params If True, query parameters will be sent as form-encoded data in the request body + instead of as URL parameters. This is useful for queries with large parameter sets that might exceed URL length + limits. Only available for query operations (not inserts). Default: False :return: ClickHouse Connect Client instance """ if dsn: @@ -131,8 +134,8 @@ def default_port(interface: str, secure: bool): async def create_async_client(*, - host: str = None, - username: str = None, + host: Optional[str] = None, + username: Optional[str] = None, password: str = '', database: str = '__default__', interface: Optional[str] = None, @@ -194,6 +197,9 @@ async def create_async_client(*, validity. This option can be used if using an ssh_tunnel or other indirect means to an ClickHouse server where the `host` argument refers to the tunnel or proxy and not the actual ClickHouse server :param autogenerate_session_id If set, this will override the 'autogenerate_session_id' common setting. + :param form_encode_query_params If True, query parameters will be sent as form-encoded data in the request body + instead of as URL parameters. This is useful for queries with large parameter sets that might exceed URL length + limits. Only available for query operations (not inserts). Default: False :return: ClickHouse Connect Client instance """ diff --git a/contrib/python/clickhouse-connect/clickhouse_connect/driver/asyncclient.py b/contrib/python/clickhouse-connect/clickhouse_connect/driver/asyncclient.py index 4bb9f080948..feaa4e22903 100644 --- a/contrib/python/clickhouse-connect/clickhouse_connect/driver/asyncclient.py +++ b/contrib/python/clickhouse-connect/clickhouse_connect/driver/asyncclient.py @@ -343,6 +343,46 @@ class AsyncClient: result = await loop.run_in_executor(self.executor, _query_df) return result + async def query_df_arrow( + self, + query: str, + parameters: Optional[Union[Sequence, Dict[str, Any]]] = None, + settings: Optional[Dict[str, Any]] = None, + use_strings: Optional[bool] = None, + external_data: Optional[ExternalData] = None, + transport_settings: Optional[Dict[str, str]] = None, + dataframe_library: str = "pandas", + ) -> Union["pd.DataFrame", "pl.DataFrame"]: + """ + Query method using the ClickHouse Arrow format to return a DataFrame + with PyArrow dtype backend. This provides better performance and memory efficiency + compared to the standard query_df method, though fewer output formatting options. + + :param query: Query statement/format string + :param parameters: Optional dictionary used to format the query + :param settings: Optional dictionary of ClickHouse settings (key/string values) + :param use_strings: Convert ClickHouse String type to Arrow string type (instead of binary) + :param external_data: ClickHouse "external data" to send with query + :param transport_settings: Optional dictionary of transport level settings (HTTP headers, etc.) + :param dataframe_library: Library to use for DataFrame creation ("pandas" or "polars") + :return: DataFrame (pandas or polars based on dataframe_library parameter) + """ + + def _query_df_arrow(): + return self.client.query_df_arrow( + query=query, + parameters=parameters, + settings=settings, + use_strings=use_strings, + external_data=external_data, + transport_settings=transport_settings, + dataframe_library=dataframe_library + ) + + loop = asyncio.get_running_loop() + result = await loop.run_in_executor(self.executor, _query_df_arrow) + return result + async def query_df_stream(self, query: Optional[str] = None, parameters: Optional[Union[Sequence, Dict[str, Any]]] = None, @@ -378,6 +418,45 @@ class AsyncClient: result = await loop.run_in_executor(self.executor, _query_df_stream) return result + async def query_df_arrow_stream( + self, + query: str, + parameters: Optional[Union[Sequence, Dict[str, Any]]] = None, + settings: Optional[Dict[str, Any]] = None, + use_strings: Optional[bool] = None, + external_data: Optional[ExternalData] = None, + transport_settings: Optional[Dict[str, str]] = None, + dataframe_library: str = "pandas" + ) -> StreamContext: + """ + Query method that returns the results as a stream of DataFrames with PyArrow dtype backend. + Each DataFrame represents a block from the ClickHouse response. + + :param query: Query statement/format string + :param parameters: Optional dictionary used to format the query + :param settings: Optional dictionary of ClickHouse settings (key/string values) + :param use_strings: Convert ClickHouse String type to Arrow string type (instead of binary) + :param external_data: ClickHouse "external data" to send with query + :param transport_settings: Optional dictionary of transport level settings (HTTP headers, etc.) + :param dataframe_library: Library to use for DataFrame creation ("pandas" or "polars") + :return: StreamContext that yields DataFrames (pandas or polars based on dataframe_library parameter) + """ + + def _query_df_arrow_stream(): + return self.client.query_df_arrow_stream( + query=query, + parameters=parameters, + settings=settings, + use_strings=use_strings, + external_data=external_data, + transport_settings=transport_settings, + dataframe_library=dataframe_library + ) + + loop = asyncio.get_running_loop() + result = await loop.run_in_executor(self.executor, _query_df_arrow_stream) + return result + def create_query_context(self, query: Optional[Union[str, bytes]] = None, parameters: Optional[Union[Sequence, Dict[str, Any]]] = None, @@ -501,7 +580,7 @@ class AsyncClient: cmd: str, parameters: Optional[Union[Sequence, Dict[str, Any]]] = None, data: Union[str, bytes] = None, - settings: Dict[str, Any] = None, + settings: Optional[Dict[str, Any]] = None, use_database: bool = True, external_data: Optional[ExternalData] = None, transport_settings: Optional[Dict[str, str]] = None) -> Union[str, int, Sequence[str], QuerySummary]: @@ -642,6 +721,43 @@ class AsyncClient: result = await loop.run_in_executor(self.executor, _insert_arrow) return result + async def insert_df_arrow( + self, + table: str, + df: Union["pd.DataFrame", "pl.DataFrame"], + database: Optional[str] = None, + settings: Optional[Dict] = None, + transport_settings: Optional[Dict[str, str]] = None, + ) -> QuerySummary: + """ + Insert a pandas DataFrame with PyArrow backend or a polars DataFrame into ClickHouse using Arrow format. + This method is optimized for DataFrames that already use Arrow format, providing + better performance than the standard insert_df method. + + Validation is performed and an exception will be raised if this requirement is not met. + Polars DataFrames are natively Arrow-based and don't require additional validation. + + :param table: ClickHouse table name + :param df: Pandas DataFrame with PyArrow dtype backend or Polars DataFrame + :param database: Optional ClickHouse database name + :param settings: Optional dictionary of ClickHouse settings (key/string values) + :param transport_settings: Optional dictionary of transport level settings (HTTP headers, etc.) + :return: QuerySummary with summary information, throws exception if insert fails + """ + + def _insert_df_arrow(): + return self.client.insert_df_arrow( + table=table, + df=df, + database=database, + settings=settings, + transport_settings=transport_settings, + ) + + loop = asyncio.get_running_loop() + result = await loop.run_in_executor(self.executor, _insert_df_arrow) + return result + async def create_insert_context(self, table: str, column_names: Optional[Union[str, Sequence[str]]] = None, diff --git a/contrib/python/clickhouse-connect/clickhouse_connect/driver/client.py b/contrib/python/clickhouse-connect/clickhouse_connect/driver/client.py index 86556f46c77..77dfcb9a9d0 100644 --- a/contrib/python/clickhouse-connect/clickhouse_connect/driver/client.py +++ b/contrib/python/clickhouse-connect/clickhouse_connect/driver/client.py @@ -16,10 +16,10 @@ from clickhouse_connect.datatypes import dynamic as dynamic_module from clickhouse_connect.driver import tzutil from clickhouse_connect.driver.common import dict_copy, StreamContext, coerce_int, coerce_bool from clickhouse_connect.driver.constants import CH_VERSION_WITH_PROTOCOL, PROTOCOL_VERSION_WITH_LOW_CARD -from clickhouse_connect.driver.exceptions import ProgrammingError, OperationalError +from clickhouse_connect.driver.exceptions import ProgrammingError, OperationalError, DataError from clickhouse_connect.driver.external import ExternalData from clickhouse_connect.driver.insert import InsertContext -from clickhouse_connect.driver.options import check_arrow, check_pandas, check_numpy +from clickhouse_connect.driver.options import check_arrow, check_pandas, check_numpy, check_polars, pd, arrow, pl, IS_PANDAS_2 from clickhouse_connect.driver.summary import QuerySummary from clickhouse_connect.driver.models import ColumnDef, SettingDef, SettingStatus from clickhouse_connect.driver.query import QueryResult, to_arrow, to_arrow_batches, QueryContext, arrow_buffer @@ -30,6 +30,7 @@ logger = logging.getLogger(__name__) arrow_str_setting = 'output_format_arrow_string_as_string' +# pylint: disable=too-many-lines # pylint: disable=too-many-public-methods,too-many-arguments,too-many-positional-arguments,too-many-instance-attributes class Client(ABC): """ @@ -357,6 +358,7 @@ class Client(ABC): :return: Numpy array representing the result set """ check_numpy() + self._add_integration_tag("numpy") return self._context_query(locals(), use_numpy=True).np_result # pylint: disable=duplicate-code,too-many-arguments,unused-argument @@ -378,6 +380,7 @@ class Client(ABC): :return: Generator that yield a numpy array per block representing the result set """ check_numpy() + self._add_integration_tag("numpy") return self._context_query(locals(), use_numpy=True, streaming=True).np_stream # pylint: disable=duplicate-code,unused-argument @@ -403,6 +406,7 @@ class Client(ABC): :return: Pandas dataframe representing the result set """ check_pandas() + self._add_integration_tag("pandas") return self._context_query(locals(), use_numpy=True, as_pandas=True).df_result # pylint: disable=duplicate-code,unused-argument @@ -428,6 +432,7 @@ class Client(ABC): :return: Generator that yields a Pandas dataframe per block representing the result set """ check_pandas() + self._add_integration_tag("pandas") return self._context_query(locals(), use_numpy=True, as_pandas=True, streaming=True).df_stream @@ -547,6 +552,7 @@ class Client(ABC): :return: PyArrow.Table """ check_arrow() + self._add_integration_tag("arrow") settings = self._update_arrow_settings(settings, use_strings) return to_arrow(self.raw_query(query, parameters, @@ -573,6 +579,7 @@ class Client(ABC): :return: Generator that yields a PyArrow.Table for per block representing the result set """ check_arrow() + self._add_integration_tag("arrow") settings = self._update_arrow_settings(settings, use_strings) return to_arrow_batches(self.raw_stream(query, parameters, @@ -581,6 +588,111 @@ class Client(ABC): external_data=external_data, transport_settings=transport_settings)) + def query_df_arrow(self, + query: str, + parameters: Optional[Union[Sequence, Dict[str, Any]]] = None, + settings: Optional[Dict[str, Any]] = None, + use_strings: Optional[bool] = None, + external_data: Optional[ExternalData] = None, + transport_settings: Optional[Dict[str, str]] = None, + dataframe_library: str = "pandas" + ) -> Union["pd.DataFrame", "pl.DataFrame"]: + """ + Query method using the ClickHouse Arrow format to return a DataFrame + with PyArrow dtype backend. This provides better performance and memory efficiency + compared to the standard query_df method, though fewer output formatting options. + + :param query: Query statement/format string + :param parameters: Optional dictionary used to format the query + :param settings: Optional dictionary of ClickHouse settings (key/string values) + :param use_strings: Convert ClickHouse String type to Arrow string type (instead of binary) + :param external_data: ClickHouse "external data" to send with query + :param transport_settings: Optional dictionary of transport level settings (HTTP headers, etc.) + :param dataframe_library: Library to use for DataFrame creation ("pandas" or "polars") + :return: DataFrame (pandas or polars based on dataframe_library parameter) + """ + check_arrow() + + if dataframe_library == "pandas": + check_pandas() + self._add_integration_tag("pandas") + if not IS_PANDAS_2: + raise ProgrammingError("PyArrow-backed dtypes are only supported when using pandas 2.x.") + + def converter(table: arrow.Table) -> pd.DataFrame: + return table.to_pandas(types_mapper=pd.ArrowDtype, safe=False) + + elif dataframe_library == "polars": + check_polars() + self._add_integration_tag("polars") + + def converter(table: arrow.Table) -> pl.DataFrame: + return pl.from_arrow(table) + + else: + raise ValueError(f"dataframe_library must be 'pandas' or 'polars', got '{dataframe_library}'") + + arrow_table = self.query_arrow( + query=query, + parameters=parameters, + settings=settings, + use_strings=use_strings, + external_data=external_data, + transport_settings=transport_settings, + ) + + return converter(arrow_table) + + def query_df_arrow_stream(self, + query: str, + parameters: Optional[Union[Sequence, Dict[str, Any]]] = None, + settings: Optional[Dict[str, Any]] = None, + use_strings: Optional[bool] = None, + external_data: Optional[ExternalData] = None, + transport_settings: Optional[Dict[str, str]] = None, + dataframe_library: str = "pandas") -> StreamContext: + """ + Query method that returns the results as a stream of DataFrames with PyArrow dtype backend. + Each DataFrame represents a block from the ClickHouse response. + + :param query: Query statement/format string + :param parameters: Optional dictionary used to format the query + :param settings: Optional dictionary of ClickHouse settings (key/string values) + :param use_strings: Convert ClickHouse String type to Arrow string type (instead of binary) + :param external_data: ClickHouse "external data" to send with query + :param transport_settings: Optional dictionary of transport level settings (HTTP headers, etc.) + :param dataframe_library: Library to use for DataFrame creation ("pandas" or "polars") + :return: StreamContext that yields DataFrames (pandas or polars based on dataframe_library parameter) + """ + check_arrow() + if dataframe_library == "pandas": + check_pandas() + self._add_integration_tag("pandas") + if not IS_PANDAS_2: + raise ProgrammingError("PyArrow-backed dtypes are only supported when using pandas 2.x.") + + def converter(table: "arrow.Table") -> "pd.DataFrame": + return table.to_pandas(types_mapper=pd.ArrowDtype, safe=False) + elif dataframe_library == "polars": + check_polars() + self._add_integration_tag("polars") + + def converter(table: arrow.Table) -> pl.DataFrame: + return pl.from_arrow(table) + else: + raise ValueError(f"dataframe_library must be 'pandas' or 'polars', got '{dataframe_library}'") + settings = self._update_arrow_settings(settings, use_strings) + raw_stream = self.raw_stream( + query, parameters, settings, fmt="ArrowStream", external_data=external_data, transport_settings=transport_settings + ) + reader = arrow.ipc.open_stream(raw_stream) + + def df_generator(): + for batch in reader: + yield converter(batch) + + return StreamContext(raw_stream, df_generator()) + def _update_arrow_settings(self, settings: Optional[Dict[str, Any]], use_strings: Optional[bool]) -> Dict[str, Any]: @@ -702,6 +814,7 @@ class Client(ABC): :return: QuerySummary with summary information, throws exception if insert fails """ check_pandas() + self._add_integration_tag("pandas") if context is None: if column_names is None: column_names = df.columns @@ -731,11 +844,72 @@ class Client(ABC): :param transport_settings: Optional dictionary of transport level settings (HTTP headers, etc.) """ check_arrow() + self._add_integration_tag("arrow") full_table = table if '.' in table or not database else f'{database}.{table}' compression = self.write_compression if self.write_compression in ('zstd', 'lz4') else None column_names, insert_block = arrow_buffer(arrow_table, compression) return self.raw_insert(full_table, column_names, insert_block, settings, 'Arrow', transport_settings) + def insert_df_arrow(self, + table: str, + df: Union["pd.DataFrame", "pl.DataFrame"], + database: Optional[str] = None, + settings: Optional[Dict] = None, + transport_settings: Optional[Dict[str, str]] = None) -> QuerySummary: + """ + Insert a pandas DataFrame with PyArrow backend or a polars DataFrame into ClickHouse using Arrow format. + This method is optimized for DataFrames that already use Arrow format, providing + better performance than the standard insert_df method. + + Validation is performed and an exception will be raised if this requirement is not met. + Polars DataFrames are natively Arrow-based and don't require additional validation. + + :param table: ClickHouse table name + :param df: Pandas DataFrame with PyArrow dtype backend or Polars DataFrame + :param database: Optional ClickHouse database name + :param settings: Optional dictionary of ClickHouse settings (key/string values) + :param transport_settings: Optional dictionary of transport level settings (HTTP headers, etc.) + :return: QuerySummary with summary information, throws exception if insert fails + """ + check_arrow() + + if pd is not None and isinstance(df, pd.DataFrame): + df_lib = "pandas" + elif pl is not None and isinstance(df, pl.DataFrame): + df_lib = "polars" + else: + if pd is None and pl is None: + raise ImportError("A DataFrame library (pandas or polars) must be installed to use insert_df_arrow.") + raise TypeError(f"df must be either a pandas DataFrame or polars DataFrame, got {type(df).__name__}") + + if df_lib == "pandas": + if not IS_PANDAS_2: + raise ProgrammingError("PyArrow-backed dtypes are only supported when using pandas 2.x.") + + non_arrow_cols = [col for col, dtype in df.dtypes.items() if not isinstance(dtype, pd.ArrowDtype)] + if non_arrow_cols: + raise ProgrammingError( + f"insert_df_arrow requires all columns to use PyArrow dtypes. Non-Arrow columns found: [{', '.join(non_arrow_cols)}]. " + ) + try: + arrow_table = arrow.Table.from_pandas(df, preserve_index=False) + except Exception as e: + raise DataError(f"Failed to convert pandas DataFrame to Arrow table: {e}") from e + else: + try: + arrow_table = df.to_arrow() + except Exception as e: + raise DataError(f"Failed to convert polars DataFrame to Arrow table: {e}") from e + + self._add_integration_tag(df_lib) + return self.insert_arrow( + table=table, + arrow_table=arrow_table, + database=database, + settings=settings, + transport_settings=transport_settings, + ) + def create_insert_context(self, table: str, column_names: Optional[Union[str, Sequence[str]]] = None, @@ -823,6 +997,11 @@ class Client(ABC): return False return True + # pylint: disable=no-self-use + def _add_integration_tag(self, name: str) -> None: + """Transport hook to surface 3rd party lib integration info (default: no-op).""" + return + @abstractmethod def data_insert(self, context: InsertContext) -> QuerySummary: """ diff --git a/contrib/python/clickhouse-connect/clickhouse_connect/driver/common.py b/contrib/python/clickhouse-connect/clickhouse_connect/driver/common.py index 38bbd527839..76ccc7f24a0 100644 --- a/contrib/python/clickhouse-connect/clickhouse_connect/driver/common.py +++ b/contrib/python/clickhouse-connect/clickhouse_connect/driver/common.py @@ -2,12 +2,11 @@ import array import struct import sys -from typing import Sequence, MutableSequence, Dict, Optional, Union, Generator +from typing import Sequence, MutableSequence, Dict, Optional, Union, Generator, Callable from clickhouse_connect.driver.exceptions import ProgrammingError, StreamClosedError, DataError from clickhouse_connect.driver.types import Closable - # pylint: disable=invalid-name must_swap = sys.byteorder == 'big' int_size = array.array('i').itemsize @@ -218,3 +217,74 @@ class StreamContext: self._in_context = False self.source.close() self.gen = None + +# pylint: disable=too-many-return-statements +def get_rename_method(method: Optional[str]) -> Optional[Callable[[str], str]]: + def _to_camel(s: str) -> str: + if not s: + return "" + out, up = [], False + for ch in s: + if ch.isspace() or ch == "_": + up = True + elif up: + out.append(ch.upper()) + up = False + else: + out.append(ch) + return "".join(out) + + def _to_underscore(s: str) -> str: + if not s: + return "" + out, prev = [], 0 + for ch in s: + if ch.isspace(): + if prev == 0: + out.append("_") + prev = 1 + elif ch.isupper(): + if prev == 0: + out.append("_") + out.append(ch.lower()) + elif prev == 1: + out.append(ch.lower()) + else: + out.append(ch) + prev = 2 + else: + out.append(ch) + prev = 0 + return "".join(out)[1:] if out and out[0] == "_" else "".join(out) + + def _remove_prefix(s: str) -> str: + i = s.rfind(".") + return s[i + 1 :] if i >= 0 else s + + if not method: + return None + + name = method.strip().upper() + + if name == "NONE": + return None + if name == "REMOVE_PREFIX": + return _remove_prefix + if name == "TO_CAMELCASE": + return _to_camel + if name == "TO_CAMELCASE_WITHOUT_PREFIX": + return lambda s: _to_camel(_remove_prefix(s)) + if name == "TO_UNDERSCORE": + return _to_underscore + if name == "TO_UNDERSCORE_WITHOUT_PREFIX": + return lambda s: _to_underscore(_remove_prefix(s)) + + valid_options = [ + "none", + "remove_prefix", + "to_camelcase", + "to_camelcase_without_prefix", + "to_underscore", + "to_underscore_without_prefix", + ] + raise ValueError(f"Invalid option '{name}'. Expected one of {valid_options}") diff --git a/contrib/python/clickhouse-connect/clickhouse_connect/driver/context.py b/contrib/python/clickhouse-connect/clickhouse_connect/driver/context.py index 0da870bcb43..b5d4fbeab5f 100644 --- a/contrib/python/clickhouse-connect/clickhouse_connect/driver/context.py +++ b/contrib/python/clickhouse-connect/clickhouse_connect/driver/context.py @@ -1,6 +1,6 @@ import logging import re -from typing import Optional, Dict, Union, Any +from typing import Optional, Dict, Union, Any, Callable logger = logging.getLogger(__name__) @@ -44,6 +44,7 @@ class BaseQueryContext: self.use_extended_dtypes = use_extended_dtypes self._active_col_fmt = None self._active_col_type_fmts = _empty_map + self.column_renamer: Optional[Callable[[str], str]] = None def start_column(self, name: str): self.column_name = name diff --git a/contrib/python/clickhouse-connect/clickhouse_connect/driver/dataconv.py b/contrib/python/clickhouse-connect/clickhouse_connect/driver/dataconv.py index 5acc49830e8..41d188dad16 100644 --- a/contrib/python/clickhouse-connect/clickhouse_connect/driver/dataconv.py +++ b/contrib/python/clickhouse-connect/clickhouse_connect/driver/dataconv.py @@ -1,9 +1,11 @@ import array + from datetime import datetime, date, tzinfo from ipaddress import IPv4Address from typing import Sequence, Optional, Any from uuid import UUID, SafeUUID +from clickhouse_connect.driver import tzutil from clickhouse_connect.driver.common import int_size from clickhouse_connect.driver.errors import NONE_IN_NULLABLE_COLUMN from clickhouse_connect.driver.types import ByteSource @@ -29,7 +31,7 @@ def read_ipv4_col(source: ByteSource, num_rows: int): def read_datetime_col(source: ByteSource, num_rows: int, tz_info: Optional[tzinfo]): src_array = source.read_array('I', num_rows) if tz_info is None: - fts = datetime.utcfromtimestamp + fts = tzutil.utcfromtimestamp return [fts(ts) for ts in src_array] fts = datetime.fromtimestamp return [fts(ts, tz_info) for ts in src_array] diff --git a/contrib/python/clickhouse-connect/clickhouse_connect/driver/httpclient.py b/contrib/python/clickhouse-connect/clickhouse_connect/driver/httpclient.py index c055c639675..0f36b87c531 100644 --- a/contrib/python/clickhouse-connect/clickhouse_connect/driver/httpclient.py +++ b/contrib/python/clickhouse-connect/clickhouse_connect/driver/httpclient.py @@ -3,6 +3,8 @@ import json import logging import re import uuid +from importlib import import_module +from importlib.metadata import version as dist_version from base64 import b64encode from typing import Optional, Dict, Any, Sequence, Union, List, Callable, Generator, BinaryIO from urllib.parse import urlencode @@ -40,13 +42,15 @@ class HttpClient(Client): valid_transport_settings = {'database', 'buffer_size', 'session_id', 'compress', 'decompress', 'session_timeout', 'session_check', 'query_id', 'quota_key', - 'wait_end_of_query', 'client_protocol_version'} + 'wait_end_of_query', 'client_protocol_version', + 'role'} optional_transport_settings = {'send_progress_in_http_headers', 'http_headers_progress_interval_ms', 'enable_http_compression'} _owns_pool_manager = False - # pylint: disable=too-many-positional-arguments,too-many-arguments,too-many-locals,too-many-branches,too-many-statements,unused-argument + # R0917: too-many-positional-arguments + # pylint: disable=too-many-arguments,R0917,too-many-locals,too-many-branches,too-many-statements,unused-argument def __init__(self, interface: str, host: str, @@ -75,7 +79,9 @@ class HttpClient(Client): show_clickhouse_errors: Optional[bool] = None, autogenerate_session_id: Optional[bool] = None, tls_mode: Optional[str] = None, - proxy_path: str = ''): + proxy_path: str = '', + form_encode_query_params: bool = False, + rename_response_column: Optional[str] = None): """ Create an HTTP ClickHouse Connect client See clickhouse_connect.get_client for parameters @@ -85,6 +91,7 @@ class HttpClient(Client): proxy_path = '/' + proxy_path self.url = f'{interface}://{host}:{port}{proxy_path}' self.headers = {} + self.form_encode_query_params = form_encode_query_params self.params = dict_copy(HttpClient.params) ch_settings = dict_copy(settings, self.params) self.http = pool_mgr @@ -125,6 +132,7 @@ class HttpClient(Client): elif (not client_cert or tls_mode in ('strict', 'proxy')) and username: self.headers['Authorization'] = 'Basic ' + b64encode(f'{username}:{password}'.encode()).decode() + self._reported_libs = set() self.headers['User-Agent'] = common.build_client_name(client_name) self._read_format = self._write_format = 'Native' self._transform = NativeTransform() @@ -140,6 +148,7 @@ class HttpClient(Client): self._send_comp_setting = False self._progress_interval = None self._active_session = None + self._rename_response_column = rename_response_column # allow to override the global autogenerate_session_id setting via the constructor params _autogenerate_session_id = common.get_setting('autogenerate_session_id') \ @@ -211,18 +220,42 @@ class HttpClient(Client): if self.protocol_version: params['client_protocol_version'] = self.protocol_version context.block_info = True - params.update(context.bind_params) params.update(self._validate_settings(context.settings)) + context.rename_response_column = self._rename_response_column if not context.is_insert and columns_only_re.search(context.uncommented_query): - response = self._raw_request(f'{context.final_query}\n FORMAT JSON', - params, headers, retries=self.query_retries) + # Mirror normal query behavior for form encoding and external data + fmt_json_query = f'{context.final_query}\n FORMAT JSON' + if self.form_encode_query_params: + fields = {'query': fmt_json_query} + fields.update(context.bind_params) + if context.external_data: # Deal with form encoding + external data + params.update(context.external_data.query_params) + fields.update(context.external_data.form_data) + response = self._raw_request(bytes(), params, headers, retries=self.query_retries, fields=fields) + elif context.external_data: # Deal with external data without form encoding + fields = context.external_data.form_data + params.update(context.bind_params) + params.update(context.external_data.query_params) + params['query'] = fmt_json_query + response = self._raw_request(bytes(), params, headers, retries=self.query_retries, fields=fields) + else: # Legacy behavior (plain body, bind params in URL) + params.update(context.bind_params) + response = self._raw_request(fmt_json_query, + params, headers, retries=self.query_retries) json_result = json.loads(response.data) # ClickHouse will respond with a JSON object of meta, data, and some other objects # We just grab the column names and column types from the metadata sub object names: List[str] = [] types: List[ClickHouseType] = [] + renamer = context.column_renamer for col in json_result['meta']: - names.append(col['name']) + name = col['name'] + if renamer is not None: + try: + name = renamer(name) + except Exception as e: # pylint: disable=broad-exception-caught + logger.debug("Failed to rename col '%s'. Skipping rename. Error: %s", name, e) + names.append(name) types.append(registry.get_from_name(col['type'])) return QueryResult([], None, tuple(names), tuple(types)) @@ -231,12 +264,23 @@ class HttpClient(Client): if self._send_comp_setting: params['enable_http_compression'] = '1' final_query = self._prep_query(context) - if context.external_data: + fields = {} + # Setup additional query parameters and body + if self.form_encode_query_params: + body = bytes() + fields['query'] = final_query + fields.update(context.bind_params) + if context.external_data: + params.update(context.external_data.query_params) + fields.update(context.external_data.form_data) + elif context.external_data: + params.update(context.bind_params) body = bytes() params['query'] = final_query params.update(context.external_data.query_params) fields = context.external_data.form_data else: + params.update(context.bind_params) body = final_query fields = None headers['Content-Type'] = 'text/plain; charset=utf-8' @@ -380,25 +424,47 @@ class HttpClient(Client): return QuerySummary(self._summary(response)) def _error_handler(self, response: HTTPResponse, retried: bool = False) -> None: - if self.show_clickhouse_errors: + """ + Handles HTTP errors. Tries to be robust and provide maximum context. + """ + try: + body = "" + # Always try to read the response body for context. try: - err_content = get_response_data(response) + # get_response_data reads body and decodes it for the error message + raw_body = get_response_data(response) + body = common.format_error( + raw_body.decode(errors="backslashreplace") + ).strip() except Exception: # pylint: disable=broad-except - err_content = None - finally: - response.close() + # If we can't read or decode the body, we'll proceed without it + logger.warning("Failed to read error response body", exc_info=True) - err_str = f'HTTPDriver for {self.url} returned response code {response.status}' - err_code = response.headers.get(ex_header) - if err_code: - err_str = f'HTTPDriver for {self.url} received ClickHouse error code {err_code}' - if err_content: - err_msg = common.format_error(err_content.decode(errors='backslashreplace')) - if err_msg.startswith('Code'): - err_str = f'{err_str}\n {err_msg}' - else: - err_str = 'The ClickHouse server returned an error.' + # Build the error message + if self.show_clickhouse_errors: + err_code = response.headers.get(ex_header) + if err_code: + # Prioritize the specific ClickHouse exception code if it exists. + err_str = f"Received ClickHouse exception, code: {err_code}" + else: + # Otherwise, just use the generic HTTP status + err_str = f"HTTP driver received HTTP status {response.status}" + if body: + # Always append the body if it exists + err_str = f"{err_str}, server response: {body}" + else: + # Simple message for when detailed errors are disabled + err_str = "The ClickHouse server returned an error" + + # Add the URL for additional context + err_str = f"{err_str} (for url {self.url})" + + finally: + # Ensure closed response to prevent resource leaks + response.close() + + # Raise the appropriate exception class raise OperationalError(err_str) if retried else DatabaseError(err_str) from None def _raw_request(self, @@ -519,19 +585,75 @@ class HttpClient(Client): params = self._validate_settings(settings or {}) if use_database and self.database: params['database'] = self.database - params.update(bind_params) - if external_data: - if isinstance(final_query, bytes): - raise ProgrammingError('Cannot combine binary query data with `External Data`') + fields = {} + # Setup query body + if external_data and not self.form_encode_query_params and isinstance(final_query, bytes): + raise ProgrammingError("Binary query cannot be placed in URL when using External Data; enable form encoding.") + # Setup additional query parameters and body + if self.form_encode_query_params: + body = bytes() + fields['query'] = final_query + fields.update(bind_params) + if external_data: + params.update(external_data.query_params) + fields.update(external_data.form_data) + elif external_data: + params.update(bind_params) body = bytes() params['query'] = final_query params.update(external_data.query_params) fields = external_data.form_data else: + params.update(bind_params) body = final_query fields = None return body, params, fields + # pylint: disable=broad-exception-caught + def _add_integration_tag(self, name: str): + """ + Dynamically adds a product (like pandas or sqlalchemy) to the User-Agent string details section. + """ + if not common.get_setting("send_integration_tags") or name in self._reported_libs: + return + + try: + ver = "unknown" + try: + ver = dist_version(name) + except Exception: + try: + mod = import_module(name) + ver = getattr(mod, "__version__", "unknown") + except Exception: + pass + + product_info = f"{name}/{ver}" + + ua = self.headers.get("User-Agent", "") + start = ua.find("(") + if start == -1: + return + end = ua.find(")", start + 1) + if end == -1: + return + + details = ua[start + 1 : end].strip() + + if product_info in details: + self._reported_libs.add(name) + return + + new_details = f"{product_info}; {details}" if details else product_info + new_ua = f"{ua[: start + 1]}{new_details}{ua[end:]}" + self.headers["User-Agent"] = new_ua.strip() + + self._reported_libs.add(name) + logger.debug("Added '%s' to User-Agent", product_info) + + except Exception as e: + logger.debug("Problem adding '%s' to User-Agent: %s", name, e) + def ping(self): """ See BaseClient doc_string for this method diff --git a/contrib/python/clickhouse-connect/clickhouse_connect/driver/httputil.py b/contrib/python/clickhouse-connect/clickhouse_connect/driver/httputil.py index 37bb24e616e..c649b9a2fe9 100644 --- a/contrib/python/clickhouse-connect/clickhouse_connect/driver/httputil.py +++ b/contrib/python/clickhouse-connect/clickhouse_connect/driver/httputil.py @@ -1,5 +1,5 @@ import atexit -import http +import http.client import logging import multiprocessing import os diff --git a/contrib/python/clickhouse-connect/clickhouse_connect/driver/options.py b/contrib/python/clickhouse-connect/clickhouse_connect/driver/options.py index 4cec665c03a..c8a3834bc0b 100644 --- a/contrib/python/clickhouse-connect/clickhouse_connect/driver/options.py +++ b/contrib/python/clickhouse-connect/clickhouse_connect/driver/options.py @@ -2,6 +2,8 @@ from clickhouse_connect.driver.exceptions import NotSupportedError pd_time_test = None pd_extended_dtypes = False +PANDAS_VERSION = None +IS_PANDAS_2 = None try: import numpy as np @@ -10,6 +12,8 @@ except ImportError: try: import pandas as pd + PANDAS_VERSION = tuple(map(int, pd.__version__.split(".")[:2])) + IS_PANDAS_2 = PANDAS_VERSION >= (2, 0) pd_extended_dtypes = not pd.__version__.startswith('0') try: from pandas.core.dtypes.common import is_datetime64_dtype @@ -33,6 +37,10 @@ try: except ImportError: arrow = None +try: + import polars as pl +except ImportError: + pl = None def check_numpy(): if np: @@ -50,3 +58,9 @@ def check_arrow(): if arrow: return arrow raise NotSupportedError('PyArrow package is not installed') + + +def check_polars(): + if pl: + return pl + raise NotSupportedError("Polars package is not installed") diff --git a/contrib/python/clickhouse-connect/clickhouse_connect/driver/query.py b/contrib/python/clickhouse-connect/clickhouse_connect/driver/query.py index 6c764cfaea6..de490a5744a 100644 --- a/contrib/python/clickhouse-connect/clickhouse_connect/driver/query.py +++ b/contrib/python/clickhouse-connect/clickhouse_connect/driver/query.py @@ -10,7 +10,7 @@ from pytz.exceptions import UnknownTimeZoneError from clickhouse_connect.driver import tzutil from clickhouse_connect.driver.binding import bind_query -from clickhouse_connect.driver.common import dict_copy, empty_gen, StreamContext +from clickhouse_connect.driver.common import dict_copy, empty_gen, StreamContext, get_rename_method from clickhouse_connect.driver.external import ExternalData from clickhouse_connect.driver.types import Matrix, Closable from clickhouse_connect.driver.exceptions import StreamClosedError, ProgrammingError @@ -53,7 +53,8 @@ class QueryContext(BaseQueryContext): streaming: bool = False, apply_server_tz: bool = False, external_data: Optional[ExternalData] = None, - transport_settings: Optional[Dict[str, str]] = None): + transport_settings: Optional[Dict[str, str]] = None, + rename_response_column: Optional[str] = None): """ Initializes various configuration settings for the query context @@ -118,9 +119,20 @@ class QueryContext(BaseQueryContext): self.as_pandas = as_pandas self.use_pandas_na = as_pandas and pd_extended_dtypes self.streaming = streaming + self._rename_response_column: Optional[str] = rename_response_column + self.column_renamer = get_rename_method(rename_response_column) self._update_query() @property + def rename_response_column(self) -> Optional[str]: + return self._rename_response_column + + @rename_response_column.setter + def rename_response_column(self, method: Optional[str]): + self._rename_response_column = method + self.column_renamer = get_rename_method(method) + + @property def is_select(self) -> bool: return select_re.search(self.uncommented_query) is not None @@ -192,7 +204,8 @@ class QueryContext(BaseQueryContext): as_pandas: bool = False, streaming: bool = False, external_data: Optional[ExternalData] = None, - transport_settings: Optional[Dict[str, str]] = None) -> 'QueryContext': + transport_settings: Optional[Dict[str, str]] = None, + rename_response_column: Optional[str] = None) -> 'QueryContext': """ Creates Query context copy with parameters overridden/updated as appropriate. """ @@ -214,7 +227,8 @@ class QueryContext(BaseQueryContext): streaming, self.apply_server_tz, self.external_data if external_data is None else external_data, - self.transport_settings if transport_settings is None else transport_settings) + self.transport_settings if transport_settings is None else transport_settings, + self.rename_response_column if rename_response_column is None else rename_response_column) def _update_query(self): self.final_query, self.bind_params = bind_query(self.query, self.parameters, self.server_tz) diff --git a/contrib/python/clickhouse-connect/clickhouse_connect/driver/transform.py b/contrib/python/clickhouse-connect/clickhouse_connect/driver/transform.py index b5ae795c9ad..9edee9e10f2 100644 --- a/contrib/python/clickhouse-connect/clickhouse_connect/driver/transform.py +++ b/contrib/python/clickhouse-connect/clickhouse_connect/driver/transform.py @@ -22,6 +22,7 @@ class NativeTransform: names = [] col_types = [] block_num = 0 + renamer = context.column_renamer def get_block(): nonlocal block_num @@ -35,10 +36,11 @@ class NativeTransform: return None num_rows = source.read_leb128() for col_num in range(num_cols): - name = source.read_leb128_str() + orig_name = source.read_leb128_str() type_name = source.read_leb128_str() if block_num == 0: - names.append(name) + disp_name = renamer(orig_name) if renamer is not None else orig_name + names.append(disp_name) col_type = registry.get_from_name(type_name) col_types.append(col_type) else: @@ -46,7 +48,7 @@ class NativeTransform: if num_rows == 0: result_block.append(tuple()) else: - context.start_column(name) + context.start_column(orig_name) column = col_type.read_column(source, num_rows, context) result_block.append(column) except Exception as ex: diff --git a/contrib/python/clickhouse-connect/clickhouse_connect/driver/tzutil.py b/contrib/python/clickhouse-connect/clickhouse_connect/driver/tzutil.py index c27b51dd72c..d2157c14766 100644 --- a/contrib/python/clickhouse-connect/clickhouse_connect/driver/tzutil.py +++ b/contrib/python/clickhouse-connect/clickhouse_connect/driver/tzutil.py @@ -33,6 +33,9 @@ def normalize_timezone(timezone: pytz.timezone) -> Tuple[pytz.timezone, bool]: return timezone, False +def utcfromtimestamp(ts: float) -> datetime: + return datetime.fromtimestamp(ts, tz=pytz.UTC).replace(tzinfo=None) + try: local_tz = pytz.timezone(os.environ.get('TZ', '')) except pytz.UnknownTimeZoneError: diff --git a/contrib/python/clickhouse-connect/clickhouse_connect/driverc/dataconv.pyx b/contrib/python/clickhouse-connect/clickhouse_connect/driverc/dataconv.pyx index da3718d1163..713b3c24575 100644 --- a/contrib/python/clickhouse-connect/clickhouse_connect/driverc/dataconv.pyx +++ b/contrib/python/clickhouse-connect/clickhouse_connect/driverc/dataconv.pyx @@ -19,6 +19,7 @@ from uuid import UUID, SafeUUID from libc.string cimport memcpy from datetime import tzinfo +from clickhouse_connect.driver import tzutil from clickhouse_connect.driver.errors import NONE_IN_NULLABLE_COLUMN @cython.boundscheck(False) @@ -63,7 +64,7 @@ def read_datetime_col(ResponseBuffer buffer, unsigned long long num_rows, tzinfo cdef char * loc = buffer.read_bytes_c(4 * num_rows) cdef object column = PyTuple_New(num_rows), v if tzinfo is None: - fts = datetime.utcfromtimestamp + fts = tzutil.utcfromtimestamp while x < num_rows: v = fts((<unsigned int*>loc)[0]) PyTuple_SET_ITEM(column, x, v) diff --git a/contrib/python/clickhouse-connect/clickhouse_connect/tools/datagen.py b/contrib/python/clickhouse-connect/clickhouse_connect/tools/datagen.py index 490d8529166..f956d382a71 100644 --- a/contrib/python/clickhouse-connect/clickhouse_connect/tools/datagen.py +++ b/contrib/python/clickhouse-connect/clickhouse_connect/tools/datagen.py @@ -16,9 +16,10 @@ from clickhouse_connect.datatypes.registry import get_from_name from clickhouse_connect.datatypes.special import UUID from clickhouse_connect.datatypes.string import String, FixedString from clickhouse_connect.datatypes.temporal import Date, Date32, DateTime, DateTime64 +from clickhouse_connect.driver import tzutil from clickhouse_connect.driver.common import array_sizes -dt_from_ts = datetime.utcfromtimestamp +dt_from_ts = tzutil.utcfromtimestamp dt_from_ts_tz = datetime.fromtimestamp epoch_date = date(1970, 1, 1) date32_start_date = date(1925, 1, 1) @@ -181,7 +182,9 @@ def random_ipv6(): ip_int = (int(random() * 4294967296) << 96) | (int(random() * 4294967296)) | ( int(random() * 4294967296) << 32) | ( int(random() * 4294967296) << 64) return IPv6Address(ip_int) - return IPv4Address(int(random() * 2 ** 32)) + # Return mapped IPv4 as IPv6 + ipv4_int = int(random() * 2 ** 32) + return IPv6Address(f"::ffff:{IPv4Address(ipv4_int)}") def random_nested(keys: Sequence[str], types: Sequence[ClickHouseType], col_def: RandomValueDef): diff --git a/contrib/python/clickhouse-connect/ya.make b/contrib/python/clickhouse-connect/ya.make index cead3e1f6d9..7cc8b67f8a5 100644 --- a/contrib/python/clickhouse-connect/ya.make +++ b/contrib/python/clickhouse-connect/ya.make @@ -2,7 +2,7 @@ PY3_LIBRARY() -VERSION(0.8.18) +VERSION(0.9.2) LICENSE(Apache-2.0) @@ -40,6 +40,7 @@ PY_SRCS( clickhouse_connect/cc_sqlalchemy/dialect.py clickhouse_connect/cc_sqlalchemy/inspector.py clickhouse_connect/cc_sqlalchemy/sql/__init__.py + clickhouse_connect/cc_sqlalchemy/sql/compiler.py clickhouse_connect/cc_sqlalchemy/sql/ddlcompiler.py clickhouse_connect/cc_sqlalchemy/sql/preparer.py clickhouse_connect/common.py diff --git a/contrib/python/pytest-lazy-fixtures/.dist-info/METADATA b/contrib/python/pytest-lazy-fixtures/.dist-info/METADATA index 377892b4c57..9d32737ca3c 100644 --- a/contrib/python/pytest-lazy-fixtures/.dist-info/METADATA +++ b/contrib/python/pytest-lazy-fixtures/.dist-info/METADATA @@ -1,6 +1,6 @@ Metadata-Version: 2.4 Name: pytest-lazy-fixtures -Version: 1.3.4 +Version: 1.4.0 Summary: Allows you to use fixtures in @pytest.mark.parametrize. Project-URL: Homepage, https://github.com/dev-petrov/pytest-lazy-fixtures Project-URL: Repository, https://github.com/dev-petrov/pytest-lazy-fixtures @@ -88,7 +88,7 @@ def test_func(arg1, arg2): ``` And there is some useful wrapper called `lfc` (`lazy_fixture_callable`). -It can work with any callable and your fixtures, e.g. +It can work with any callable and your fixtures. ```python import pytest @@ -123,6 +123,12 @@ def entity_format(): return _entity_format +# Recommended: implicit injection from the callable's parameter names [email protected]("as_text", [lfc(lambda entity: str(entity))]) +def test_lfc_injection_basic(as_text): + assert as_text == "1" + +# Advanced: manual control over what to inject (explicit values into a callable) @pytest.mark.parametrize( "message", [ @@ -149,6 +155,68 @@ def test_lazy_fixture_callable_with_attr_lf(result): assert result == 3 ``` +### Injecting fixtures from a callable signature + +`lfc` automatically injects fixtures based on the callable's parameter names. If a parameter name matches a fixture name, that fixture will be resolved and passed into the callable. This implicit injection is the standard, recommended behavior. You can still override any parameter explicitly using `lf`, either positionally or by name. +Default values prevent implicit injection for that parameter. + +Examples: + +```python +import pytest +from pytest_lazy_fixtures import lf, lfc + +# Some fixtures used in the examples +def alpha(): + return 10 + +def beta(): + return 20 + +def gamma(): + return 30 + +# Simple implicit injection by parameter name [email protected]("result", [lfc(lambda alpha: alpha + 1)]) +def test_signature_injection_basic(result): + assert result == 11 + +# Override an implicitly matched parameter (positional or keyword) + "result", + [ + lfc(lambda alpha: alpha + 1, lf("beta")), # positional override + lfc(lambda alpha: alpha + 1, alpha=lf("beta")), # keyword override + ], +) +def test_signature_injection_override(result): + assert result == 21 + +# Mix implicit and explicit params + "pair", + [ + lfc(lambda alpha, beta: (alpha, beta), beta=lf("gamma")), # first implicit, second explicit + lfc(lambda alpha, beta: (alpha, beta), lf("gamma")), # first explicit, second implicit + ], +) +def test_signature_injection_mixed(pair): + assert pair in [(10, 30), (30, 20)] + +# 4) Defaults prevent implicit injection for that parameter [email protected]("val", [lfc(lambda alpha=999: alpha)]) +def test_signature_injection_defaults(val): + assert val == 999 +``` + +Notes: +- Implicit injection only happens for parameters without an explicit value and without a default. +- Explicit arguments passed to `lfc` (positional or keyword) always take precedence over implicit injection. +- The mapping of positional explicit arguments follows the callable's parameter order. + ## Contributing Contributions are very welcome. Tests can be run with `pytest`. diff --git a/contrib/python/pytest-lazy-fixtures/README.md b/contrib/python/pytest-lazy-fixtures/README.md index e77941bd699..1fba577e9d4 100644 --- a/contrib/python/pytest-lazy-fixtures/README.md +++ b/contrib/python/pytest-lazy-fixtures/README.md @@ -74,7 +74,7 @@ def test_func(arg1, arg2): ``` And there is some useful wrapper called `lfc` (`lazy_fixture_callable`). -It can work with any callable and your fixtures, e.g. +It can work with any callable and your fixtures. ```python import pytest @@ -109,6 +109,12 @@ def entity_format(): return _entity_format +# Recommended: implicit injection from the callable's parameter names [email protected]("as_text", [lfc(lambda entity: str(entity))]) +def test_lfc_injection_basic(as_text): + assert as_text == "1" + +# Advanced: manual control over what to inject (explicit values into a callable) @pytest.mark.parametrize( "message", [ @@ -135,6 +141,68 @@ def test_lazy_fixture_callable_with_attr_lf(result): assert result == 3 ``` +### Injecting fixtures from a callable signature + +`lfc` automatically injects fixtures based on the callable's parameter names. If a parameter name matches a fixture name, that fixture will be resolved and passed into the callable. This implicit injection is the standard, recommended behavior. You can still override any parameter explicitly using `lf`, either positionally or by name. +Default values prevent implicit injection for that parameter. + +Examples: + +```python +import pytest +from pytest_lazy_fixtures import lf, lfc + +# Some fixtures used in the examples +def alpha(): + return 10 + +def beta(): + return 20 + +def gamma(): + return 30 + +# Simple implicit injection by parameter name [email protected]("result", [lfc(lambda alpha: alpha + 1)]) +def test_signature_injection_basic(result): + assert result == 11 + +# Override an implicitly matched parameter (positional or keyword) + "result", + [ + lfc(lambda alpha: alpha + 1, lf("beta")), # positional override + lfc(lambda alpha: alpha + 1, alpha=lf("beta")), # keyword override + ], +) +def test_signature_injection_override(result): + assert result == 21 + +# Mix implicit and explicit params + "pair", + [ + lfc(lambda alpha, beta: (alpha, beta), beta=lf("gamma")), # first implicit, second explicit + lfc(lambda alpha, beta: (alpha, beta), lf("gamma")), # first explicit, second implicit + ], +) +def test_signature_injection_mixed(pair): + assert pair in [(10, 30), (30, 20)] + +# 4) Defaults prevent implicit injection for that parameter [email protected]("val", [lfc(lambda alpha=999: alpha)]) +def test_signature_injection_defaults(val): + assert val == 999 +``` + +Notes: +- Implicit injection only happens for parameters without an explicit value and without a default. +- Explicit arguments passed to `lfc` (positional or keyword) always take precedence over implicit injection. +- The mapping of positional explicit arguments follows the callable's parameter order. + ## Contributing Contributions are very welcome. Tests can be run with `pytest`. diff --git a/contrib/python/pytest-lazy-fixtures/pytest_lazy_fixtures/lazy_fixture_callable.py b/contrib/python/pytest-lazy-fixtures/pytest_lazy_fixtures/lazy_fixture_callable.py index 4e371763f7d..b073f4e1d3a 100644 --- a/contrib/python/pytest-lazy-fixtures/pytest_lazy_fixtures/lazy_fixture_callable.py +++ b/contrib/python/pytest-lazy-fixtures/pytest_lazy_fixtures/lazy_fixture_callable.py @@ -1,5 +1,6 @@ from __future__ import annotations +import inspect from inspect import isfunction from typing import TYPE_CHECKING, Callable @@ -9,22 +10,56 @@ if TYPE_CHECKING: import pytest +def _fill_unbound_params( + func: Callable[..., object], args: tuple[object, ...], kwargs: dict[str, object] +) -> dict[str, object]: + """Analyze a callable's signature and bind lazy fixtures to unbound parameters. + + This function takes a callable and its provided arguments, examines the callable's + signature, and returns a dictionary that maps any unbound parameter names to their + corresponding lazy fixtures. + + Only parameters that are not already bound through the provided args/kwargs will + get mapped to lazy fixtures. Any parameters that have existing values provided + maintain those values. + """ + + try: + sig = inspect.signature(func) + except (ValueError, TypeError): + # Cowardly refuse to figure out the missing params + return {} + + bound = sig.bind_partial(*args, **kwargs) + + # Apply the defaults arguments, as we don't want to override them. + bound.apply_defaults() + + unbound = [name for name in sig.parameters if name not in bound.arguments] + + return {unbound_param: LazyFixtureWrapper(unbound_param) for unbound_param in unbound} + + class LazyFixtureCallableWrapper(LazyFixtureWrapper): _func: Callable | None args: tuple kwargs: dict def __init__(self, callable_or_name: Callable | str, *args, **kwargs): + self.args = args + self.kwargs = kwargs + if callable(callable_or_name): self._func = callable_or_name self.name = ( callable_or_name.__name__ if isfunction(callable_or_name) else callable_or_name.__class__.__name__ ) + # If we have a direct callable, analyze its signature and pre-fill + # any unbound parameters with lf(param_name). + self.kwargs.update(_fill_unbound_params(self._func, self.args, self.kwargs)) else: self.name = callable_or_name self._func = None - self.args = args - self.kwargs = kwargs def get_func(self, request: pytest.FixtureRequest) -> Callable: func = self._func diff --git a/contrib/python/pytest-lazy-fixtures/ya.make b/contrib/python/pytest-lazy-fixtures/ya.make index e8efb1b06d8..909297765fb 100644 --- a/contrib/python/pytest-lazy-fixtures/ya.make +++ b/contrib/python/pytest-lazy-fixtures/ya.make @@ -2,7 +2,7 @@ PY3_LIBRARY() -VERSION(1.3.4) +VERSION(1.4.0) LICENSE(MIT) diff --git a/contrib/python/pytest-mock/py3/.dist-info/METADATA b/contrib/python/pytest-mock/py3/.dist-info/METADATA index 5b6926488ea..fea492d9e38 100644 --- a/contrib/python/pytest-mock/py3/.dist-info/METADATA +++ b/contrib/python/pytest-mock/py3/.dist-info/METADATA @@ -1,6 +1,6 @@ Metadata-Version: 2.4 Name: pytest-mock -Version: 3.15.0 +Version: 3.15.1 Summary: Thin-wrapper around the mock package for easier use with pytest Author-email: Bruno Oliveira <[email protected]> License: MIT diff --git a/contrib/python/pytest-mock/py3/pytest_mock/_util.py b/contrib/python/pytest-mock/py3/pytest_mock/_util.py index ad830caeefa..d3a732ac278 100644 --- a/contrib/python/pytest-mock/py3/pytest_mock/_util.py +++ b/contrib/python/pytest-mock/py3/pytest_mock/_util.py @@ -15,7 +15,7 @@ def get_mock_module(config): config.getini("mock_use_standalone_module") ) if use_standalone_module: - from unittest import mock + import mock _mock_module = mock else: diff --git a/contrib/python/pytest-mock/py3/pytest_mock/_version.py b/contrib/python/pytest-mock/py3/pytest_mock/_version.py index 7b8272a4140..7b769ace60a 100644 --- a/contrib/python/pytest-mock/py3/pytest_mock/_version.py +++ b/contrib/python/pytest-mock/py3/pytest_mock/_version.py @@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE commit_id: COMMIT_ID __commit_id__: COMMIT_ID -__version__ = version = '3.15.0' -__version_tuple__ = version_tuple = (3, 15, 0) +__version__ = version = '3.15.1' +__version_tuple__ = version_tuple = (3, 15, 1) __commit_id__ = commit_id = None diff --git a/contrib/python/pytest-mock/py3/pytest_mock/plugin.py b/contrib/python/pytest-mock/py3/pytest_mock/plugin.py index f4dbfc3ec6e..ef996121333 100644 --- a/contrib/python/pytest-mock/py3/pytest_mock/plugin.py +++ b/contrib/python/pytest-mock/py3/pytest_mock/plugin.py @@ -157,13 +157,16 @@ class MockerFixture: """ self._mock_cache.remove(mock) - def spy(self, obj: object, name: str) -> MockType: + def spy( + self, obj: object, name: str, duplicate_iterators: bool = False + ) -> MockType: """ Create a spy of method. It will run method normally, but it is now possible to use `mock` call features with it, like call count. :param obj: An object. :param name: A method in object. + :param duplicate_iterators: Whether to keep a copy of the returned iterator in `spy_return_iter`. :return: Spy object. """ method = getattr(obj, name) @@ -177,7 +180,7 @@ class MockerFixture: spy_obj.spy_exception = e raise else: - if isinstance(r, Iterator): + if duplicate_iterators and isinstance(r, Iterator): r, duplicated_iterator = itertools.tee(r, 2) spy_obj.spy_return_iter = duplicated_iterator else: diff --git a/contrib/python/pytest-mock/py3/tests/test_pytest_mock.py b/contrib/python/pytest-mock/py3/tests/test_pytest_mock.py index 499c0a2b99d..f484705870f 100644 --- a/contrib/python/pytest-mock/py3/tests/test_pytest_mock.py +++ b/contrib/python/pytest-mock/py3/tests/test_pytest_mock.py @@ -541,13 +541,15 @@ def test_callable_like_spy(testdir: Any, mocker: MockerFixture) -> None: @pytest.mark.parametrize("iterator", [(i for i in range(3)), iter([0, 1, 2])]) -def test_spy_return_iter(mocker: MockerFixture, iterator: Iterator[int]) -> None: +def test_spy_return_iter_duplicates_iterator_when_enabled( + mocker: MockerFixture, iterator: Iterator[int] +) -> None: class Foo: def bar(self) -> Iterator[int]: return iterator foo = Foo() - spy = mocker.spy(foo, "bar") + spy = mocker.spy(foo, "bar", duplicate_iterators=True) result = list(foo.bar()) assert result == [0, 1, 2] @@ -559,8 +561,27 @@ def test_spy_return_iter(mocker: MockerFixture, iterator: Iterator[int]) -> None assert isinstance(return_value, Iterator) [email protected]("iterator", [(i for i in range(3)), iter([0, 1, 2])]) +def test_spy_return_iter_is_not_set_when_disabled( + mocker: MockerFixture, iterator: Iterator[int] +) -> None: + class Foo: + def bar(self) -> Iterator[int]: + return iterator + + foo = Foo() + spy = mocker.spy(foo, "bar", duplicate_iterators=False) + result = list(foo.bar()) + + assert result == [0, 1, 2] + assert spy.spy_return is not None + assert spy.spy_return_iter is None + [return_value] = spy.spy_return_list + assert isinstance(return_value, Iterator) + + @pytest.mark.parametrize("iterable", [(0, 1, 2), [0, 1, 2], range(3)]) -def test_spy_return_iter_ignore_plain_iterable( +def test_spy_return_iter_ignores_plain_iterable( mocker: MockerFixture, iterable: Iterable[int] ) -> None: class Foo: @@ -568,7 +589,7 @@ def test_spy_return_iter_ignore_plain_iterable( return iterable foo = Foo() - spy = mocker.spy(foo, "bar") + spy = mocker.spy(foo, "bar", duplicate_iterators=True) result = foo.bar() assert result == iterable @@ -588,7 +609,7 @@ def test_spy_return_iter_resets(mocker: MockerFixture) -> None: return self.iterables.pop(0) foo = Foo() - spy = mocker.spy(foo, "bar") + spy = mocker.spy(foo, "bar", duplicate_iterators=True) result_iterator = list(foo.bar()) assert result_iterator == [0, 1, 2] @@ -644,7 +665,7 @@ def assert_argument_introspection(left: Any, right: Any) -> Generator[None, None expected = "\n ".join(util._compare_eq_iterable(left, right, verbose)) # type:ignore[arg-type] else: expected = "\n ".join( - util._compare_eq_iterable(left, right, lambda t, *_: t, verbose) + util._compare_eq_iterable(left, right, lambda t, *_, **__: t, verbose) # type:ignore[arg-type] ) assert expected in str(e) else: diff --git a/contrib/python/pytest-mock/py3/ya.make b/contrib/python/pytest-mock/py3/ya.make index ec2d0bd0536..42088e49795 100644 --- a/contrib/python/pytest-mock/py3/ya.make +++ b/contrib/python/pytest-mock/py3/ya.make @@ -2,7 +2,7 @@ PY3_LIBRARY() -VERSION(3.15.0) +VERSION(3.15.1) LICENSE(MIT) |
