author     chertus <azuikov@ydb.tech>  2022-08-25 16:08:44 +0300
committer  chertus <azuikov@ydb.tech>  2022-08-25 16:08:44 +0300
commit     ea542b16aeaff26273efaba14569b1d35a24a0ee (patch)
tree       da30600f503c4f4d08f9b2194f1b4f1438a30fd0
parent     a9cf4253d01d3c7ef62eeb9e798db23942ef1bcd (diff)
download   ydb-ea542b16aeaff26273efaba14569b1d35a24a0ee.tar.gz
ClickHouse aggregate functions library over Apache Arrow primitives
-rw-r--r--  CMakeLists.darwin.txt  5
-rw-r--r--  CMakeLists.linux.txt  5
-rw-r--r--  ydb/library/arrow_clickhouse/AggregateFunctions/AggregateFunctionAvg.h  198
-rw-r--r--  ydb/library/arrow_clickhouse/AggregateFunctions/AggregateFunctionCount.h  93
-rw-r--r--  ydb/library/arrow_clickhouse/AggregateFunctions/AggregateFunctionMinMaxAny.h  682
-rw-r--r--  ydb/library/arrow_clickhouse/AggregateFunctions/AggregateFunctionSum.h  302
-rw-r--r--  ydb/library/arrow_clickhouse/AggregateFunctions/IAggregateFunction.h  495
-rw-r--r--  ydb/library/arrow_clickhouse/AggregationCommon.h  337
-rw-r--r--  ydb/library/arrow_clickhouse/Aggregator.cpp  1554
-rw-r--r--  ydb/library/arrow_clickhouse/Aggregator.h  923
-rw-r--r--  ydb/library/arrow_clickhouse/CMakeLists.txt  29
-rw-r--r--  ydb/library/arrow_clickhouse/Columns/CMakeLists.txt  23
-rw-r--r--  ydb/library/arrow_clickhouse/Columns/ColumnAggregateFunction.cpp  31
-rw-r--r--  ydb/library/arrow_clickhouse/Columns/ColumnAggregateFunction.h  150
-rw-r--r--  ydb/library/arrow_clickhouse/Columns/ColumnsCommon.cpp  530
-rw-r--r--  ydb/library/arrow_clickhouse/Columns/ColumnsCommon.h  120
-rw-r--r--  ydb/library/arrow_clickhouse/Columns/ColumnsHashing.h  293
-rw-r--r--  ydb/library/arrow_clickhouse/Columns/ColumnsHashingImpl.h  375
-rw-r--r--  ydb/library/arrow_clickhouse/Common/Allocator.cpp  35
-rw-r--r--  ydb/library/arrow_clickhouse/Common/Allocator.h  344
-rw-r--r--  ydb/library/arrow_clickhouse/Common/Allocator_fwd.h  19
-rw-r--r--  ydb/library/arrow_clickhouse/Common/Arena.h  311
-rw-r--r--  ydb/library/arrow_clickhouse/Common/CMakeLists.txt  23
-rw-r--r--  ydb/library/arrow_clickhouse/Common/HashTable/FixedHashMap.h  186
-rw-r--r--  ydb/library/arrow_clickhouse/Common/HashTable/FixedHashTable.h  497
-rw-r--r--  ydb/library/arrow_clickhouse/Common/HashTable/Hash.h  417
-rw-r--r--  ydb/library/arrow_clickhouse/Common/HashTable/HashMap.h  314
-rw-r--r--  ydb/library/arrow_clickhouse/Common/HashTable/HashSet.h  124
-rw-r--r--  ydb/library/arrow_clickhouse/Common/HashTable/HashTable.h  1311
-rw-r--r--  ydb/library/arrow_clickhouse/Common/HashTable/HashTableAllocator.h  22
-rw-r--r--  ydb/library/arrow_clickhouse/Common/HashTable/HashTableKeyHolder.h  127
-rw-r--r--  ydb/library/arrow_clickhouse/Common/HashTable/StringHashMap.h  198
-rw-r--r--  ydb/library/arrow_clickhouse/Common/HashTable/StringHashTable.h  444
-rw-r--r--  ydb/library/arrow_clickhouse/Common/PODArray.cpp  23
-rw-r--r--  ydb/library/arrow_clickhouse/Common/PODArray.h  803
-rw-r--r--  ydb/library/arrow_clickhouse/Common/PODArray_fwd.h  40
-rw-r--r--  ydb/library/arrow_clickhouse/Common/SipHash.h  219
-rw-r--r--  ydb/library/arrow_clickhouse/Common/memcpySmall.h  78
-rw-r--r--  ydb/library/arrow_clickhouse/DataStreams/AggregatingBlockInputStream.cpp  127
-rw-r--r--  ydb/library/arrow_clickhouse/DataStreams/AggregatingBlockInputStream.h  49
-rw-r--r--  ydb/library/arrow_clickhouse/DataStreams/CMakeLists.txt  24
-rw-r--r--  ydb/library/arrow_clickhouse/DataStreams/IBlockInputStream.cpp  90
-rw-r--r--  ydb/library/arrow_clickhouse/DataStreams/IBlockInputStream.h  118
-rw-r--r--  ydb/library/arrow_clickhouse/DataStreams/IBlockStream_fwd.h  21
-rw-r--r--  ydb/library/arrow_clickhouse/DataStreams/MergingAggregatedBlockInputStream.cpp  43
-rw-r--r--  ydb/library/arrow_clickhouse/DataStreams/MergingAggregatedBlockInputStream.h  42
-rw-r--r--  ydb/library/arrow_clickhouse/DataStreams/OneBlockInputStream.h  49
-rw-r--r--  ydb/library/arrow_clickhouse/README.md  16
-rw-r--r--  ydb/library/arrow_clickhouse/arrow_clickhouse_types.h  146
-rw-r--r--  ydb/library/arrow_clickhouse/ut/CMakeLists.darwin.txt  41
-rw-r--r--  ydb/library/arrow_clickhouse/ut/CMakeLists.linux.txt  47
-rw-r--r--  ydb/library/arrow_clickhouse/ut/CMakeLists.txt  13
-rw-r--r--  ydb/library/arrow_clickhouse/ut_aggregator.cpp  365
53 files changed, 12871 insertions, 0 deletions
diff --git a/CMakeLists.darwin.txt b/CMakeLists.darwin.txt
index c55bc4077e2..a2f25521c2f 100644
--- a/CMakeLists.darwin.txt
+++ b/CMakeLists.darwin.txt
@@ -1043,6 +1043,10 @@ add_subdirectory(ydb/core/filestore)
add_subdirectory(ydb/core/grpc_caching)
add_subdirectory(ydb/core/pgproxy)
add_subdirectory(ydb/core/yql_testlib)
+add_subdirectory(ydb/library/arrow_clickhouse)
+add_subdirectory(ydb/library/arrow_clickhouse/Common)
+add_subdirectory(ydb/library/arrow_clickhouse/Columns)
+add_subdirectory(ydb/library/arrow_clickhouse/DataStreams)
add_subdirectory(ydb/core/actorlib_impl/ut)
add_subdirectory(library/cpp/testing/unittest_main)
add_subdirectory(library/cpp/terminate_handler)
@@ -1109,6 +1113,7 @@ add_subdirectory(ydb/core/wrappers/ut)
add_subdirectory(ydb/core/ydb_convert/ut)
add_subdirectory(ydb/core/ymq/ut)
add_subdirectory(ydb/library/aclib/ut)
+add_subdirectory(ydb/library/arrow_clickhouse/ut)
add_subdirectory(ydb/library/backup/ut)
add_subdirectory(ydb/library/binary_json/ut)
add_subdirectory(ydb/library/dynumber/ut)
diff --git a/CMakeLists.linux.txt b/CMakeLists.linux.txt
index 325b788019c..a323ecbd724 100644
--- a/CMakeLists.linux.txt
+++ b/CMakeLists.linux.txt
@@ -1047,6 +1047,10 @@ add_subdirectory(ydb/core/filestore)
add_subdirectory(ydb/core/grpc_caching)
add_subdirectory(ydb/core/pgproxy)
add_subdirectory(ydb/core/yql_testlib)
+add_subdirectory(ydb/library/arrow_clickhouse)
+add_subdirectory(ydb/library/arrow_clickhouse/Common)
+add_subdirectory(ydb/library/arrow_clickhouse/Columns)
+add_subdirectory(ydb/library/arrow_clickhouse/DataStreams)
add_subdirectory(ydb/core/actorlib_impl/ut)
add_subdirectory(library/cpp/testing/unittest_main)
add_subdirectory(library/cpp/terminate_handler)
@@ -1113,6 +1117,7 @@ add_subdirectory(ydb/core/wrappers/ut)
add_subdirectory(ydb/core/ydb_convert/ut)
add_subdirectory(ydb/core/ymq/ut)
add_subdirectory(ydb/library/aclib/ut)
+add_subdirectory(ydb/library/arrow_clickhouse/ut)
add_subdirectory(ydb/library/backup/ut)
add_subdirectory(ydb/library/binary_json/ut)
add_subdirectory(ydb/library/dynumber/ut)
diff --git a/ydb/library/arrow_clickhouse/AggregateFunctions/AggregateFunctionAvg.h b/ydb/library/arrow_clickhouse/AggregateFunctions/AggregateFunctionAvg.h
new file mode 100644
index 00000000000..3e8bce5fdbb
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/AggregateFunctions/AggregateFunctionAvg.h
@@ -0,0 +1,198 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#pragma once
+#include "arrow_clickhouse_types.h"
+
+#include <type_traits>
+
+#include <AggregateFunctions/IAggregateFunction.h>
+#include <AggregateFunctions/AggregateFunctionSum.h>
+
+namespace CH
+{
+
+/**
+ * Helper class to encapsulate values conversion for avg and avgWeighted.
+ */
+template <typename Numerator, typename Denominator>
+struct AvgFraction
+{
+ Numerator numerator{0};
+ Denominator denominator{0};
+
+ double divide() const
+ {
+ return static_cast<double>(numerator) / denominator;
+ }
+};
+
+
+/**
+ * @tparam Derived When deriving from this class, use the child class name as in CRTP, e.g.
+ * class Self : Agg<char, bool, bool, Self>.
+ */
+template <typename TNumerator, typename TDenominator, typename Derived>
+class AggregateFunctionAvgBase : public
+ IAggregateFunctionDataHelper<AvgFraction<TNumerator, TDenominator>, Derived>
+{
+public:
+ using Base = IAggregateFunctionDataHelper<AvgFraction<TNumerator, TDenominator>, Derived>;
+ using Numerator = TNumerator;
+ using Denominator = TDenominator;
+ using Fraction = AvgFraction<Numerator, Denominator>;
+
+ explicit AggregateFunctionAvgBase(const DataTypes & argument_types_,
+ UInt32 num_scale_ = 0, UInt32 denom_scale_ = 0)
+ : Base(argument_types_, {}), num_scale(num_scale_), denom_scale(denom_scale_) {}
+
+ DataTypePtr getReturnType() const override
+ {
+ return std::make_shared<arrow::DoubleType>();
+ }
+
+ bool allocatesMemoryInArena() const override { return false; }
+
+ void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
+ {
+ this->data(place).numerator += this->data(rhs).numerator;
+ this->data(place).denominator += this->data(rhs).denominator;
+ }
+#if 0
+ void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
+ {
+ writeBinary(this->data(place).numerator, buf);
+
+ if constexpr (std::is_unsigned_v<Denominator>)
+ writeVarUInt(this->data(place).denominator, buf);
+ else /// Floating point denominator type can be used
+ writeBinary(this->data(place).denominator, buf);
+ }
+
+ void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
+ {
+ readBinary(this->data(place).numerator, buf);
+
+ if constexpr (std::is_unsigned_v<Denominator>)
+ readVarUInt(this->data(place).denominator, buf);
+ else /// Floating point denominator type can be used
+ readBinary(this->data(place).denominator, buf);
+ }
+#endif
+ void insertResultInto(AggregateDataPtr __restrict place, MutableColumn & to, Arena *) const override
+ {
+ assert_cast<MutableColumnFloat64 &>(to).Append(this->data(place).divide()).ok();
+ }
+
+private:
+ UInt32 num_scale;
+ UInt32 denom_scale;
+};
+
+template <typename T>
+using AvgFieldType = std::conditional_t<std::is_floating_point_v<T>, T, UInt64>;
+
+template <typename T>
+class AggregateFunctionAvg : public AggregateFunctionAvgBase<AvgFieldType<T>, UInt64, AggregateFunctionAvg<T>>
+{
+public:
+ using Base = AggregateFunctionAvgBase<AvgFieldType<T>, UInt64, AggregateFunctionAvg<T>>;
+ using Base::Base;
+
+ using Numerator = typename Base::Numerator;
+ using Denominator = typename Base::Denominator;
+ using Fraction = typename Base::Fraction;
+
+ using ColumnType = arrow::NumericArray<T>;
+ using MutableColumnType = arrow::NumericBuilder<T>;
+
+ void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const final
+ {
+ increment(place, static_cast<const ColumnType &>(*columns[0]).Value(row_num));
+ ++this->data(place).denominator;
+ }
+
+ void addBatchSinglePlace(
+ size_t row_begin,
+ size_t row_end,
+ AggregateDataPtr __restrict place,
+ const IColumn ** columns,
+ Arena *,
+ ssize_t if_argument_pos) const final
+ {
+ AggregateFunctionSumData<Numerator> sum_data;
+ const auto & column = assert_cast<const ColumnType &>(*columns[0]);
+ if (if_argument_pos >= 0)
+ {
+ const auto & flags = assert_cast<const ColumnUInt8 &>(*columns[if_argument_pos]).raw_values();
+ sum_data.addManyConditional(column.raw_values(), flags, row_begin, row_end);
+ this->data(place).denominator += countBytesInFilter(flags, row_begin, row_end);
+ }
+ else
+ {
+ sum_data.addMany(column.raw_values(), row_begin, row_end);
+ this->data(place).denominator += (row_end - row_begin);
+ }
+ increment(place, sum_data.sum);
+ }
+
+private:
+ void increment(AggregateDataPtr __restrict place, Numerator inc) const
+ {
+ this->data(place).numerator += inc;
+ }
+};
+
+class WrappedAvg final : public ArrowAggregateFunctionWrapper
+{
+public:
+ WrappedAvg(std::string name)
+ : ArrowAggregateFunctionWrapper(std::move(name))
+ {}
+
+ AggregateFunctionPtr getHouseFunction(const DataTypes & argument_types) override
+ {
+ return createWithSameType<AggregateFunctionAvg>(argument_types);
+ }
+
+ template <template <typename> typename AggFunc>
+ std::shared_ptr<IAggregateFunction> createWithSameType(const DataTypes & argument_types)
+ {
+ if (argument_types.size() != 1)
+ return {};
+
+ const DataTypePtr & type = argument_types[0];
+
+ switch (type->id()) {
+ case arrow::Type::INT8:
+ return std::make_shared<AggFunc<arrow::Int8Type>>(argument_types);
+ case arrow::Type::INT16:
+ return std::make_shared<AggFunc<arrow::Int16Type>>(argument_types);
+ case arrow::Type::INT32:
+ return std::make_shared<AggFunc<arrow::Int32Type>>(argument_types);
+ case arrow::Type::INT64:
+ return std::make_shared<AggFunc<arrow::Int64Type>>(argument_types);
+ case arrow::Type::UINT8:
+ return std::make_shared<AggFunc<arrow::UInt8Type>>(argument_types);
+ case arrow::Type::UINT16:
+ return std::make_shared<AggFunc<arrow::UInt16Type>>(argument_types);
+ case arrow::Type::UINT32:
+ return std::make_shared<AggFunc<arrow::UInt32Type>>(argument_types);
+ case arrow::Type::UINT64:
+ return std::make_shared<AggFunc<arrow::UInt64Type>>(argument_types);
+ case arrow::Type::FLOAT:
+ return std::make_shared<AggFunc<arrow::FloatType>>(argument_types);
+ case arrow::Type::DOUBLE:
+ return std::make_shared<AggFunc<arrow::DoubleType>>(argument_types);
+ case arrow::Type::DURATION:
+ return std::make_shared<AggFunc<arrow::DurationType>>(argument_types);
+ default:
+ break;
+ }
+
+ return {};
+ }
+};
+
+}
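
The WrappedAvg::createWithSameType switch above only selects a typed AggregateFunctionAvg per Arrow type id; the per-group state is just the AvgFraction pair. A minimal standalone sketch of that state (standard library only, not the library's actual API): accumulate the numerator in a wide type, count rows in the denominator, and divide only when the result is materialized.

    #include <cstdint>
    #include <initializer_list>
    #include <iostream>

    // Sketch of the AvgFraction idea: accumulate, then divide on finalize.
    template <typename Numerator, typename Denominator>
    struct AvgState
    {
        Numerator numerator{0};
        Denominator denominator{0};

        void add(Numerator value)
        {
            numerator += value;
            ++denominator;
        }

        double finalize() const
        {
            return static_cast<double>(numerator) / denominator;
        }
    };

    int main()
    {
        AvgState<uint64_t, uint64_t> avg;
        for (uint64_t value : {1, 2, 3, 4})
            avg.add(value);
        std::cout << avg.finalize() << "\n"; // prints 2.5
    }
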
diff --git a/ydb/library/arrow_clickhouse/AggregateFunctions/AggregateFunctionCount.h b/ydb/library/arrow_clickhouse/AggregateFunctions/AggregateFunctionCount.h
new file mode 100644
index 00000000000..6044df828e6
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/AggregateFunctions/AggregateFunctionCount.h
@@ -0,0 +1,93 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#pragma once
+#include "arrow_clickhouse_types.h"
+
+#include <AggregateFunctions/IAggregateFunction.h>
+#include <Columns/ColumnsCommon.h>
+
+#include <array>
+
+namespace CH
+{
+
+
+struct AggregateFunctionCountData
+{
+ UInt64 count = 0;
+};
+
+
+/// Simply count number of calls.
+class AggregateFunctionCount final : public IAggregateFunctionDataHelper<AggregateFunctionCountData, AggregateFunctionCount>
+{
+public:
+ AggregateFunctionCount(const DataTypes & argument_types_)
+ : IAggregateFunctionDataHelper(argument_types_, {})
+ {}
+
+ DataTypePtr getReturnType() const override
+ {
+ return std::make_shared<DataTypeUInt64>();
+ }
+
+ bool allocatesMemoryInArena() const override { return false; }
+
+ void add(AggregateDataPtr __restrict place, const IColumn **, size_t, Arena *) const override
+ {
+ ++data(place).count;
+ }
+
+ void addBatchSinglePlace(
+ size_t row_begin,
+ size_t row_end,
+ AggregateDataPtr __restrict place,
+ const IColumn ** columns,
+ Arena *,
+ ssize_t if_argument_pos) const override
+ {
+ if (if_argument_pos >= 0)
+ {
+ const auto & filter_column = assert_cast<const ColumnUInt8 &>(*columns[if_argument_pos]);
+ const auto & flags = filter_column.raw_values();
+ data(place).count += countBytesInFilter(flags, row_begin, row_end);
+ }
+ else
+ {
+ data(place).count += row_end - row_begin;
+ }
+ }
+
+ void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
+ {
+ data(place).count += data(rhs).count;
+ }
+
+ void insertResultInto(AggregateDataPtr __restrict place, MutableColumn & to, Arena *) const override
+ {
+ assert_cast<MutableColumnUInt64 &>(to).Append(data(place).count).ok();
+ }
+
+ /// Reset the state to specified value. This function is not the part of common interface.
+ void set(AggregateDataPtr __restrict place, UInt64 new_count) const
+ {
+ data(place).count = new_count;
+ }
+};
+
+class WrappedCount final : public ArrowAggregateFunctionWrapper
+{
+public:
+ WrappedCount(std::string name)
+ : ArrowAggregateFunctionWrapper(std::move(name))
+ {}
+
+ AggregateFunctionPtr getHouseFunction(const DataTypes & argument_types) override
+ {
+ return std::make_shared<AggregateFunctionCount>(argument_types);
+ }
+};
+
+}
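
With an "if" argument, count() above adds the number of non-zero bytes in the UInt8 filter column (countBytesInFilter); without one it simply adds the row count. A standalone sketch of the filtered path, independent of the library:

    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Count rows whose filter flag is non-zero, mirroring the if-branch of
    // AggregateFunctionCount::addBatchSinglePlace.
    static size_t countNonZero(const uint8_t * flags, size_t row_begin, size_t row_end)
    {
        size_t count = 0;
        for (size_t i = row_begin; i < row_end; ++i)
            count += (flags[i] != 0);
        return count;
    }

    int main()
    {
        std::vector<uint8_t> flags{1, 0, 1, 1, 0, 1};
        std::cout << countNonZero(flags.data(), 0, flags.size()) << "\n"; // prints 4
    }
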
diff --git a/ydb/library/arrow_clickhouse/AggregateFunctions/AggregateFunctionMinMaxAny.h b/ydb/library/arrow_clickhouse/AggregateFunctions/AggregateFunctionMinMaxAny.h
new file mode 100644
index 00000000000..5fbfb53170d
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/AggregateFunctions/AggregateFunctionMinMaxAny.h
@@ -0,0 +1,682 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#pragma once
+#include "arrow_clickhouse_types.h"
+
+#include <AggregateFunctions/IAggregateFunction.h>
+
+
+namespace CH
+{
+
+/// For numeric values.
+template <typename ArrowType>
+struct SingleValueDataFixed
+{
+private:
+ using Self = SingleValueDataFixed;
+ using ColumnType = arrow::NumericArray<ArrowType>;
+ using MutableColumnType = arrow::NumericBuilder<ArrowType>;
+
+ bool has_value = false; /// We need to remember if at least one value has been passed. This is necessary for AggregateFunctionIf.
+ typename arrow::TypeTraits<ArrowType>::CType value;
+
+public:
+ static constexpr bool is_any = false;
+
+ bool has() const
+ {
+ return has_value;
+ }
+
+ void insertResultInto(MutableColumn & to) const
+ {
+ if (has())
+ assert_cast<MutableColumnType &>(to).Append(value).ok();
+ else
+ assert_cast<MutableColumnType &>(to).AppendEmptyValue().ok();
+ }
+
+ void change(const IColumn & column, size_t row_num, Arena *)
+ {
+ has_value = true;
+ value = assert_cast<const ColumnType &>(column).Value(row_num);
+ }
+
+ /// Assuming to.has()
+ void change(const Self & to, Arena *)
+ {
+ has_value = true;
+ value = to.value;
+ }
+
+ bool changeFirstTime(const IColumn & column, size_t row_num, Arena * arena)
+ {
+ if (!has())
+ {
+ change(column, row_num, arena);
+ return true;
+ }
+ else
+ return false;
+ }
+
+ bool changeFirstTime(const Self & to, Arena * arena)
+ {
+ if (!has() && to.has())
+ {
+ change(to, arena);
+ return true;
+ }
+ else
+ return false;
+ }
+
+ bool changeEveryTime(const IColumn & column, size_t row_num, Arena * arena)
+ {
+ change(column, row_num, arena);
+ return true;
+ }
+
+ bool changeEveryTime(const Self & to, Arena * arena)
+ {
+ if (to.has())
+ {
+ change(to, arena);
+ return true;
+ }
+ else
+ return false;
+ }
+
+ bool changeIfLess(const IColumn & column, size_t row_num, Arena * arena)
+ {
+ if (!has() || assert_cast<const ColumnType &>(column).Value(row_num) < value)
+ {
+ change(column, row_num, arena);
+ return true;
+ }
+ else
+ return false;
+ }
+
+ bool changeIfLess(const Self & to, Arena * arena)
+ {
+ if (to.has() && (!has() || to.value < value))
+ {
+ change(to, arena);
+ return true;
+ }
+ else
+ return false;
+ }
+
+ bool changeIfGreater(const IColumn & column, size_t row_num, Arena * arena)
+ {
+ if (!has() || assert_cast<const ColumnType &>(column).Value(row_num) > value)
+ {
+ change(column, row_num, arena);
+ return true;
+ }
+ else
+ return false;
+ }
+
+ bool changeIfGreater(const Self & to, Arena * arena)
+ {
+ if (to.has() && (!has() || to.value > value))
+ {
+ change(to, arena);
+ return true;
+ }
+ else
+ return false;
+ }
+
+ bool isEqualTo(const Self & to) const
+ {
+ return has() && to.value == value;
+ }
+
+ bool isEqualTo(const IColumn & column, size_t row_num) const
+ {
+ return has() && assert_cast<const ColumnType &>(column).Value(row_num) == value;
+ }
+
+ static bool allocatesMemoryInArena()
+ {
+ return false;
+ }
+};
+
+
+/** For strings. Short strings are stored in the object itself, and long strings are allocated separately.
+ * NOTE It could also be suitable for arrays of numbers.
+ */
+template <bool is_utf8_string>
+struct SingleValueDataString
+{
+private:
+ using Self = SingleValueDataString<is_utf8_string>;
+ using ColumnType = std::conditional_t<is_utf8_string, ColumnString, ColumnBinary>;
+ using MutableColumnType = std::conditional_t<is_utf8_string, MutableColumnString, MutableColumnBinary>;
+
+ Int32 size = -1; /// -1 indicates that there is no value.
+ Int32 capacity = 0; /// power of two or zero
+ char * large_data;
+
+public:
+ static constexpr Int32 AUTOMATIC_STORAGE_SIZE = 64;
+ static constexpr Int32 MAX_SMALL_STRING_SIZE = AUTOMATIC_STORAGE_SIZE - sizeof(size) - sizeof(capacity) - sizeof(large_data);
+
+private:
+ char small_data[MAX_SMALL_STRING_SIZE]; /// Including the terminating zero.
+
+public:
+ static constexpr bool is_any = false;
+
+ bool has() const
+ {
+ return size >= 0;
+ }
+
+ const char * getData() const
+ {
+ return size <= MAX_SMALL_STRING_SIZE ? small_data : large_data;
+ }
+
+ arrow::util::string_view getStringView() const
+ {
+ if (!has())
+ return {};
+ return arrow::util::string_view(getData(), size);
+ }
+
+ void insertResultInto(MutableColumn & to) const
+ {
+ if (has())
+ assert_cast<MutableColumnType &>(to).Append(getData(), size).ok();
+ else
+ assert_cast<MutableColumnType &>(to).AppendEmptyValue().ok();
+ }
+
+ /// Assuming to.has()
+ void changeImpl(arrow::util::string_view value, Arena * arena)
+ {
+ Int32 value_size = value.size();
+
+ if (value_size <= MAX_SMALL_STRING_SIZE)
+ {
+ /// Don't free large_data here.
+ size = value_size;
+
+ if (size > 0)
+ memcpy(small_data, value.data(), size);
+ }
+ else
+ {
+ if (capacity < value_size)
+ {
+ /// Don't free large_data here.
+ capacity = roundUpToPowerOfTwoOrZero(value_size);
+ large_data = arena->alloc(capacity);
+ }
+
+ size = value_size;
+ memcpy(large_data, value.data(), size);
+ }
+ }
+
+ void change(const IColumn & column, size_t row_num, Arena * arena)
+ {
+ changeImpl(assert_cast<const ColumnType &>(column).Value(row_num), arena);
+ }
+
+ void change(const Self & to, Arena * arena)
+ {
+ changeImpl(to.getStringView(), arena);
+ }
+
+ bool changeFirstTime(const IColumn & column, size_t row_num, Arena * arena)
+ {
+ if (!has())
+ {
+ change(column, row_num, arena);
+ return true;
+ }
+ else
+ return false;
+ }
+
+ bool changeFirstTime(const Self & to, Arena * arena)
+ {
+ if (!has() && to.has())
+ {
+ change(to, arena);
+ return true;
+ }
+ else
+ return false;
+ }
+
+ bool changeEveryTime(const IColumn & column, size_t row_num, Arena * arena)
+ {
+ change(column, row_num, arena);
+ return true;
+ }
+
+ bool changeEveryTime(const Self & to, Arena * arena)
+ {
+ if (to.has())
+ {
+ change(to, arena);
+ return true;
+ }
+ else
+ return false;
+ }
+
+ bool changeIfLess(const IColumn & column, size_t row_num, Arena * arena)
+ {
+ if (!has() || assert_cast<const ColumnType &>(column).Value(row_num) < getStringView())
+ {
+ change(column, row_num, arena);
+ return true;
+ }
+ else
+ return false;
+ }
+
+ bool changeIfLess(const Self & to, Arena * arena)
+ {
+ if (to.has() && (!has() || to.getStringView() < getStringView()))
+ {
+ change(to, arena);
+ return true;
+ }
+ else
+ return false;
+ }
+
+ bool changeIfGreater(const IColumn & column, size_t row_num, Arena * arena)
+ {
+ if (!has() || assert_cast<const ColumnType &>(column).Value(row_num) > getStringView())
+ {
+ change(column, row_num, arena);
+ return true;
+ }
+ else
+ return false;
+ }
+
+ bool changeIfGreater(const Self & to, Arena * arena)
+ {
+ if (to.has() && (!has() || to.getStringView() > getStringView()))
+ {
+ change(to, arena);
+ return true;
+ }
+ else
+ return false;
+ }
+
+ bool isEqualTo(const Self & to) const
+ {
+ return has() && to.getStringView() == getStringView();
+ }
+
+ bool isEqualTo(const IColumn & column, size_t row_num) const
+ {
+ return has() && assert_cast<const ColumnType &>(column).Value(row_num) == getStringView();
+ }
+
+ static bool allocatesMemoryInArena()
+ {
+ return true;
+ }
+};
+
+static_assert(sizeof(SingleValueDataString<false>) == SingleValueDataString<false>::AUTOMATIC_STORAGE_SIZE,
+ "Incorrect size of SingleValueDataString struct");
+static_assert(sizeof(SingleValueDataString<true>) == SingleValueDataString<true>::AUTOMATIC_STORAGE_SIZE,
+ "Incorrect size of SingleValueDataString struct");
+
+
+#if 0
+/// For any other value types.
+struct SingleValueDataGeneric
+{
+private:
+ using Self = SingleValueDataGeneric;
+ static constexpr bool is_any = false;
+
+ Field value;
+
+public:
+ bool has() const
+ {
+ return !value.isNull();
+ }
+
+ void insertResultInto(IColumn & to) const
+ {
+ if (has())
+ to.insert(value);
+ else
+ to.insertDefault();
+ }
+
+ void change(const IColumn & column, size_t row_num, Arena *)
+ {
+ column.get(row_num, value);
+ }
+
+ void change(const Self & to, Arena *)
+ {
+ value = to.value;
+ }
+
+ bool changeFirstTime(const IColumn & column, size_t row_num, Arena * arena)
+ {
+ if (!has())
+ {
+ change(column, row_num, arena);
+ return true;
+ }
+ else
+ return false;
+ }
+
+ bool changeFirstTime(const Self & to, Arena * arena)
+ {
+ if (!has() && to.has())
+ {
+ change(to, arena);
+ return true;
+ }
+ else
+ return false;
+ }
+
+ bool changeEveryTime(const IColumn & column, size_t row_num, Arena * arena)
+ {
+ change(column, row_num, arena);
+ return true;
+ }
+
+ bool changeEveryTime(const Self & to, Arena * arena)
+ {
+ if (to.has())
+ {
+ change(to, arena);
+ return true;
+ }
+ else
+ return false;
+ }
+
+ bool changeIfLess(const IColumn & column, size_t row_num, Arena * arena)
+ {
+ if (!has())
+ {
+ change(column, row_num, arena);
+ return true;
+ }
+ else
+ {
+ Field new_value;
+ column.get(row_num, new_value);
+ if (new_value < value)
+ {
+ value = new_value;
+ return true;
+ }
+ else
+ return false;
+ }
+ }
+
+ bool changeIfLess(const Self & to, Arena * arena)
+ {
+ if (to.has() && (!has() || to.value < value))
+ {
+ change(to, arena);
+ return true;
+ }
+ else
+ return false;
+ }
+
+ bool changeIfGreater(const IColumn & column, size_t row_num, Arena * arena)
+ {
+ if (!has())
+ {
+ change(column, row_num, arena);
+ return true;
+ }
+ else
+ {
+ Field new_value;
+ column.get(row_num, new_value);
+ if (new_value > value)
+ {
+ value = new_value;
+ return true;
+ }
+ else
+ return false;
+ }
+ }
+
+ bool changeIfGreater(const Self & to, Arena * arena)
+ {
+ if (to.has() && (!has() || to.value > value))
+ {
+ change(to, arena);
+ return true;
+ }
+ else
+ return false;
+ }
+
+ bool isEqualTo(const IColumn & column, size_t row_num) const
+ {
+ return has() && value == column[row_num];
+ }
+
+ bool isEqualTo(const Self & to) const
+ {
+ return has() && to.value == value;
+ }
+
+ static bool allocatesMemoryInArena()
+ {
+ return false;
+ }
+};
+#endif
+
+/** The difference between the aggregate functions min, max, any, anyLast
+ * is the condition under which the stored value is replaced by a new one
+ * (as well as, of course, the name).
+ */
+
+template <typename Data>
+struct AggregateFunctionMinData : Data
+{
+ using Self = AggregateFunctionMinData;
+
+ bool changeIfBetter(const IColumn & column, size_t row_num, Arena * arena) { return this->changeIfLess(column, row_num, arena); }
+ bool changeIfBetter(const Self & to, Arena * arena) { return this->changeIfLess(to, arena); }
+};
+
+template <typename Data>
+struct AggregateFunctionMaxData : Data
+{
+ using Self = AggregateFunctionMaxData;
+
+ bool changeIfBetter(const IColumn & column, size_t row_num, Arena * arena) { return this->changeIfGreater(column, row_num, arena); }
+ bool changeIfBetter(const Self & to, Arena * arena) { return this->changeIfGreater(to, arena); }
+};
+
+template <typename Data>
+struct AggregateFunctionAnyData : Data
+{
+ using Self = AggregateFunctionAnyData;
+ static constexpr bool is_any = true;
+
+ bool changeIfBetter(const IColumn & column, size_t row_num, Arena * arena) { return this->changeFirstTime(column, row_num, arena); }
+ bool changeIfBetter(const Self & to, Arena * arena) { return this->changeFirstTime(to, arena); }
+};
+
+template <typename Data>
+class AggregateFunctionsSingleValue final : public IAggregateFunctionDataHelper<Data, AggregateFunctionsSingleValue<Data>>
+{
+ static constexpr bool is_any = Data::is_any;
+
+private:
+ DataTypePtr & type;
+
+public:
+ AggregateFunctionsSingleValue(const DataTypePtr & type_)
+ : IAggregateFunctionDataHelper<Data, AggregateFunctionsSingleValue<Data>>({type_}, {})
+ , type(this->argument_types[0])
+ {
+#if 0
+ if (StringRef(Data::name()) == StringRef("min")
+ || StringRef(Data::name()) == StringRef("max"))
+ {
+ if (!type->isComparable())
+ throw Exception("Illegal type " + type->getName() + " of argument of aggregate function " + getName()
+ + " because the values of that data type are not comparable");
+ }
+#endif
+ }
+
+ DataTypePtr getReturnType() const override
+ {
+ return type;
+ }
+
+ void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * arena) const override
+ {
+ this->data(place).changeIfBetter(*columns[0], row_num, arena);
+ }
+
+ void addBatchSinglePlace(
+ size_t row_begin,
+ size_t row_end,
+ AggregateDataPtr place,
+ const IColumn ** columns,
+ Arena * arena,
+ ssize_t if_argument_pos) const override
+ {
+ if constexpr (is_any)
+ if (this->data(place).has())
+ return;
+ if (if_argument_pos >= 0)
+ {
+ const auto & flags = assert_cast<const ColumnUInt8 &>(*columns[if_argument_pos]).raw_values();
+ for (size_t i = row_begin; i < row_end; ++i)
+ {
+ if (flags[i])
+ {
+ this->data(place).changeIfBetter(*columns[0], i, arena);
+ if constexpr (is_any)
+ break;
+ }
+ }
+ }
+ else
+ {
+ for (size_t i = row_begin; i < row_end; ++i)
+ {
+ this->data(place).changeIfBetter(*columns[0], i, arena);
+ if constexpr (is_any)
+ break;
+ }
+ }
+ }
+
+ void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena * arena) const override
+ {
+ this->data(place).changeIfBetter(this->data(rhs), arena);
+ }
+
+ bool allocatesMemoryInArena() const override
+ {
+ return Data::allocatesMemoryInArena();
+ }
+
+ void insertResultInto(AggregateDataPtr __restrict place, MutableColumn & to, Arena *) const override
+ {
+ this->data(place).insertResultInto(to);
+ }
+};
+
+template <template <typename> typename AggFunc, template <typename> typename AggData>
+inline std::shared_ptr<IAggregateFunction> createAggregateFunctionSingleValue(const DataTypes & argument_types)
+{
+ if (argument_types.size() != 1)
+ return {};
+
+ const DataTypePtr & argument_type = argument_types[0];
+
+ switch (argument_type->id()) {
+ case arrow::Type::INT8:
+ return std::make_shared<AggFunc<AggData<SingleValueDataFixed<arrow::Int8Type>>>>(argument_type);
+ case arrow::Type::INT16:
+ return std::make_shared<AggFunc<AggData<SingleValueDataFixed<arrow::Int16Type>>>>(argument_type);
+ case arrow::Type::INT32:
+ return std::make_shared<AggFunc<AggData<SingleValueDataFixed<arrow::Int32Type>>>>(argument_type);
+ case arrow::Type::INT64:
+ return std::make_shared<AggFunc<AggData<SingleValueDataFixed<arrow::Int64Type>>>>(argument_type);
+ case arrow::Type::UINT8:
+ return std::make_shared<AggFunc<AggData<SingleValueDataFixed<arrow::UInt8Type>>>>(argument_type);
+ case arrow::Type::UINT16:
+ return std::make_shared<AggFunc<AggData<SingleValueDataFixed<arrow::UInt16Type>>>>(argument_type);
+ case arrow::Type::UINT32:
+ return std::make_shared<AggFunc<AggData<SingleValueDataFixed<arrow::UInt32Type>>>>(argument_type);
+ case arrow::Type::UINT64:
+ return std::make_shared<AggFunc<AggData<SingleValueDataFixed<arrow::UInt64Type>>>>(argument_type);
+ case arrow::Type::FLOAT:
+ return std::make_shared<AggFunc<AggData<SingleValueDataFixed<arrow::FloatType>>>>(argument_type);
+ case arrow::Type::DOUBLE:
+ return std::make_shared<AggFunc<AggData<SingleValueDataFixed<arrow::DoubleType>>>>(argument_type);
+ case arrow::Type::TIMESTAMP:
+ return std::make_shared<AggFunc<AggData<SingleValueDataFixed<arrow::TimestampType>>>>(argument_type);
+ case arrow::Type::DURATION:
+ return std::make_shared<AggFunc<AggData<SingleValueDataFixed<arrow::DurationType>>>>(argument_type);
+ case arrow::Type::BINARY:
+ return std::make_shared<AggFunc<AggData<SingleValueDataString<false>>>>(argument_type);
+ case arrow::Type::STRING:
+ return std::make_shared<AggFunc<AggData<SingleValueDataString<true>>>>(argument_type);
+ default:
+ break;
+ }
+
+ //return std::make_shared<AggFunc<AggData<SingleValueDataGeneric>>>(argument_type); // TODO
+ return {};
+}
+
+template <template <typename> typename AggFunc, template <typename> typename AggData>
+class WrappedMinMaxAny final : public ArrowAggregateFunctionWrapper
+{
+public:
+ WrappedMinMaxAny(std::string name)
+ : ArrowAggregateFunctionWrapper(std::move(name))
+ {}
+
+ AggregateFunctionPtr getHouseFunction(const DataTypes & argument_types) override
+ {
+ return createAggregateFunctionSingleValue<AggFunc, AggData>(argument_types);
+ }
+};
+
+using WrappedMin = WrappedMinMaxAny<AggregateFunctionsSingleValue, AggregateFunctionMinData>;
+using WrappedMax = WrappedMinMaxAny<AggregateFunctionsSingleValue, AggregateFunctionMaxData>;
+using WrappedAny = WrappedMinMaxAny<AggregateFunctionsSingleValue, AggregateFunctionAnyData>;
+
+}
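
SingleValueDataString above keeps short values inside its own 64-byte footprint and spills longer ones into arena-allocated memory, reusing the buffer when it is already big enough. A simplified standalone sketch of that layout; plain new[] stands in for the Arena here, an assumption made only to keep the example self-contained:

    #include <cstdint>
    #include <cstring>
    #include <iostream>
    #include <string>

    // Inline buffer for short strings, external buffer (grown to fit) for long ones.
    struct SmallOrLargeString
    {
        static constexpr int32_t kInlineCapacity = 48;

        int32_t size = -1;            // -1 means "no value yet"
        int32_t capacity = 0;         // capacity of the external buffer
        char * large_data = nullptr;  // the arena pointer in the real code
        char small_data[kInlineCapacity];

        bool has() const { return size >= 0; }

        void set(const std::string & value)
        {
            size = static_cast<int32_t>(value.size());
            if (size <= kInlineCapacity)
            {
                std::memcpy(small_data, value.data(), size);
            }
            else
            {
                if (capacity < size)
                {
                    delete[] large_data;
                    capacity = size;
                    large_data = new char[capacity];
                }
                std::memcpy(large_data, value.data(), size);
            }
        }

        std::string get() const
        {
            if (!has())
                return {};
            return std::string(size <= kInlineCapacity ? small_data : large_data, size);
        }

        ~SmallOrLargeString() { delete[] large_data; }
    };

    int main()
    {
        SmallOrLargeString value;
        value.set("short");
        std::cout << value.get() << "\n";         // stays in small_data
        value.set(std::string(100, 'x'));
        std::cout << value.get().size() << "\n";  // 100, held in large_data
    }

The real struct never frees large_data on overwrite because the Arena owns that memory; the destructor above exists only because the sketch manages its own heap allocation.
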
diff --git a/ydb/library/arrow_clickhouse/AggregateFunctions/AggregateFunctionSum.h b/ydb/library/arrow_clickhouse/AggregateFunctions/AggregateFunctionSum.h
new file mode 100644
index 00000000000..52844b95da2
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/AggregateFunctions/AggregateFunctionSum.h
@@ -0,0 +1,302 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#pragma once
+#include "arrow_clickhouse_types.h"
+
+#include <cstring>
+#include <type_traits>
+
+#include <AggregateFunctions/IAggregateFunction.h>
+
+namespace CH
+{
+
+/// Uses addOverflow method (if available) to avoid UB for sumWithOverflow()
+///
+/// Since NO_SANITIZE_UNDEFINED works only for the function itself, without
+/// callers, and in case of non-POD type (i.e. Decimal) you have overwritten
+/// operator+=(), which will have UB.
+template <typename T>
+struct AggregateFunctionSumAddOverflowImpl
+{
+ static void NO_SANITIZE_UNDEFINED ALWAYS_INLINE add(T & lhs, const T & rhs)
+ {
+ lhs += rhs;
+ }
+};
+
+template <typename T>
+struct AggregateFunctionSumData
+{
+ using Impl = AggregateFunctionSumAddOverflowImpl<T>;
+ T sum{};
+
+ void NO_SANITIZE_UNDEFINED ALWAYS_INLINE add(T value)
+ {
+ Impl::add(sum, value);
+ }
+
+ template <typename Value>
+ void NO_SANITIZE_UNDEFINED NO_INLINE addManyImpl(const Value * __restrict ptr, size_t start, size_t end) /// NOLINT
+ {
+ ptr += start;
+ size_t count = end - start;
+ const auto * end_ptr = ptr + count;
+
+ if constexpr (std::is_floating_point_v<T>)
+ {
+ /// Compiler cannot unroll this loop, do it manually.
+ /// (at least for floats, most likely due to the lack of -fassociative-math)
+
+ /// Something around the number of SSE registers * the number of elements fit in register.
+ constexpr size_t unroll_count = 128 / sizeof(T);
+ T partial_sums[unroll_count]{};
+
+ const auto * unrolled_end = ptr + (count / unroll_count * unroll_count);
+
+ while (ptr < unrolled_end)
+ {
+ for (size_t i = 0; i < unroll_count; ++i)
+ Impl::add(partial_sums[i], ptr[i]);
+ ptr += unroll_count;
+ }
+
+ for (size_t i = 0; i < unroll_count; ++i)
+ Impl::add(sum, partial_sums[i]);
+ }
+
+ /// clang cannot vectorize the loop if accumulator is class member instead of local variable.
+ T local_sum{};
+ while (ptr < end_ptr)
+ {
+ Impl::add(local_sum, *ptr);
+ ++ptr;
+ }
+ Impl::add(sum, local_sum);
+ }
+
+ /// Vectorized version
+ template <typename Value>
+ void NO_INLINE addMany(const Value * __restrict ptr, size_t start, size_t end)
+ {
+ addManyImpl(ptr, start, end);
+ }
+
+ template <typename Value, bool add_if_zero>
+ void NO_SANITIZE_UNDEFINED NO_INLINE addManyConditionalInternalImpl(
+ const Value * __restrict ptr,
+ const uint8_t * __restrict condition_map,
+ size_t start,
+ size_t end) /// NOLINT
+ {
+ ptr += start;
+ size_t count = end - start;
+ const auto * end_ptr = ptr + count;
+
+ if constexpr (std::is_integral_v<T>)
+ {
+ /// For integers we can vectorize the operation if we replace the null check using a multiplication (by 0 for null, 1 for not null)
+ /// https://quick-bench.com/q/MLTnfTvwC2qZFVeWHfOBR3U7a8I
+ T local_sum{};
+ while (ptr < end_ptr)
+ {
+ T multiplier = !*condition_map == add_if_zero;
+ Impl::add(local_sum, *ptr * multiplier);
+ ++ptr;
+ ++condition_map;
+ }
+ Impl::add(sum, local_sum);
+ return;
+ }
+
+ if constexpr (std::is_floating_point_v<T>)
+ {
+ /// For floating point we use a similar trick as above, except that now we reinterpret the floating point number as an unsigned
+ /// integer of the same size and use a mask instead (0 to discard, 0xFF..FF to keep)
+ static_assert(sizeof(Value) == 4 || sizeof(Value) == 8);
+ using equivalent_integer = typename std::conditional_t<sizeof(Value) == 4, UInt32, UInt64>;
+
+ constexpr size_t unroll_count = 128 / sizeof(T);
+ T partial_sums[unroll_count]{};
+
+ const auto * unrolled_end = ptr + (count / unroll_count * unroll_count);
+
+ while (ptr < unrolled_end)
+ {
+ for (size_t i = 0; i < unroll_count; ++i)
+ {
+ equivalent_integer value;
+ std::memcpy(&value, &ptr[i], sizeof(Value));
+ value &= (!condition_map[i] != add_if_zero) - 1;
+ Value d;
+ std::memcpy(&d, &value, sizeof(Value));
+ Impl::add(partial_sums[i], d);
+ }
+ ptr += unroll_count;
+ condition_map += unroll_count;
+ }
+
+ for (size_t i = 0; i < unroll_count; ++i)
+ Impl::add(sum, partial_sums[i]);
+ }
+
+ T local_sum{};
+ while (ptr < end_ptr)
+ {
+ if (!*condition_map == add_if_zero)
+ Impl::add(local_sum, *ptr);
+ ++ptr;
+ ++condition_map;
+ }
+ Impl::add(sum, local_sum);
+ }
+
+ /// Vectorized version
+ template <typename Value, bool add_if_zero>
+ void NO_INLINE addManyConditionalInternal(const Value * __restrict ptr, const uint8_t * __restrict condition_map, size_t start, size_t end)
+ {
+ addManyConditionalInternalImpl<Value, add_if_zero>(ptr, condition_map, start, end);
+ }
+
+ template <typename Value>
+ void ALWAYS_INLINE addManyNotNull(const Value * __restrict ptr, const uint8_t * __restrict null_map, size_t start, size_t end)
+ {
+ return addManyConditionalInternal<Value, true>(ptr, null_map, start, end);
+ }
+
+ template <typename Value>
+ void ALWAYS_INLINE addManyConditional(const Value * __restrict ptr, const uint8_t * __restrict cond_map, size_t start, size_t end)
+ {
+ return addManyConditionalInternal<Value, false>(ptr, cond_map, start, end);
+ }
+
+ void NO_SANITIZE_UNDEFINED merge(const AggregateFunctionSumData & rhs)
+ {
+ Impl::add(sum, rhs.sum);
+ }
+
+ T get() const
+ {
+ return sum;
+ }
+};
+
+
+/// Counts the sum of the numbers.
+template <typename T, typename TResult, typename Data>
+class AggregateFunctionSum final : public IAggregateFunctionDataHelper<Data, AggregateFunctionSum<T, TResult, Data>>
+{
+public:
+ using ColumnType = arrow::NumericArray<T>;
+ using MutableColumnType = arrow::NumericBuilder<T>;
+
+ explicit AggregateFunctionSum(const DataTypes & argument_types_)
+ : IAggregateFunctionDataHelper<Data, AggregateFunctionSum<T, TResult, Data>>(argument_types_, {})
+ {}
+
+ AggregateFunctionSum(const IDataType & /*data_type*/, const DataTypes & argument_types_)
+ : IAggregateFunctionDataHelper<Data, AggregateFunctionSum<T, TResult, Data>>(argument_types_, {})
+ {}
+
+ DataTypePtr getReturnType() const override
+ {
+ return std::make_shared<TResult>();
+ }
+
+ bool allocatesMemoryInArena() const override { return false; }
+
+ void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
+ {
+ const auto & column = assert_cast<const ColumnType &>(*columns[0]);
+ this->data(place).add(column.Value(row_num));
+ }
+
+ void addBatchSinglePlace(
+ size_t row_begin,
+ size_t row_end,
+ AggregateDataPtr __restrict place,
+ const IColumn ** columns,
+ Arena *,
+ ssize_t if_argument_pos) const override
+ {
+ const auto & column = assert_cast<const ColumnType &>(*columns[0]);
+ if (if_argument_pos >= 0)
+ {
+ const auto & flags = assert_cast<const ColumnUInt8 &>(*columns[if_argument_pos]).raw_values();
+ this->data(place).addManyConditional(column.raw_values(), flags, row_begin, row_end);
+ }
+ else
+ {
+ this->data(place).addMany(column.raw_values(), row_begin, row_end);
+ }
+ }
+
+ void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
+ {
+ this->data(place).merge(this->data(rhs));
+ }
+
+ void insertResultInto(AggregateDataPtr __restrict place, MutableColumn & to, Arena *) const override
+ {
+ assert_cast<MutableColumnType &>(to).Append(this->data(place).get()).ok();
+ }
+};
+
+class WrappedSum final : public ArrowAggregateFunctionWrapper
+{
+public:
+ template <typename T>
+ using AggregateFunctionSumWithOverflow =
+ AggregateFunctionSum<T, T, AggregateFunctionSumData<typename arrow::TypeTraits<T>::CType>>;
+
+ WrappedSum(std::string name)
+ : ArrowAggregateFunctionWrapper(std::move(name))
+ {}
+
+ AggregateFunctionPtr getHouseFunction(const DataTypes & argument_types) override
+ {
+ return createWithSameType<AggregateFunctionSumWithOverflow>(argument_types);
+ }
+
+ template <template <typename> typename AggFunc>
+ std::shared_ptr<IAggregateFunction> createWithSameType(const DataTypes & argument_types)
+ {
+ if (argument_types.size() != 1)
+ return {};
+
+ const DataTypePtr & type = argument_types[0];
+
+ switch (type->id()) {
+ case arrow::Type::INT8:
+ return std::make_shared<AggFunc<arrow::Int8Type>>(argument_types);
+ case arrow::Type::INT16:
+ return std::make_shared<AggFunc<arrow::Int16Type>>(argument_types);
+ case arrow::Type::INT32:
+ return std::make_shared<AggFunc<arrow::Int32Type>>(argument_types);
+ case arrow::Type::INT64:
+ return std::make_shared<AggFunc<arrow::Int64Type>>(argument_types);
+ case arrow::Type::UINT8:
+ return std::make_shared<AggFunc<arrow::UInt8Type>>(argument_types);
+ case arrow::Type::UINT16:
+ return std::make_shared<AggFunc<arrow::UInt16Type>>(argument_types);
+ case arrow::Type::UINT32:
+ return std::make_shared<AggFunc<arrow::UInt32Type>>(argument_types);
+ case arrow::Type::UINT64:
+ return std::make_shared<AggFunc<arrow::UInt64Type>>(argument_types);
+ case arrow::Type::FLOAT:
+ return std::make_shared<AggFunc<arrow::FloatType>>(argument_types);
+ case arrow::Type::DOUBLE:
+ return std::make_shared<AggFunc<arrow::DoubleType>>(argument_types);
+ case arrow::Type::DURATION:
+ return std::make_shared<AggFunc<arrow::DurationType>>(argument_types);
+ default:
+ break;
+ }
+
+ return {};
+ }
+};
+
+}
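
AggregateFunctionSumData::addManyImpl above unrolls the floating-point path into several independent partial sums so the loop can be vectorized even without -fassociative-math, then folds the partial sums and finishes the tail with a scalar loop. A standalone sketch of that pattern (standard library only):

    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Accumulate into unroll_count independent partial sums, fold them, then
    // handle the tail - the same shape as addManyImpl for floating-point types.
    template <typename T>
    T unrolledSum(const T * ptr, size_t count)
    {
        constexpr size_t unroll_count = 128 / sizeof(T);
        T partial_sums[unroll_count]{};

        const T * unrolled_end = ptr + (count / unroll_count) * unroll_count;
        const T * end = ptr + count;

        while (ptr < unrolled_end)
        {
            for (size_t i = 0; i < unroll_count; ++i)
                partial_sums[i] += ptr[i];
            ptr += unroll_count;
        }

        T sum{};
        for (size_t i = 0; i < unroll_count; ++i)
            sum += partial_sums[i];

        while (ptr < end)   // tail that did not fill a whole unrolled step
            sum += *ptr++;

        return sum;
    }

    int main()
    {
        std::vector<double> values(1000, 0.5);
        std::cout << unrolledSum(values.data(), values.size()) << "\n"; // prints 500
    }
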
diff --git a/ydb/library/arrow_clickhouse/AggregateFunctions/IAggregateFunction.h b/ydb/library/arrow_clickhouse/AggregateFunctions/IAggregateFunction.h
new file mode 100644
index 00000000000..81c425d2494
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/AggregateFunctions/IAggregateFunction.h
@@ -0,0 +1,495 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#pragma once
+#include "arrow_clickhouse_types.h"
+
+#include <cstddef>
+#include <memory>
+#include <vector>
+#include <type_traits>
+
+namespace CH
+{
+
+class Arena;
+class ReadBuffer;
+class WriteBuffer;
+
+using AggregateDataPtr = char *;
+using ConstAggregateDataPtr = const char *;
+
+class IAggregateFunction;
+using AggregateFunctionPtr = std::shared_ptr<const IAggregateFunction>;
+struct AggregateFunctionProperties;
+
+/** Aggregate functions interface.
+ * Instances of classes with this interface do not contain the data itself for aggregation,
+ * but contain only metadata (description) of the aggregate function,
+ * as well as methods for creating, deleting and working with data.
+ * The data resulting from the aggregation (intermediate computing states) is stored in other objects
+ * (which can be created in some memory pool),
+ * and IAggregateFunction is the external interface for manipulating them.
+ */
+class IAggregateFunction : public std::enable_shared_from_this<IAggregateFunction>
+{
+public:
+ IAggregateFunction(const DataTypes & argument_types_, const Array & parameters_)
+ : argument_types(argument_types_), parameters(parameters_) {}
+
+ /// Get the result type.
+ virtual DataTypePtr getReturnType() const = 0;
+#if 0
+ /// Get the data type of internal state. By default it is AggregateFunction(name(params), argument_types...).
+ virtual DataTypePtr getStateType() const;
+#endif
+
+ virtual ~IAggregateFunction() = default;
+
+ /** Data manipulating functions. */
+
+ /** Create empty data for aggregation with `placement new` at the specified location.
+ * You will have to destroy them using the `destroy` method.
+ */
+ virtual void create(AggregateDataPtr __restrict place) const = 0;
+
+ /// Delete data for aggregation.
+ virtual void destroy(AggregateDataPtr __restrict place) const noexcept = 0;
+
+ /// It is not necessary to delete data.
+ virtual bool hasTrivialDestructor() const = 0;
+
+ /// Get `sizeof` of structure with data.
+ virtual size_t sizeOfData() const = 0;
+
+ /// How the data structure should be aligned.
+ virtual size_t alignOfData() const = 0;
+
+ /** Adds a value into aggregation data on which place points to.
+ * columns points to columns containing arguments of aggregation function.
+ * row_num is number of row which should be added.
+ * Additional parameter arena should be used instead of standard memory allocator if the addition requires memory allocation.
+ */
+ virtual void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * arena) const = 0;
+
+ /// Merges state (on which place points to) with other state of current aggregation function.
+ virtual void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena * arena) const = 0;
+#if 0
+ /// Serializes state (to transmit it over the network, for example).
+ virtual void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf) const = 0;
+
+ /// Deserializes state. This function is called only for empty (just created) states.
+ virtual void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, Arena * arena) const = 0;
+#endif
+ /// Returns true if a function requires Arena to handle own states (see add(), merge(), deserialize()).
+ virtual bool allocatesMemoryInArena() const = 0;
+
+ /// Inserts results into a column. This method might modify the state (e.g.
+ /// sort an array), so must be called once, from single thread. The state
+ /// must remain valid though, and the subsequent calls to add/merge/
+ /// insertResultInto must work correctly. This kind of call sequence occurs
+ /// in `runningAccumulate`, or when calculating an aggregate function as a
+ /// window function.
+ virtual void insertResultInto(AggregateDataPtr __restrict place, MutableColumn & to, Arena * arena) const = 0;
+
+ /** Returns true for aggregate functions of type -State
+ * They are executed as other aggregate functions, but not finalized (return an aggregation state that can be combined with another).
+ * Also returns true when the final value of this aggregate function contains State of other aggregate function inside.
+ */
+ virtual bool isState() const { return false; }
+
+ /** The inner loop that uses the function pointer is better than using the virtual function.
+ * The reason is that in the case of virtual functions GCC 5.1.2 generates code,
+ * which, at each iteration of the loop, reloads the function address (the offset value in the virtual function table) from memory to the register.
+ * This gives a performance drop on simple queries around 12%.
+ * After the appearance of better compilers, the code can be removed.
+ */
+ using AddFunc = void (*)(const IAggregateFunction *, AggregateDataPtr, const IColumn **, size_t, Arena *);
+ virtual AddFunc getAddressOfAddFunction() const = 0;
+
+ /** Contains a loop with calls to "add" function. You can collect arguments into array "places"
+ * and do a single call to "addBatch" for devirtualization and inlining.
+ */
+ virtual void addBatch( /// NOLINT
+ size_t row_begin,
+ size_t row_end,
+ AggregateDataPtr * places,
+ size_t place_offset,
+ const IColumn ** columns,
+ Arena * arena,
+ ssize_t if_argument_pos = -1) const = 0;
+
+ virtual void mergeBatch(
+ size_t row_begin,
+ size_t row_end,
+ AggregateDataPtr * places,
+ size_t place_offset,
+ const AggregateDataPtr * rhs,
+ Arena * arena) const = 0;
+
+ /** The same for single place.
+ */
+ virtual void addBatchSinglePlace( /// NOLINT
+ size_t row_begin,
+ size_t row_end,
+ AggregateDataPtr __restrict place,
+ const IColumn ** columns,
+ Arena * arena,
+ ssize_t if_argument_pos = -1) const = 0;
+
+ /** The case when the aggregation key is UInt8
+ * and pointers to aggregation states are stored in AggregateDataPtr[256] lookup table.
+ */
+ virtual void addBatchLookupTable8(
+ size_t row_begin,
+ size_t row_end,
+ AggregateDataPtr * places,
+ size_t place_offset,
+ std::function<void(AggregateDataPtr &)> init,
+ const UInt8 * key,
+ const IColumn ** columns,
+ Arena * arena) const = 0;
+
+ /** Insert result of aggregate function into result column with batch size.
+ * If destroy_place_after_insert is true. Then implementation of this method
+ * must destroy aggregate place if insert state into result column was successful.
+ * All places that were not inserted must be destroyed if there was exception during insert into result column.
+ */
+ virtual void insertResultIntoBatch(
+ size_t row_begin,
+ size_t row_end,
+ AggregateDataPtr * places,
+ size_t place_offset,
+ MutableColumn & to,
+ Arena * arena,
+ bool destroy_place_after_insert) const = 0;
+
+ /** Destroy batch of aggregate places.
+ */
+ virtual void destroyBatch(
+ size_t row_begin,
+ size_t row_end,
+ AggregateDataPtr * places,
+ size_t place_offset) const noexcept = 0;
+
+ const DataTypes & getArgumentTypes() const { return argument_types; }
+ const Array & getParameters() const { return parameters; }
+
+protected:
+ DataTypes argument_types;
+ Array parameters;
+};
+
+
+/// Implement method to obtain an address of 'add' function.
+template <typename Derived>
+class IAggregateFunctionHelper : public IAggregateFunction
+{
+private:
+ static void addFree(const IAggregateFunction * that, AggregateDataPtr place, const IColumn ** columns, size_t row_num, Arena * arena)
+ {
+ static_cast<const Derived &>(*that).add(place, columns, row_num, arena);
+ }
+
+public:
+ IAggregateFunctionHelper(const DataTypes & argument_types_, const Array & parameters_)
+ : IAggregateFunction(argument_types_, parameters_) {}
+
+ AddFunc getAddressOfAddFunction() const override { return &addFree; }
+
+ void addBatch( /// NOLINT
+ size_t row_begin,
+ size_t row_end,
+ AggregateDataPtr * places,
+ size_t place_offset,
+ const IColumn ** columns,
+ Arena * arena,
+ ssize_t if_argument_pos = -1) const override
+ {
+ if (if_argument_pos >= 0)
+ {
+ const auto & flags = assert_cast<const ColumnUInt8 &>(*columns[if_argument_pos]).raw_values();
+ for (size_t i = row_begin; i < row_end; ++i)
+ {
+ if (flags[i] && places[i])
+ static_cast<const Derived *>(this)->add(places[i] + place_offset, columns, i, arena);
+ }
+ }
+ else
+ {
+ for (size_t i = row_begin; i < row_end; ++i)
+ if (places[i])
+ static_cast<const Derived *>(this)->add(places[i] + place_offset, columns, i, arena);
+ }
+ }
+
+ void mergeBatch(
+ size_t row_begin,
+ size_t row_end,
+ AggregateDataPtr * places,
+ size_t place_offset,
+ const AggregateDataPtr * rhs,
+ Arena * arena) const override
+ {
+ for (size_t i = row_begin; i < row_end; ++i)
+ if (places[i])
+ static_cast<const Derived *>(this)->merge(places[i] + place_offset, rhs[i], arena);
+ }
+
+ void addBatchSinglePlace( /// NOLINT
+ size_t row_begin,
+ size_t row_end,
+ AggregateDataPtr __restrict place,
+ const IColumn ** columns,
+ Arena * arena,
+ ssize_t if_argument_pos = -1) const override
+ {
+ if (if_argument_pos >= 0)
+ {
+ const auto & flags = assert_cast<const ColumnUInt8 &>(*columns[if_argument_pos]).raw_values();
+ for (size_t i = row_begin; i < row_end; ++i)
+ {
+ if (flags[i])
+ static_cast<const Derived *>(this)->add(place, columns, i, arena);
+ }
+ }
+ else
+ {
+ for (size_t i = row_begin; i < row_end; ++i)
+ static_cast<const Derived *>(this)->add(place, columns, i, arena);
+ }
+ }
+
+ void addBatchLookupTable8(
+ size_t row_begin,
+ size_t row_end,
+ AggregateDataPtr * map,
+ size_t place_offset,
+ std::function<void(AggregateDataPtr &)> init,
+ const UInt8 * key,
+ const IColumn ** columns,
+ Arena * arena) const override
+ {
+ static constexpr size_t UNROLL_COUNT = 8;
+
+ size_t i = row_begin;
+
+ size_t size_unrolled = (row_end - row_begin) / UNROLL_COUNT * UNROLL_COUNT;
+ for (; i < size_unrolled; i += UNROLL_COUNT)
+ {
+ AggregateDataPtr places[UNROLL_COUNT];
+ for (size_t j = 0; j < UNROLL_COUNT; ++j)
+ {
+ AggregateDataPtr & place = map[key[i + j]];
+ if (unlikely(!place))
+ init(place);
+
+ places[j] = place;
+ }
+
+ for (size_t j = 0; j < UNROLL_COUNT; ++j)
+ static_cast<const Derived *>(this)->add(places[j] + place_offset, columns, i + j, arena);
+ }
+
+ for (; i < row_end; ++i)
+ {
+ AggregateDataPtr & place = map[key[i]];
+ if (unlikely(!place))
+ init(place);
+ static_cast<const Derived *>(this)->add(place + place_offset, columns, i, arena);
+ }
+ }
+
+ void insertResultIntoBatch(
+ size_t row_begin,
+ size_t row_end,
+ AggregateDataPtr * places,
+ size_t place_offset,
+ MutableColumn & to,
+ Arena * arena,
+ bool destroy_place_after_insert) const override
+ {
+ size_t batch_index = row_begin;
+
+ try
+ {
+ for (; batch_index < row_end; ++batch_index)
+ {
+ static_cast<const Derived *>(this)->insertResultInto(places[batch_index] + place_offset, to, arena);
+
+ if (destroy_place_after_insert)
+ static_cast<const Derived *>(this)->destroy(places[batch_index] + place_offset);
+ }
+ }
+ catch (...)
+ {
+ for (size_t destroy_index = batch_index; destroy_index < row_end; ++destroy_index)
+ static_cast<const Derived *>(this)->destroy(places[destroy_index] + place_offset);
+
+ throw;
+ }
+ }
+
+ void destroyBatch(
+ size_t row_begin,
+ size_t row_end,
+ AggregateDataPtr * places,
+ size_t place_offset) const noexcept override
+ {
+ for (size_t i = row_begin; i < row_end; ++i)
+ {
+ static_cast<const Derived *>(this)->destroy(places[i] + place_offset);
+ }
+ }
+};
+
+
+/// Implements several methods for manipulation with data. T - type of structure with data for aggregation.
+template <typename T, typename Derived>
+class IAggregateFunctionDataHelper : public IAggregateFunctionHelper<Derived>
+{
+protected:
+ using Data = T;
+
+ static Data & data(AggregateDataPtr __restrict place) { return *reinterpret_cast<Data *>(place); }
+ static const Data & data(ConstAggregateDataPtr __restrict place) { return *reinterpret_cast<const Data *>(place); }
+
+public:
+ // Derived class can `override` this to flag that DateTime64 is not supported.
+ static constexpr bool DateTime64Supported = true;
+
+ IAggregateFunctionDataHelper(const DataTypes & argument_types_, const Array & parameters_)
+ : IAggregateFunctionHelper<Derived>(argument_types_, parameters_) {}
+
+ void create(AggregateDataPtr __restrict place) const override /// NOLINT
+ {
+ new (place) Data;
+ }
+
+ void destroy(AggregateDataPtr __restrict place) const noexcept override
+ {
+ data(place).~Data();
+ }
+
+ bool hasTrivialDestructor() const override
+ {
+ return std::is_trivially_destructible_v<Data>;
+ }
+
+ size_t sizeOfData() const override
+ {
+ return sizeof(Data);
+ }
+
+ size_t alignOfData() const override
+ {
+ return alignof(Data);
+ }
+
+ void addBatchLookupTable8(
+ size_t row_begin,
+ size_t row_end,
+ AggregateDataPtr * map,
+ size_t place_offset,
+ std::function<void(AggregateDataPtr &)> init,
+ const UInt8 * key,
+ const IColumn ** columns,
+ Arena * arena) const override
+ {
+ const Derived & func = *static_cast<const Derived *>(this);
+
+ /// If the function is complex or too large, use more generic algorithm.
+
+ if (func.allocatesMemoryInArena() || sizeof(Data) > 16 || func.sizeOfData() != sizeof(Data))
+ {
+ IAggregateFunctionHelper<Derived>::addBatchLookupTable8(row_begin, row_end, map, place_offset, init, key, columns, arena);
+ return;
+ }
+
+ /// Will use UNROLL_COUNT number of lookup tables.
+
+ static constexpr size_t UNROLL_COUNT = 4;
+
+ std::unique_ptr<Data[]> places{new Data[256 * UNROLL_COUNT]};
+ bool has_data[256 * UNROLL_COUNT]{}; /// Separate flags array to avoid heavy initialization.
+
+ size_t i = row_begin;
+
+ /// Aggregate data into different lookup tables.
+
+ size_t size_unrolled = (row_end - row_begin) / UNROLL_COUNT * UNROLL_COUNT;
+ for (; i < size_unrolled; i += UNROLL_COUNT)
+ {
+ for (size_t j = 0; j < UNROLL_COUNT; ++j)
+ {
+ size_t idx = j * 256 + key[i + j];
+ if (unlikely(!has_data[idx]))
+ {
+ new (&places[idx]) Data;
+ has_data[idx] = true;
+ }
+ func.add(reinterpret_cast<char *>(&places[idx]), columns, i + j, nullptr);
+ }
+ }
+
+ /// Merge data from every lookup table to the final destination.
+
+ for (size_t k = 0; k < 256; ++k)
+ {
+ for (size_t j = 0; j < UNROLL_COUNT; ++j)
+ {
+ size_t idx = j * 256 + k;
+ if (has_data[idx])
+ {
+ AggregateDataPtr & place = map[k];
+ if (unlikely(!place))
+ init(place);
+
+ func.merge(place + place_offset, reinterpret_cast<const char *>(&places[idx]), nullptr);
+ }
+ }
+ }
+
+ /// Process tails and add directly to the final destination.
+
+ for (; i < row_end; ++i)
+ {
+ size_t k = key[i];
+ AggregateDataPtr & place = map[k];
+ if (unlikely(!place))
+ init(place);
+
+ func.add(place + place_offset, columns, i, nullptr);
+ }
+ }
+};
+
+
+/// Properties of aggregate function that are independent of argument types and parameters.
+struct AggregateFunctionProperties
+{
+ /** When the function is wrapped with the Null combinator,
+ * should we return a Nullable type with NULL when no values were aggregated,
+ * or should we return a non-Nullable type with a default value (example: count, countDistinct)?
+ */
+ bool returns_default_when_only_null = false;
+
+ /** The result varies depending on the data order (example: groupArray).
+ * Some may also call this property "non-commutative".
+ */
+ bool is_order_dependent = false;
+};
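+/// Illustrative (not prescriptive) combinations of these flags: a count-like function would
+/// typically use { returns_default_when_only_null = true, is_order_dependent = false },
+/// while a groupArray-like function would use { returns_default_when_only_null = false, is_order_dependent = true }.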
+
+
+class ArrowAggregateFunctionWrapper : public arrow::compute::ScalarAggregateFunction
+{
+public:
+ ArrowAggregateFunctionWrapper(std::string name)
+ : arrow::compute::ScalarAggregateFunction(std::move(name), arrow::compute::Arity::Unary(), nullptr)
+ {}
+
+ virtual AggregateFunctionPtr getHouseFunction(const DataTypes & argument_types) = 0;
+};
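+/// A minimal sketch of a concrete wrapper (hypothetical names, for illustration only):
+///
+///     class ArrowSumWrapper : public ArrowAggregateFunctionWrapper
+///     {
+///     public:
+///         ArrowSumWrapper() : ArrowAggregateFunctionWrapper("ch.sum") {}
+///
+///         AggregateFunctionPtr getHouseFunction(const DataTypes & argument_types) override
+///         {
+///             /// The factory below is an assumption: it stands for whatever constructs
+///             /// the matching aggregate from AggregateFunctions/ for these argument types.
+///             return std::make_shared<SomeAggregateFunctionSum>(argument_types);
+///         }
+///     };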
+
+}
diff --git a/ydb/library/arrow_clickhouse/AggregationCommon.h b/ydb/library/arrow_clickhouse/AggregationCommon.h
new file mode 100644
index 00000000000..ecd475eacca
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/AggregationCommon.h
@@ -0,0 +1,337 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#pragma once
+#include "arrow_clickhouse_types.h"
+
+#include <array>
+
+#include <Columns/ColumnsCommon.h>
+#include <Common/HashTable/Hash.h>
+#include <Common/memcpySmall.h>
+
+#include <common/StringRef.h>
+
+#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER)
+#include <tmmintrin.h>
+#endif
+
+
+namespace CH
+{
+
+/// When packing the values of nullable columns at a given row, we have to
+/// store the fact that these values are nullable or not. This is achieved
+/// by encoding this information as a bitmap. Let S be the size in bytes of
+/// a packed values binary blob and T the number of bytes we may place into
+/// this blob, the size that the bitmap shall occupy in the blob is equal to:
+/// ceil(T/8). Thus we must have: S = T + ceil(T/8). Below we indicate for
+/// each value of S, the corresponding value of T, and the bitmap size:
+///
+/// 32,28,4
+/// 16,14,2
+/// 8,7,1
+/// 4,3,1
+/// 2,1,1
+///
+
+namespace
+{
+
+template <typename T>
+constexpr auto getBitmapSize()
+{
+ return
+ (sizeof(T) == 32) ?
+ 4 :
+ (sizeof(T) == 16) ?
+ 2 :
+ ((sizeof(T) == 8) ?
+ 1 :
+ ((sizeof(T) == 4) ?
+ 1 :
+ ((sizeof(T) == 2) ?
+ 1 :
+ 0)));
+}
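+/// Worked check of the table above (illustrative): a 16-byte blob (e.g. UInt128) holds 14 key bytes
+/// plus ceil(14/8) = 2 bitmap bytes, hence getBitmapSize<UInt128>() == 2; an 8-byte blob yields 1,
+/// and a 1-byte blob yields 0 (no room for a bitmap).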
+
+}
+
+template<typename T, size_t step>
+void fillFixedBatch(size_t num_rows, const T * source, T * dest)
+{
+ for (size_t i = 0; i < num_rows; ++i)
+ {
+ *dest = *source;
+ ++source;
+ dest += step;
+ }
+}
+
+/// Move keys of size T into binary blob, starting from offset.
+/// It is assumed that offset is aligned to sizeof(T).
+/// Example: sizeof(key) = 16, sizeof(T) = 4, offset = 8
+/// out[0] : [--------****----]
+/// out[1] : [--------****----]
+/// ...
+template<typename T, typename Key>
+void fillFixedBatch(size_t keys_size, const ColumnRawPtrs & key_columns, const Sizes & key_sizes, PaddedPODArray<Key> & out, size_t & offset)
+{
+ for (size_t i = 0; i < keys_size; ++i)
+ {
+ if (key_sizes[i] == sizeof(T))
+ {
+ const auto * column = key_columns[i];
+ size_t num_rows = column->length();
+ out.resize_fill(num_rows);
+#if 0
+ /// Note: here we violate strict aliasing.
+ /// It should be OK as long as we do not refer to any value from `out` before filling.
+ const char * source = assert_cast<const ColumnVectorHelper *>(column)->getRawDataBegin<sizeof(T)>();
+ T * dest = reinterpret_cast<T *>(reinterpret_cast<char *>(out.data()) + offset);
+ fillFixedBatch<T, sizeof(Key) / sizeof(T)>(num_rows, reinterpret_cast<const T *>(source), dest);
+ offset += sizeof(T);
+#else
+ T * dest = reinterpret_cast<T *>(reinterpret_cast<char *>(out.data()) + offset);
+ switch (sizeof(T))
+ {
+ case 1:
+ case 2:
+ case 4:
+ case 8:
+ {
+ const uint8_t * source = assert_cast<const ColumnUInt8 *>(column)->raw_values();
+ fillFixedBatch<T, sizeof(Key) / sizeof(T)>(num_rows, reinterpret_cast<const T *>(source), dest);
+ break;
+ }
+ default:
+ {
+ const uint8_t * source = assert_cast<const ColumnFixedString *>(column)->raw_values();
+ fillFixedBatch<T, sizeof(Key) / sizeof(T)>(num_rows, reinterpret_cast<const T *>(source), dest);
+ break;
+ }
+ }
+ offset += sizeof(T);
+#endif
+ }
+ }
+}
+
+/// Pack a set of fixed-size keys into a binary blob of type T. It is assumed that all the keys
+/// fit into the binary blob. Keys are placed starting from the longest one.
+template <typename T>
+void packFixedBatch(size_t keys_size, const ColumnRawPtrs & key_columns, const Sizes & key_sizes, PaddedPODArray<T> & out)
+{
+ size_t offset = 0;
+ fillFixedBatch<UInt128>(keys_size, key_columns, key_sizes, out, offset);
+ fillFixedBatch<UInt64>(keys_size, key_columns, key_sizes, out, offset);
+ fillFixedBatch<UInt32>(keys_size, key_columns, key_sizes, out, offset);
+ fillFixedBatch<UInt16>(keys_size, key_columns, key_sizes, out, offset);
+ fillFixedBatch<UInt8>(keys_size, key_columns, key_sizes, out, offset);
+}
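+/// Worked example (illustrative): for key sizes {2, 8, 4} packed into UInt128 blobs, the passes
+/// above place the 8-byte key at offset 0, the 4-byte key at offset 8 and the 2-byte key at
+/// offset 12, i.e. keys end up laid out in descending size order regardless of declaration order.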
+
+template <typename T>
+using KeysNullMap = std::array<UInt8, getBitmapSize<T>()>;
+
+/// Pack a set of fixed-size keys into a binary blob of type T. It is assumed that all the keys
+/// fit into the binary blob; they are laid out in it consecutively.
+template <typename T>
+static inline T ALWAYS_INLINE packFixed(
+ size_t i, size_t keys_size, const ColumnRawPtrs & key_columns, const Sizes & key_sizes)
+{
+ T key{};
+ char * bytes = reinterpret_cast<char *>(&key);
+ size_t offset = 0;
+
+ for (size_t j = 0; j < keys_size; ++j)
+ {
+ size_t index = i;
+ const IColumn * column = key_columns[j];
+
+ switch (key_sizes[j])
+ {
+ case 1:
+ {
+ memcpy(bytes + offset, assert_cast<const ColumnUInt8 *>(column)->raw_values() + index, 1);
+ offset += 1;
+ }
+ break;
+ case 2:
+ if constexpr (sizeof(T) >= 2) /// To avoid warning about memcpy exceeding object size.
+ {
+ memcpy(bytes + offset, assert_cast<const ColumnUInt16 *>(column)->raw_values() + index, 2);
+ offset += 2;
+ }
+ break;
+ case 4:
+ if constexpr (sizeof(T) >= 4)
+ {
+ memcpy(bytes + offset, assert_cast<const ColumnUInt32 *>(column)->raw_values() + index, 4);
+ offset += 4;
+ }
+ break;
+ case 8:
+ if constexpr (sizeof(T) >= 8)
+ {
+ memcpy(bytes + offset, assert_cast<const ColumnUInt64 *>(column)->raw_values() + index, 8);
+ offset += 8;
+ }
+ break;
+ default:
+ memcpy(bytes + offset, assert_cast<const ColumnFixedString *>(column)->raw_values() + index * key_sizes[j], key_sizes[j]);
+ offset += key_sizes[j];
+ }
+ }
+
+ return key;
+}
+
+/// Similar to the above, but supports nullable values.
+template <typename T>
+static inline T ALWAYS_INLINE packFixed(
+ size_t i, size_t keys_size, const ColumnRawPtrs & key_columns, const Sizes & key_sizes,
+ const KeysNullMap<T> & bitmap)
+{
+ union
+ {
+ T key;
+ char bytes[sizeof(key)] = {};
+ };
+
+ size_t offset = 0;
+
+ static constexpr auto bitmap_size = std::tuple_size<KeysNullMap<T>>::value;
+ static constexpr bool has_bitmap = bitmap_size > 0;
+
+ if (has_bitmap)
+ {
+ memcpy(bytes + offset, bitmap.data(), bitmap_size * sizeof(UInt8));
+ offset += bitmap_size;
+ }
+
+ for (size_t j = 0; j < keys_size; ++j)
+ {
+ bool is_null;
+
+ if (!has_bitmap)
+ is_null = false;
+ else
+ {
+ size_t bucket = j / 8;
+ size_t off = j % 8;
+ is_null = ((bitmap[bucket] >> off) & 1) == 1;
+ }
+
+ if (is_null)
+ continue;
+
+ switch (key_sizes[j])
+ {
+ case 1:
+ memcpy(bytes + offset, assert_cast<const ColumnUInt8 *>(key_columns[j])->raw_values() + i, 1);
+ offset += 1;
+ break;
+ case 2:
+ memcpy(bytes + offset, assert_cast<const ColumnUInt16 *>(key_columns[j])->raw_values() + i, 2);
+ offset += 2;
+ break;
+ case 4:
+ memcpy(bytes + offset, assert_cast<const ColumnUInt32 *>(key_columns[j])->raw_values() + i, 4);
+ offset += 4;
+ break;
+ case 8:
+ memcpy(bytes + offset, assert_cast<const ColumnUInt64 *>(key_columns[j])->raw_values() + i, 8);
+ offset += 8;
+ break;
+ default:
+ memcpy(bytes + offset, assert_cast<const ColumnFixedString *>(key_columns[j])->raw_values() + i * key_sizes[j], key_sizes[j]);
+ offset += key_sizes[j];
+ }
+ }
+
+ return key;
+}
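+/// Worked example (illustrative): two 4-byte keys packed into UInt64 with the second key NULL:
+/// the 1-byte bitmap 0b00000010 is written at offset 0, the first key's 4 bytes follow at
+/// offset 1, and the NULL key is skipped, so its bytes are never written.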
+
+
+/// Hash a set of keys into a UInt128 value.
+static inline UInt128 ALWAYS_INLINE hash128(size_t row, size_t keys_size, const ColumnRawPtrs & key_columns)
+{
+ UInt128 key;
+ SipHash hash;
+
+ for (size_t j = 0; j < keys_size; ++j)
+ updateHashWithValue(*key_columns[j], row, hash);
+
+ hash.get128(key);
+ return key;
+}
+
+
+/// Copy the keys to the pool. Then put StringRefs to them into the pool and return a pointer to the first one.
+static inline StringRef * ALWAYS_INLINE placeKeysInPool(
+ size_t keys_size, StringRefs & keys, Arena & pool)
+{
+ for (size_t j = 0; j < keys_size; ++j)
+ {
+ char * place = pool.alloc(keys[j].size);
+ memcpySmallAllowReadWriteOverflow15(place, keys[j].data, keys[j].size);
+ keys[j].data = place;
+ }
+
+ /// Place the StringRefs on the newly copied keys in the pool.
+ char * res = pool.alignedAlloc(keys_size * sizeof(StringRef), alignof(StringRef));
+ memcpySmallAllowReadWriteOverflow15(res, keys.data(), keys_size * sizeof(StringRef));
+
+ return reinterpret_cast<StringRef *>(res);
+}
+
+
+/** Serialize keys into a continuous chunk of memory.
+ */
+static inline StringRef ALWAYS_INLINE serializeKeysToPoolContiguous(
+ size_t row, size_t keys_size, const ColumnRawPtrs & key_columns, Arena & pool)
+{
+ const char * begin = nullptr;
+
+ size_t sum_size = 0;
+ for (size_t j = 0; j < keys_size; ++j)
+ sum_size += serializeValueIntoArena(*key_columns[j], row, pool, begin).size;
+
+ return {begin, sum_size};
+}
+
+
+/** Pack elements with shuffle instruction.
+ * See the explanation in ColumnsHashing.h
+ */
+#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER)
+template <typename T>
+static T inline packFixedShuffle(
+ const char * __restrict * __restrict srcs,
+ size_t num_srcs,
+ const size_t * __restrict elem_sizes,
+ size_t idx,
+ const uint8_t * __restrict masks)
+{
+ assert(num_srcs > 0);
+
+ __m128i res = _mm_shuffle_epi8(
+ _mm_loadu_si128(reinterpret_cast<const __m128i *>(srcs[0] + elem_sizes[0] * idx)),
+ _mm_loadu_si128(reinterpret_cast<const __m128i *>(masks)));
+
+ for (size_t i = 1; i < num_srcs; ++i)
+ {
+ res = _mm_xor_si128(res,
+ _mm_shuffle_epi8(
+ _mm_loadu_si128(reinterpret_cast<const __m128i *>(srcs[i] + elem_sizes[i] * idx)),
+ _mm_loadu_si128(reinterpret_cast<const __m128i *>(&masks[i * sizeof(T)]))));
+ }
+
+ T out;
+ __builtin_memcpy(&out, &res, sizeof(T));
+ return out;
+}
+#endif
+
+}
diff --git a/ydb/library/arrow_clickhouse/Aggregator.cpp b/ydb/library/arrow_clickhouse/Aggregator.cpp
new file mode 100644
index 00000000000..5a6a89befe9
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/Aggregator.cpp
@@ -0,0 +1,1554 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#include "Aggregator.h"
+#include <DataStreams/IBlockInputStream.h>
+
+
+namespace CH
+{
+
+AggregatedDataVariants::~AggregatedDataVariants()
+{
+ if (aggregator && !aggregator->all_aggregates_has_trivial_destructor)
+ {
+ try
+ {
+ aggregator->destroyAllAggregateStates(*this);
+ }
+ catch (...)
+ {
+ //tryLogCurrentException(__PRETTY_FUNCTION__);
+ }
+ }
+}
+
+Header Aggregator::getHeader(bool final) const
+{
+ return params.getHeader(final);
+}
+
+Header Aggregator::Params::getHeader(
+ const Header & src_header,
+ const Header & intermediate_header,
+ const ColumnNumbers & keys,
+ const AggregateDescriptions & aggregates,
+ bool final)
+{
+ ColumnsWithTypeAndName fields;
+ if (intermediate_header)
+ {
+ fields = intermediate_header->fields();
+
+ if (final)
+ {
+ for (const auto & aggregate : aggregates)
+ {
+ int agg_pos = intermediate_header->GetFieldIndex(aggregate.column_name);
+ DataTypePtr type = aggregate.function->getReturnType();
+
+ fields[agg_pos] = std::make_shared<ColumnWithTypeAndName>(aggregate.column_name, type);
+ }
+ }
+ }
+ else
+ {
+ fields.reserve(keys.size() + aggregates.size());
+
+ for (const auto & key : keys)
+ fields.push_back(src_header->field(key));
+
+ for (const auto & aggregate : aggregates)
+ {
+ size_t arguments_size = aggregate.arguments.size();
+ DataTypes argument_types(arguments_size);
+ for (size_t j = 0; j < arguments_size; ++j)
+ argument_types[j] = src_header->field(aggregate.arguments[j])->type();
+
+ DataTypePtr type;
+ if (final)
+ type = aggregate.function->getReturnType();
+ else
+ type = std::make_shared<DataTypeAggregateFunction>(
+ aggregate.function, argument_types, aggregate.parameters);
+
+ fields.emplace_back(std::make_shared<ColumnWithTypeAndName>(aggregate.column_name, type));
+ }
+ }
+ return std::make_shared<arrow::Schema>(fields);
+}
+
+
+Aggregator::Aggregator(const Params & params_)
+ : params(params_)
+{
+ aggregate_functions.resize(params.aggregates_size);
+ for (size_t i = 0; i < params.aggregates_size; ++i)
+ aggregate_functions[i] = params.aggregates[i].function.get();
+
+ /// Initialize sizes of aggregation states and its offsets.
+ offsets_of_aggregate_states.resize(params.aggregates_size);
+ total_size_of_aggregate_states = 0;
+ all_aggregates_has_trivial_destructor = true;
+
+ // aggregate_states will be aligned as below:
+ // |<-- state_1 -->|<-- pad_1 -->|<-- state_2 -->|<-- pad_2 -->| .....
+ //
+ // pad_N is used to satisfy the alignment requirement of the next state.
+ // The address of state_1 is aligned based on the maximum alignment requirement among the states.
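+ // Worked example (illustrative): states of sizes 12 and 24, both with alignment 8,
+ // get offsets 0 and 16 (12 is rounded up to the next multiple of 8 before placing
+ // the second state), and total_size_of_aggregate_states ends up as 40.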
+ for (size_t i = 0; i < params.aggregates_size; ++i)
+ {
+ offsets_of_aggregate_states[i] = total_size_of_aggregate_states;
+
+ total_size_of_aggregate_states += params.aggregates[i].function->sizeOfData();
+
+ // aggregate states are aligned based on maximum requirement
+ align_aggregate_states = std::max(align_aggregate_states, params.aggregates[i].function->alignOfData());
+
+ // If this is not the last aggregate_state, pad it so that the next aggregate_state is aligned.
+ if (i + 1 < params.aggregates_size)
+ {
+ size_t alignment_of_next_state = params.aggregates[i + 1].function->alignOfData();
+ if ((alignment_of_next_state & (alignment_of_next_state - 1)) != 0)
+ throw Exception("Logical error: alignOfData is not 2^N");
+
+ /// Extend total_size to next alignment requirement
+ /// Add padding by rounding up 'total_size_of_aggregate_states' to be a multiplier of alignment_of_next_state.
+ total_size_of_aggregate_states = (total_size_of_aggregate_states + alignment_of_next_state - 1) / alignment_of_next_state * alignment_of_next_state;
+ }
+
+ if (!params.aggregates[i].function->hasTrivialDestructor())
+ all_aggregates_has_trivial_destructor = false;
+ }
+
+ method_chosen = chooseAggregationMethod();
+ HashMethodContext::Settings cache_settings;
+ cache_settings.max_threads = params.max_threads;
+ aggregation_state_cache = AggregatedDataVariants::createCache(method_chosen, cache_settings);
+}
+
+
+AggregatedDataVariants::Type Aggregator::chooseAggregationMethod()
+{
+ /// If there are no keys, everything is aggregated into a single row.
+ if (params.keys_size == 0)
+ return AggregatedDataVariants::Type::without_key;
+
+ auto& header = (params.src_header ? params.src_header : params.intermediate_header);
+
+ DataTypes types;
+ types.reserve(params.keys_size);
+ for (const auto & pos : params.keys)
+ types.push_back(header->field(pos)->type());
+
+ size_t keys_bytes = 0;
+ size_t num_fixed_contiguous_keys = 0;
+
+ key_sizes.resize(params.keys_size);
+ for (size_t j = 0; j < params.keys.size(); ++j)
+ {
+ if (size_t fixed_size = fixedContiguousSize(types[j]))
+ {
+ ++num_fixed_contiguous_keys;
+ key_sizes[j] = fixed_size;
+ keys_bytes += fixed_size;
+ }
+ }
+
+ //if (has_nullable_key)
+ {
+ if (params.keys_size == num_fixed_contiguous_keys)
+ {
+ /// If possible, pack all the keys, along with information about which key values are nulls,
+ /// into a fixed 16- or 32-byte blob.
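+ /// E.g. (illustrative) two fixed 4-byte keys: keys_bytes = 8 and the UInt128 null bitmap
+ /// takes 2 bytes, so 2 + 8 <= 16 and nullable_keys128 is chosen.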
+ if (std::tuple_size<KeysNullMap<UInt128>>::value + keys_bytes <= 16)
+ return AggregatedDataVariants::Type::nullable_keys128;
+ if (std::tuple_size<KeysNullMap<UInt256>>::value + keys_bytes <= 32)
+ return AggregatedDataVariants::Type::nullable_keys256;
+ }
+
+ /// Fallback case.
+ return AggregatedDataVariants::Type::serialized;
+ }
+
+#if 0 // TODO: keys with explicit NOT NULL
+ /// No key has been found to be nullable.
+
+ /// Single numeric key.
+ if (params.keys_size == 1 && types[0]->isValueRepresentedByNumber())
+ {
+ size_t size_of_field = types[0]->getSizeOfValueInMemory();
+
+ if (size_of_field == 1)
+ return AggregatedDataVariants::Type::key8;
+ if (size_of_field == 2)
+ return AggregatedDataVariants::Type::key16;
+ if (size_of_field == 4)
+ return AggregatedDataVariants::Type::key32;
+ if (size_of_field == 8)
+ return AggregatedDataVariants::Type::key64;
+ if (size_of_field == 16)
+ return AggregatedDataVariants::Type::keys128;
+ if (size_of_field == 32)
+ return AggregatedDataVariants::Type::keys256;
+ throw Exception("Logical error: numeric column has sizeOfField not in 1, 2, 4, 8, 16, 32.");
+ }
+
+ if (params.keys_size == 1 && isFixedString(types[0]))
+ {
+ return AggregatedDataVariants::Type::key_fixed_string;
+ }
+
+ /// If all keys fit in N bits, use a hash table with all keys packed (placed contiguously) into a single N-bit key.
+ if (params.keys_size == num_fixed_contiguous_keys)
+ {
+ if (keys_bytes <= 2)
+ return AggregatedDataVariants::Type::keys16;
+ if (keys_bytes <= 4)
+ return AggregatedDataVariants::Type::keys32;
+ if (keys_bytes <= 8)
+ return AggregatedDataVariants::Type::keys64;
+ if (keys_bytes <= 16)
+ return AggregatedDataVariants::Type::keys128;
+ if (keys_bytes <= 32)
+ return AggregatedDataVariants::Type::keys256;
+ }
+
+ /// If there is a single string key, use a hash table with references to it. The strings themselves are stored separately in the Arena.
+ if (params.keys_size == 1 && isString(types[0]))
+ {
+ return AggregatedDataVariants::Type::key_string;
+ }
+
+ return AggregatedDataVariants::Type::serialized;
+#endif
+}
+
+void Aggregator::createAggregateStates(AggregateDataPtr & aggregate_data) const
+{
+ for (size_t j = 0; j < params.aggregates_size; ++j)
+ {
+ try
+ {
+ /** An exception may occur if there is a shortage of memory.
+ * To ensure that everything is properly destroyed in that case, we "roll back" the states created so far.
+ * The code is not very convenient.
+ */
+ aggregate_functions[j]->create(aggregate_data + offsets_of_aggregate_states[j]);
+ }
+ catch (...)
+ {
+ for (size_t rollback_j = 0; rollback_j < j; ++rollback_j)
+ {
+ aggregate_functions[rollback_j]->destroy(aggregate_data + offsets_of_aggregate_states[rollback_j]);
+ }
+
+ throw;
+ }
+ }
+}
+
+/** It's interesting: if you remove `noinline`, then gcc for some reason inlines this function, and performance decreases (~10%).
+ * (Probably because after inlining this function, more internal functions are no longer inlined.)
+ * Inlining does not make sense, since the inner loop is entirely inside this function.
+ */
+template <typename Method>
+void NO_INLINE Aggregator::executeImpl(
+ Method & method,
+ Arena * aggregates_pool,
+ size_t row_begin,
+ size_t row_end,
+ ColumnRawPtrs & key_columns,
+ AggregateFunctionInstruction * aggregate_instructions,
+ bool no_more_keys,
+ AggregateDataPtr overflow_row) const
+{
+ typename Method::State state(key_columns, key_sizes, aggregation_state_cache);
+
+ if (!no_more_keys)
+ {
+ executeImplBatch<false>(method, state, aggregates_pool, row_begin, row_end, aggregate_instructions, overflow_row);
+ }
+ else
+ {
+ executeImplBatch<true>(method, state, aggregates_pool, row_begin, row_end, aggregate_instructions, overflow_row);
+ }
+}
+
+template <bool no_more_keys, typename Method>
+void NO_INLINE Aggregator::executeImplBatch(
+ Method & method,
+ typename Method::State & state,
+ Arena * aggregates_pool,
+ size_t row_begin,
+ size_t row_end,
+ AggregateFunctionInstruction * aggregate_instructions,
+ AggregateDataPtr overflow_row) const
+{
+ /// Optimization for the special case when there are no aggregate functions.
+ if (params.aggregates_size == 0)
+ {
+ if constexpr (no_more_keys)
+ return;
+
+ /// For all rows.
+ AggregateDataPtr place = aggregates_pool->alloc(0);
+ for (size_t i = row_begin; i < row_end; ++i)
+ state.emplaceKey(method.data, i, *aggregates_pool).setMapped(place);
+ return;
+ }
+
+ /// Optimization for the special case of aggregating by an 8-bit key.
+ if constexpr (!no_more_keys && std::is_same_v<Method, typename decltype(AggregatedDataVariants::key8)::element_type>)
+ {
+ //if (!has_arrays && !hasSparseArguments(aggregate_instructions))
+ {
+ for (AggregateFunctionInstruction * inst = aggregate_instructions; inst->that; ++inst)
+ {
+ inst->batch_that->addBatchLookupTable8(
+ row_begin,
+ row_end,
+ reinterpret_cast<AggregateDataPtr *>(method.data.data()),
+ inst->state_offset,
+ [&](AggregateDataPtr & aggregate_data)
+ {
+ aggregate_data = aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states);
+ createAggregateStates(aggregate_data);
+ },
+ state.getKeyData(),
+ inst->batch_arguments,
+ aggregates_pool);
+ }
+ return;
+ }
+ }
+
+ /// NOTE: only row_end - row_begin elements are actually required, but:
+ /// - this affects only optimize_aggregation_in_order,
+ /// - this is just a pointer array, so it should not be significant,
+ /// - and it would also require other changes in the interface.
+ std::unique_ptr<AggregateDataPtr[]> places(new AggregateDataPtr[row_end]);
+
+ /// For all rows.
+ for (size_t i = row_begin; i < row_end; ++i)
+ {
+ AggregateDataPtr aggregate_data = nullptr;
+
+ if constexpr (!no_more_keys)
+ {
+ auto emplace_result = state.emplaceKey(method.data, i, *aggregates_pool);
+
+ /// If a new key is inserted, initialize the states of the aggregate functions, and possibly something related to the key.
+ if (emplace_result.isInserted())
+ {
+ /// Exception safety: if we cannot allocate memory or create states, destructors will not be called.
+ emplace_result.setMapped(nullptr);
+
+ aggregate_data = aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states);
+
+ {
+ createAggregateStates(aggregate_data);
+ }
+
+ emplace_result.setMapped(aggregate_data);
+ }
+ else
+ aggregate_data = emplace_result.getMapped();
+
+ assert(aggregate_data != nullptr);
+ }
+ else
+ {
+ /// Add only if the key already exists.
+ auto find_result = state.findKey(method.data, i, *aggregates_pool);
+ if (find_result.isFound())
+ aggregate_data = find_result.getMapped();
+ else
+ aggregate_data = overflow_row;
+ }
+
+ places[i] = aggregate_data;
+ }
+
+ /// Add values to the aggregate functions.
+ for (size_t i = 0; i < aggregate_functions.size(); ++i)
+ {
+ AggregateFunctionInstruction * inst = aggregate_instructions + i;
+
+ inst->batch_that->addBatch(row_begin, row_end, places.get(), inst->state_offset, inst->batch_arguments, aggregates_pool);
+ }
+}
+
+void NO_INLINE Aggregator::executeWithoutKeyImpl(
+ AggregatedDataWithoutKey & res,
+ size_t row_begin, size_t row_end,
+ AggregateFunctionInstruction * aggregate_instructions,
+ Arena * arena) const
+{
+ if (row_begin == row_end)
+ return;
+
+ /// Adding values
+ for (size_t i = 0; i < aggregate_functions.size(); ++i)
+ {
+ AggregateFunctionInstruction * inst = aggregate_instructions + i;
+
+ inst->batch_that->addBatchSinglePlace(
+ row_begin, row_end,
+ res + inst->state_offset,
+ inst->batch_arguments,
+ arena);
+ }
+}
+
+void Aggregator::prepareAggregateInstructions(Columns columns, AggregateColumns & aggregate_columns, Columns & materialized_columns,
+ AggregateFunctionInstructions & aggregate_functions_instructions) const
+{
+ for (size_t i = 0; i < params.aggregates_size; ++i)
+ aggregate_columns[i].resize(params.aggregates[i].arguments.size());
+
+ aggregate_functions_instructions.resize(params.aggregates_size + 1);
+ aggregate_functions_instructions[params.aggregates_size].that = nullptr;
+
+ for (size_t i = 0; i < params.aggregates_size; ++i)
+ {
+ for (size_t j = 0; j < aggregate_columns[i].size(); ++j)
+ {
+ materialized_columns.push_back(columns.at(params.aggregates[i].arguments[j]));
+ aggregate_columns[i][j] = materialized_columns.back().get();
+ }
+
+ aggregate_functions_instructions[i].arguments = aggregate_columns[i].data();
+ aggregate_functions_instructions[i].state_offset = offsets_of_aggregate_states[i];
+
+ const auto * that = aggregate_functions[i];
+
+ aggregate_functions_instructions[i].that = that;
+ aggregate_functions_instructions[i].batch_arguments = aggregate_columns[i].data();
+
+ aggregate_functions_instructions[i].batch_that = that;
+ }
+}
+
+
+bool Aggregator::executeOnBlock(const Block & block,
+ AggregatedDataVariants & result,
+ ColumnRawPtrs & key_columns,
+ AggregateColumns & aggregate_columns,
+ bool & no_more_keys) const
+{
+ return executeOnBlock(block->columns(),
+ /* row_begin= */ 0, block->num_rows(),
+ result,
+ key_columns,
+ aggregate_columns,
+ no_more_keys);
+}
+
+
+bool Aggregator::executeOnBlock(Columns columns,
+ size_t row_begin, size_t row_end,
+ AggregatedDataVariants & result,
+ ColumnRawPtrs & key_columns,
+ AggregateColumns & aggregate_columns,
+ bool & no_more_keys) const
+{
+ /// `result` will destroy the states of aggregate functions in the destructor
+ result.aggregator = this;
+
+ /// How to perform the aggregation?
+ if (result.empty())
+ {
+ result.init(method_chosen);
+ result.keys_size = params.keys_size;
+ result.key_sizes = key_sizes;
+ }
+
+ /** Constant columns are not supported directly during aggregation.
+ * To make them work anyway, we materialize them.
+ */
+ Columns materialized_columns;
+
+ /// Remember the columns we will work with
+ for (size_t i = 0; i < params.keys_size; ++i)
+ {
+ materialized_columns.push_back(columns.at(params.keys[i]));
+ key_columns[i] = materialized_columns.back().get();
+ }
+
+ //NestedColumnsHolder nested_columns_holder;
+ AggregateFunctionInstructions aggregate_functions_instructions;
+ prepareAggregateInstructions(columns, aggregate_columns, materialized_columns, aggregate_functions_instructions);
+
+ if ((params.overflow_row || result.type == AggregatedDataVariants::Type::without_key) && !result.without_key)
+ {
+ AggregateDataPtr place = result.aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states);
+ createAggregateStates(place);
+ result.without_key = place;
+ }
+
+ /// We select one of the aggregation methods and call it.
+
+ /// For the case when there are no keys (all aggregate into one row).
+ if (result.type == AggregatedDataVariants::Type::without_key)
+ {
+ executeWithoutKeyImpl(result.without_key, row_begin, row_end, aggregate_functions_instructions.data(), result.aggregates_pool);
+ }
+ else
+ {
+ /// This is where data is written that does not fit in `max_rows_to_group_by` with `group_by_overflow_mode = any`.
+ AggregateDataPtr overflow_row_ptr = params.overflow_row ? result.without_key : nullptr;
+
+ #define M(NAME) \
+ else if (result.type == AggregatedDataVariants::Type::NAME) \
+ executeImpl(*result.NAME, result.aggregates_pool, row_begin, row_end, key_columns, aggregate_functions_instructions.data(), \
+ no_more_keys, overflow_row_ptr);
+
+ if (false) {} // NOLINT
+ APPLY_FOR_AGGREGATED_VARIANTS(M)
+ #undef M
+ }
+
+ size_t result_size = result.sizeWithoutOverflowRow();
+
+ /// Checking the constraints.
+ if (!checkLimits(result_size, no_more_keys))
+ return false;
+
+ return true;
+}
+
+
+template <typename Method>
+Block Aggregator::convertOneBucketToBlock(
+ AggregatedDataVariants & data_variants,
+ Method & method,
+ Arena * arena,
+ bool final,
+ size_t bucket) const
+{
+ Block block = prepareBlockAndFill(data_variants, final, method.data.impls[bucket].size(),
+ [bucket, &method, arena, this] (
+ MutableColumns & key_columns,
+ AggregateColumnsData & aggregate_columns,
+ MutableColumns & final_aggregate_columns,
+ bool final_)
+ {
+ convertToBlockImpl(method, method.data.impls[bucket],
+ key_columns, aggregate_columns, final_aggregate_columns, arena, final_);
+ });
+
+ //block.info.bucket_num = bucket;
+ return block;
+}
+
+
+bool Aggregator::checkLimits(size_t result_size, bool & no_more_keys) const
+{
+ if (!no_more_keys && params.max_rows_to_group_by && result_size > params.max_rows_to_group_by)
+ {
+ switch (params.group_by_overflow_mode)
+ {
+ case OverflowMode::THROW:
+ throw Exception("Limit for rows to GROUP BY exceeded");
+
+ case OverflowMode::BREAK:
+ return false;
+
+ case OverflowMode::ANY:
+ no_more_keys = true;
+ break;
+ }
+ }
+
+ return true;
+}
+
+void Aggregator::execute(const BlockInputStreamPtr & stream, AggregatedDataVariants & result)
+{
+ ColumnRawPtrs key_columns(params.keys_size);
+ AggregateColumns aggregate_columns(params.aggregates_size);
+
+ /** Used if there is a limit on the maximum number of rows in the aggregation,
+ * and if group_by_overflow_mode == ANY.
+ * In this case, new keys are not added to the set; aggregation is performed only for
+ * keys that have already made it into the set.
+ */
+ bool no_more_keys = false;
+
+ /// Read all the data
+ while (Block block = stream->read())
+ {
+ if (!executeOnBlock(block, result, key_columns, aggregate_columns, no_more_keys))
+ break;
+ }
+
+ /// If there was no data and we are aggregating without keys, we must return a single row with the result of aggregating an empty set.
+ /// To do this, we pass a block with zero rows to aggregate.
+ if (result.empty() && params.keys_size == 0 && !params.empty_result_for_aggregation_by_empty_set)
+ {
+ auto emptyColumns = columnsFromHeader(stream->getHeader());
+ executeOnBlock(emptyColumns, 0, 0, result, key_columns, aggregate_columns, no_more_keys);
+ }
+}
+
+
+template <typename Method, typename Table>
+void Aggregator::convertToBlockImpl(
+ Method & method,
+ Table & data,
+ MutableColumns & key_columns,
+ AggregateColumnsData & aggregate_columns,
+ MutableColumns & final_aggregate_columns,
+ Arena * arena,
+ bool final) const
+{
+ if (data.empty())
+ return;
+
+ if (key_columns.size() != params.keys_size)
+ throw Exception{"Aggregate. Unexpected key columns size."};
+
+ if (final)
+ convertToBlockImplFinal<Method>(method, data, key_columns, final_aggregate_columns, arena);
+ else
+ convertToBlockImplNotFinal(method, data, key_columns, aggregate_columns);
+
+ /// In order to release memory early.
+ data.clearAndShrink();
+}
+
+
+template <typename Mapped>
+inline void Aggregator::insertAggregatesIntoColumns(
+ Mapped & mapped,
+ MutableColumns & final_aggregate_columns,
+ Arena * arena) const
+{
+ /** Final values of aggregate functions are inserted into the columns.
+ * Then the states of aggregate functions that are no longer needed are destroyed.
+ *
+ * We mark already destroyed states with "nullptr" in data,
+ * so they will not be destroyed in the destructor of Aggregator
+ * (other values will be destroyed in the destructor in case of an exception).
+ *
+ * This is tricky, because multiple aggregate states are referenced by a single pointer in data.
+ * So, if an exception is thrown in the middle of moving states for different aggregate functions,
+ * we have to catch it and destroy all the states that are no longer needed,
+ * to keep the data in a consistent state.
+ *
+ * It is also tricky because there are aggregate functions with the "-State" modifier.
+ * When we call "insertResultInto" for them, they insert a pointer to the state into ColumnAggregateFunction,
+ * and ColumnAggregateFunction takes ownership of this state.
+ * So, for aggregate functions with the "-State" modifier, the state must not be destroyed
+ * after it has been transferred to ColumnAggregateFunction.
+ * But we should mark that the data no longer owns these states.
+ */
+
+ size_t insert_i = 0;
+ std::exception_ptr exception;
+
+ try
+ {
+ /// Insert final values of aggregate functions into columns.
+ for (; insert_i < params.aggregates_size; ++insert_i)
+ aggregate_functions[insert_i]->insertResultInto(
+ mapped + offsets_of_aggregate_states[insert_i],
+ *final_aggregate_columns[insert_i],
+ arena);
+ }
+ catch (...)
+ {
+ exception = std::current_exception();
+ }
+
+ /** Destroy states that are no longer needed. This loop does not throw.
+ *
+ * Don't destroy states of "-State" aggregate functions,
+ * because ownership of such a state is transferred to ColumnAggregateFunction,
+ * and ColumnAggregateFunction will take care of it.
+ *
+ * But this only applies to states that have been transferred to ColumnAggregateFunction
+ * before the exception was thrown.
+ */
+ for (size_t destroy_i = 0; destroy_i < params.aggregates_size; ++destroy_i)
+ {
+ /// If ownership was not transferred to ColumnAggregateFunction.
+ if (!(destroy_i < insert_i && aggregate_functions[destroy_i]->isState()))
+ aggregate_functions[destroy_i]->destroy(
+ mapped + offsets_of_aggregate_states[destroy_i]);
+ }
+
+ /// Mark the cell as destroyed so it will not be destroyed in destructor.
+ mapped = nullptr;
+
+ if (exception)
+ std::rethrow_exception(exception);
+}
+
+
+template <typename Method, typename Table>
+void NO_INLINE Aggregator::convertToBlockImplFinal(
+ Method & method,
+ Table & data,
+ const MutableColumns & key_columns,
+ MutableColumns & final_aggregate_columns,
+ Arena * arena) const
+{
+#if 0 // TODO: enable shuffle in AggregationMethodKeysFixed
+ std::vector<MutableColumn *> raw_key_columns;
+ raw_key_columns.reserve(key_columns.size());
+ for (auto & column : key_columns)
+ raw_key_columns.push_back(column.get());
+
+ //auto shuffled_key_sizes = method.shuffleKeyColumns(key_columns, key_sizes);
+ //const auto & key_sizes_ref = shuffled_key_sizes ? *shuffled_key_sizes : key_sizes;
+#endif
+ const auto & key_sizes_ref = key_sizes;
+
+ PaddedPODArray<AggregateDataPtr> places;
+ places.reserve(data.size());
+
+ data.forEachValue([&](const auto & key, auto & mapped)
+ {
+ method.insertKeyIntoColumns(key, key_columns, key_sizes_ref);
+ places.emplace_back(mapped);
+
+ /// Mark the cell as destroyed so it will not be destroyed in the destructor.
+ mapped = nullptr;
+ });
+
+ std::exception_ptr exception;
+ size_t aggregate_functions_destroy_index = 0;
+
+ try
+ {
+ for (; aggregate_functions_destroy_index < params.aggregates_size;)
+ {
+ auto & final_aggregate_column = final_aggregate_columns[aggregate_functions_destroy_index];
+ size_t offset = offsets_of_aggregate_states[aggregate_functions_destroy_index];
+
+ /** We increase aggregate_functions_destroy_index because, by contract, if insertResultIntoBatch
+ * throws an exception, it must also destroy all the states it is responsible for.
+ * The code then needs to continue destroying the states of the remaining aggregate functions, starting from the next index.
+ */
+ size_t destroy_index = aggregate_functions_destroy_index;
+ ++aggregate_functions_destroy_index;
+
+ /// For "-State" aggregate functions, ownership of the aggregate place is passed to the result column after insert.
+ bool is_state = aggregate_functions[destroy_index]->isState();
+ bool destroy_place_after_insert = !is_state;
+
+ aggregate_functions[destroy_index]->insertResultIntoBatch(0, places.size(), places.data(), offset, *final_aggregate_column, arena, destroy_place_after_insert);
+ }
+ }
+ catch (...)
+ {
+ exception = std::current_exception();
+ }
+
+ for (; aggregate_functions_destroy_index < params.aggregates_size; ++aggregate_functions_destroy_index)
+ {
+ size_t offset = offsets_of_aggregate_states[aggregate_functions_destroy_index];
+ aggregate_functions[aggregate_functions_destroy_index]->destroyBatch(0, places.size(), places.data(), offset);
+ }
+
+ if (exception)
+ std::rethrow_exception(exception);
+}
+
+template <typename Method, typename Table>
+void NO_INLINE Aggregator::convertToBlockImplNotFinal(
+ Method & method,
+ Table & data,
+ const MutableColumns & key_columns,
+ AggregateColumnsData & aggregate_columns) const
+{
+#if 0 // TODO: enable shuffle in AggregationMethodKeysFixed
+ std::vector<MutableColumn *> raw_key_columns;
+ raw_key_columns.reserve(key_columns.size());
+ for (auto & column : key_columns)
+ raw_key_columns.push_back(column.get());
+
+ //auto shuffled_key_sizes = method.shuffleKeyColumns(key_columns, key_sizes);
+ //const auto & key_sizes_ref = shuffled_key_sizes ? *shuffled_key_sizes : key_sizes;
+#endif
+ const auto & key_sizes_ref = key_sizes;
+
+ data.forEachValue([&](const auto & key, auto & mapped)
+ {
+ method.insertKeyIntoColumns(key, key_columns, key_sizes_ref);
+
+ /// Capacity was reserved beforehand, so Append should not need to allocate.
+ for (size_t i = 0; i < params.aggregates_size; ++i)
+ aggregate_columns[i]->Append(reinterpret_cast<uint64_t>(mapped + offsets_of_aggregate_states[i])).ok();
+
+ mapped = nullptr;
+ });
+}
+
+
+template <typename Filler>
+Block Aggregator::prepareBlockAndFill(
+ AggregatedDataVariants & /*data_variants*/,
+ bool final,
+ size_t rows,
+ Filler && filler) const
+{
+ Header header = getHeader(final);
+
+ std::vector<std::shared_ptr<MutableColumnAggregateFunction>> aggregate_columns(params.aggregates_size);
+ MutableColumns final_aggregate_columns(params.aggregates_size);
+ AggregateColumnsData aggregate_columns_data(params.aggregates_size);
+
+ for (size_t i = 0; i < params.aggregates_size; ++i)
+ {
+ if (!final)
+ {
+ const auto & aggregate_column_name = params.aggregates[i].column_name;
+ auto & type = header->GetFieldByName(aggregate_column_name)->type();
+ aggregate_columns[i] = std::make_shared<MutableColumnAggregateFunction>(
+ std::static_pointer_cast<DataTypeAggregateFunction>(type)); // TODO: set pool
+
+ /// The ColumnAggregateFunction column captures the shared ownership of the arena with the aggregate function states.
+ auto & column_aggregate_func = *aggregate_columns[i];
+#if 0
+ for (auto & pool : data_variants.aggregates_pools)
+ column_aggregate_func.addArena(pool);
+#endif
+ aggregate_columns_data[i] = &column_aggregate_func.getData();
+ aggregate_columns_data[i]->Reserve(rows).ok();
+ }
+ else
+ {
+ final_aggregate_columns[i] = createMutableColumn(aggregate_functions[i]->getReturnType());
+ final_aggregate_columns[i]->Reserve(rows).ok();
+ }
+ }
+
+ MutableColumns key_columns(params.keys_size);
+ for (size_t i = 0; i < params.keys_size; ++i)
+ {
+ key_columns[i] = createMutableColumn(header->field(i)->type());
+ key_columns[i]->Reserve(rows).ok();
+ }
+
+ filler(key_columns, aggregate_columns_data, final_aggregate_columns, final);
+
+ Columns columns(params.keys_size + params.aggregates_size);
+
+ for (size_t i = 0; i < params.keys_size; ++i)
+ columns[i] = *key_columns[i]->Finish();
+
+ for (size_t i = 0; i < params.aggregates_size; ++i)
+ {
+ int pos = header->GetFieldIndex(params.aggregates[i].column_name);
+ if (final)
+ columns[pos] = *final_aggregate_columns[i]->Finish();
+ else
+ columns[pos] = *aggregate_columns[i]->Finish();
+ }
+
+ // TODO: check that rows matches each column's length()
+ return arrow::RecordBatch::Make(header, rows, columns);
+}
+
+Block Aggregator::prepareBlockAndFillWithoutKey(AggregatedDataVariants & data_variants, bool final, bool /*is_overflows*/) const
+{
+ size_t rows = 1;
+
+ auto filler = [&data_variants, this](
+ MutableColumns & key_columns,
+ AggregateColumnsData & aggregate_columns,
+ MutableColumns & final_aggregate_columns,
+ bool final_)
+ {
+ if (data_variants.type == AggregatedDataVariants::Type::without_key || params.overflow_row)
+ {
+ AggregatedDataWithoutKey & data = data_variants.without_key;
+
+ if (!data)
+ throw Exception("Wrong data variant passed.");
+
+ if (!final_)
+ {
+ for (size_t i = 0; i < params.aggregates_size; ++i)
+ aggregate_columns[i]->Append(reinterpret_cast<uint64_t>(data + offsets_of_aggregate_states[i])).ok();
+ data = nullptr;
+ }
+ else
+ {
+ /// Always single-threaded. It's safe to pass the current arena from 'aggregates_pool'.
+ insertAggregatesIntoColumns(data, final_aggregate_columns, data_variants.aggregates_pool);
+ }
+
+ if (params.overflow_row)
+ for (size_t i = 0; i < params.keys_size; ++i)
+ key_columns[i]->AppendEmptyValue().ok(); // FIXME: or AppendNull() ???
+ }
+ };
+
+ Block block = prepareBlockAndFill(data_variants, final, rows, filler);
+#if 0
+ if (is_overflows)
+ block.info.is_overflows = true;
+#endif
+ if (final)
+ destroyWithoutKey(data_variants);
+
+ return block;
+}
+
+Block Aggregator::prepareBlockAndFillSingleLevel(AggregatedDataVariants & data_variants, bool final) const
+{
+ size_t rows = data_variants.sizeWithoutOverflowRow();
+
+ auto filler = [&data_variants, this](
+ MutableColumns & key_columns,
+ AggregateColumnsData & aggregate_columns,
+ MutableColumns & final_aggregate_columns,
+ bool final_)
+ {
+ #define M(NAME) \
+ else if (data_variants.type == AggregatedDataVariants::Type::NAME) \
+ convertToBlockImpl(*data_variants.NAME, data_variants.NAME->data, \
+ key_columns, aggregate_columns, final_aggregate_columns, data_variants.aggregates_pool, final_);
+
+ if (false) {} // NOLINT
+ APPLY_FOR_AGGREGATED_VARIANTS(M)
+ #undef M
+ else
+ throw Exception("Unknown aggregated data variant.");
+ };
+
+ return prepareBlockAndFill(data_variants, final, rows, filler);
+}
+
+
+BlocksList Aggregator::convertToBlocks(AggregatedDataVariants & data_variants, bool final) const
+{
+ BlocksList blocks;
+
+ /// In what data structure is the data aggregated?
+ if (data_variants.empty())
+ return blocks;
+
+ if (data_variants.without_key)
+ blocks.emplace_back(prepareBlockAndFillWithoutKey(
+ data_variants, final, data_variants.type != AggregatedDataVariants::Type::without_key));
+
+ if (data_variants.type != AggregatedDataVariants::Type::without_key)
+ {
+ blocks.emplace_back(prepareBlockAndFillSingleLevel(data_variants, final));
+ }
+
+ if (!final)
+ {
+ /// data_variants will not destroy the states of aggregate functions in the destructor.
+ /// Now ColumnAggregateFunction owns the states.
+ data_variants.aggregator = nullptr;
+ }
+
+ size_t rows = 0;
+ //size_t bytes = 0;
+
+ for (const auto & block : blocks)
+ {
+ rows += block->num_rows();
+ //bytes += block.bytes();
+ }
+
+ return blocks;
+}
+
+
+template <typename Method, typename Table>
+void NO_INLINE Aggregator::mergeDataImpl(
+ Table & table_dst,
+ Table & table_src,
+ Arena * arena) const
+{
+ table_src.mergeToViaEmplace(table_dst, [&](AggregateDataPtr & __restrict dst, AggregateDataPtr & __restrict src, bool inserted)
+ {
+ if (!inserted)
+ {
+ {
+ for (size_t i = 0; i < params.aggregates_size; ++i)
+ aggregate_functions[i]->merge(dst + offsets_of_aggregate_states[i], src + offsets_of_aggregate_states[i], arena);
+
+ for (size_t i = 0; i < params.aggregates_size; ++i)
+ aggregate_functions[i]->destroy(src + offsets_of_aggregate_states[i]);
+ }
+ }
+ else
+ {
+ dst = src;
+ }
+
+ src = nullptr;
+ });
+
+ table_src.clearAndShrink();
+}
+
+
+template <typename Method, typename Table>
+void NO_INLINE Aggregator::mergeDataNoMoreKeysImpl(
+ Table & table_dst,
+ AggregatedDataWithoutKey & overflows,
+ Table & table_src,
+ Arena * arena) const
+{
+ table_src.mergeToViaFind(table_dst, [&](AggregateDataPtr dst, AggregateDataPtr & src, bool found)
+ {
+ AggregateDataPtr res_data = found ? dst : overflows;
+
+ for (size_t i = 0; i < params.aggregates_size; ++i)
+ aggregate_functions[i]->merge(
+ res_data + offsets_of_aggregate_states[i],
+ src + offsets_of_aggregate_states[i],
+ arena);
+
+ for (size_t i = 0; i < params.aggregates_size; ++i)
+ aggregate_functions[i]->destroy(src + offsets_of_aggregate_states[i]);
+
+ src = nullptr;
+ });
+ table_src.clearAndShrink();
+}
+
+template <typename Method, typename Table>
+void NO_INLINE Aggregator::mergeDataOnlyExistingKeysImpl(
+ Table & table_dst,
+ Table & table_src,
+ Arena * arena) const
+{
+ table_src.mergeToViaFind(table_dst,
+ [&](AggregateDataPtr dst, AggregateDataPtr & src, bool found)
+ {
+ if (!found)
+ return;
+
+ for (size_t i = 0; i < params.aggregates_size; ++i)
+ aggregate_functions[i]->merge(
+ dst + offsets_of_aggregate_states[i],
+ src + offsets_of_aggregate_states[i],
+ arena);
+
+ for (size_t i = 0; i < params.aggregates_size; ++i)
+ aggregate_functions[i]->destroy(src + offsets_of_aggregate_states[i]);
+
+ src = nullptr;
+ });
+ table_src.clearAndShrink();
+}
+
+
+void NO_INLINE Aggregator::mergeWithoutKeyDataImpl(
+ ManyAggregatedDataVariants & non_empty_data) const
+{
+ AggregatedDataVariantsPtr & res = non_empty_data[0];
+
+ /// We merge all aggregation results into the first one.
+ for (size_t result_num = 1, size = non_empty_data.size(); result_num < size; ++result_num)
+ {
+ AggregatedDataWithoutKey & res_data = res->without_key;
+ AggregatedDataWithoutKey & current_data = non_empty_data[result_num]->without_key;
+
+ for (size_t i = 0; i < params.aggregates_size; ++i)
+ aggregate_functions[i]->merge(res_data + offsets_of_aggregate_states[i], current_data + offsets_of_aggregate_states[i], res->aggregates_pool);
+
+ for (size_t i = 0; i < params.aggregates_size; ++i)
+ aggregate_functions[i]->destroy(current_data + offsets_of_aggregate_states[i]);
+
+ current_data = nullptr;
+ }
+}
+
+
+template <typename Method>
+void NO_INLINE Aggregator::mergeSingleLevelDataImpl(
+ ManyAggregatedDataVariants & non_empty_data) const
+{
+ AggregatedDataVariantsPtr & res = non_empty_data[0];
+ bool no_more_keys = false;
+
+ /// We merge all aggregation results into the first one.
+ for (size_t result_num = 1, size = non_empty_data.size(); result_num < size; ++result_num)
+ {
+ if (!checkLimits(res->sizeWithoutOverflowRow(), no_more_keys))
+ break;
+
+ AggregatedDataVariants & current = *non_empty_data[result_num];
+
+ if (!no_more_keys)
+ {
+ {
+ mergeDataImpl<Method>(
+ getDataVariant<Method>(*res).data,
+ getDataVariant<Method>(current).data,
+ res->aggregates_pool);
+ }
+ }
+ else if (res->without_key)
+ {
+ mergeDataNoMoreKeysImpl<Method>(
+ getDataVariant<Method>(*res).data,
+ res->without_key,
+ getDataVariant<Method>(current).data,
+ res->aggregates_pool);
+ }
+ else
+ {
+ mergeDataOnlyExistingKeysImpl<Method>(
+ getDataVariant<Method>(*res).data,
+ getDataVariant<Method>(current).data,
+ res->aggregates_pool);
+ }
+
+ /// `current` will not destroy the states of aggregate functions in the destructor
+ current.aggregator = nullptr;
+ }
+}
+
+#define M(NAME) \
+ template void NO_INLINE Aggregator::mergeSingleLevelDataImpl<decltype(AggregatedDataVariants::NAME)::element_type>( \
+ ManyAggregatedDataVariants & non_empty_data) const;
+ APPLY_FOR_AGGREGATED_VARIANTS(M)
+#undef M
+
+template <typename Method>
+void NO_INLINE Aggregator::mergeBucketImpl(
+ ManyAggregatedDataVariants & data, Int32 bucket, Arena * arena, std::atomic<bool> * is_cancelled) const
+{
+ /// We merge all aggregation results into the first one.
+ AggregatedDataVariantsPtr & res = data[0];
+ for (size_t result_num = 1, size = data.size(); result_num < size; ++result_num)
+ {
+ if (is_cancelled && is_cancelled->load(std::memory_order_seq_cst))
+ return;
+
+ AggregatedDataVariants & current = *data[result_num];
+
+ {
+ mergeDataImpl<Method>(
+ getDataVariant<Method>(*res).data.impls[bucket],
+ getDataVariant<Method>(current).data.impls[bucket],
+ arena);
+ }
+ }
+}
+
+ManyAggregatedDataVariants Aggregator::prepareVariantsToMerge(ManyAggregatedDataVariants & data_variants) const
+{
+ if (data_variants.empty())
+ throw Exception("Empty data passed to Aggregator::mergeAndConvertToBlocks.");
+
+ ManyAggregatedDataVariants non_empty_data;
+ non_empty_data.reserve(data_variants.size());
+ for (auto & data : data_variants)
+ if (!data->empty())
+ non_empty_data.push_back(data);
+
+ if (non_empty_data.empty())
+ return {};
+
+ if (non_empty_data.size() > 1)
+ {
+ /// Sort the states in descending order so that the merge is more efficient (since all states are merged into the first).
+ std::sort(non_empty_data.begin(), non_empty_data.end(),
+ [](const AggregatedDataVariantsPtr & lhs, const AggregatedDataVariantsPtr & rhs)
+ {
+ return lhs->sizeWithoutOverflowRow() > rhs->sizeWithoutOverflowRow();
+ });
+ }
+
+ AggregatedDataVariantsPtr & first = non_empty_data[0];
+
+ for (size_t i = 1, size = non_empty_data.size(); i < size; ++i)
+ {
+ if (first->type != non_empty_data[i]->type)
+ throw Exception("Cannot merge different aggregated data variants.");
+
+ /** Elements from the remaining sets can be moved to the first data set.
+ * Therefore, it must own all the arenas of all other sets.
+ */
+ first->aggregates_pools.insert(first->aggregates_pools.end(),
+ non_empty_data[i]->aggregates_pools.begin(), non_empty_data[i]->aggregates_pools.end());
+ }
+
+ return non_empty_data;
+}
+
+template <bool no_more_keys, typename Method, typename Table>
+void NO_INLINE Aggregator::mergeStreamsImplCase(
+ Block & block,
+ Arena * aggregates_pool,
+ Method & method [[maybe_unused]],
+ Table & data,
+ AggregateDataPtr overflow_row) const
+{
+ ColumnRawPtrs key_columns(params.keys_size);
+ std::vector<const ColumnAggregateFunction *> aggregate_columns(params.aggregates_size);
+
+ /// Remember the columns we will work with
+ for (size_t i = 0; i < params.keys_size; ++i)
+ key_columns[i] = block->column(i).get();
+
+ for (size_t i = 0; i < params.aggregates_size; ++i)
+ {
+ const auto & aggregate_column_name = params.aggregates[i].column_name;
+ aggregate_columns[i] = &assert_cast<const ColumnAggregateFunction &>(*block->GetColumnByName(aggregate_column_name));
+ }
+
+ typename Method::State state(key_columns, key_sizes, aggregation_state_cache);
+
+ /// For all rows.
+ size_t rows = block->num_rows();
+ std::unique_ptr<AggregateDataPtr[]> places(new AggregateDataPtr[rows]);
+
+ for (size_t i = 0; i < rows; ++i)
+ {
+ AggregateDataPtr aggregate_data = nullptr;
+
+ if (!no_more_keys)
+ {
+ auto emplace_result = state.emplaceKey(data, i, *aggregates_pool);
+ if (emplace_result.isInserted())
+ {
+ emplace_result.setMapped(nullptr);
+
+ aggregate_data = aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states);
+ createAggregateStates(aggregate_data);
+
+ emplace_result.setMapped(aggregate_data);
+ }
+ else
+ aggregate_data = emplace_result.getMapped();
+ }
+ else
+ {
+ auto find_result = state.findKey(data, i, *aggregates_pool);
+ if (find_result.isFound())
+ aggregate_data = find_result.getMapped();
+ }
+
+ /// aggregate_data == nullptr means that the new key did not fit in the hash table because of no_more_keys.
+
+ AggregateDataPtr value = aggregate_data ? aggregate_data : overflow_row;
+ places[i] = value;
+ }
+
+ for (size_t j = 0; j < params.aggregates_size; ++j)
+ {
+ /// Merge state of aggregate functions.
+ aggregate_functions[j]->mergeBatch(
+ 0, rows,
+ places.get(), offsets_of_aggregate_states[j],
+ aggregate_columns[j]->rawData(),
+ aggregates_pool);
+ }
+
+ /// Release memory early.
+ block.reset();
+}
+
+template <typename Method, typename Table>
+void NO_INLINE Aggregator::mergeStreamsImpl(
+ Block & block,
+ Arena * aggregates_pool,
+ Method & method,
+ Table & data,
+ AggregateDataPtr overflow_row,
+ bool no_more_keys) const
+{
+ if (!no_more_keys)
+ mergeStreamsImplCase<false>(block, aggregates_pool, method, data, overflow_row);
+ else
+ mergeStreamsImplCase<true>(block, aggregates_pool, method, data, overflow_row);
+}
+
+
+void NO_INLINE Aggregator::mergeWithoutKeyStreamsImpl(
+ Block & block,
+ AggregatedDataVariants & result) const
+{
+ std::vector<const ColumnAggregateFunction *> aggregate_columns(params.aggregates_size);
+
+ /// Remember the columns we will work with
+ for (size_t i = 0; i < params.aggregates_size; ++i)
+ {
+ const auto & aggregate_column_name = params.aggregates[i].column_name;
+ aggregate_columns[i] = &assert_cast<const ColumnAggregateFunction &>(*block->GetColumnByName(aggregate_column_name));
+ }
+
+ AggregatedDataWithoutKey & res = result.without_key;
+ if (!res)
+ {
+ AggregateDataPtr place = result.aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states);
+ createAggregateStates(place);
+ res = place;
+ }
+
+ for (size_t row = 0, rows = block->num_rows(); row < rows; ++row)
+ {
+ /// Adding values
+ for (size_t i = 0; i < params.aggregates_size; ++i)
+ aggregate_functions[i]->merge(res + offsets_of_aggregate_states[i],
+ aggregate_columns[i]->rawData()[row], result.aggregates_pool);
+ }
+
+ /// Release memory early.
+ block.reset();
+}
+
+bool Aggregator::mergeOnBlock(Block block, AggregatedDataVariants & result, bool & no_more_keys) const
+{
+ /// `result` will destroy the states of aggregate functions in the destructor
+ result.aggregator = this;
+
+ /// How to perform the aggregation?
+ if (result.empty())
+ {
+ result.init(method_chosen);
+ result.keys_size = params.keys_size;
+ result.key_sizes = key_sizes;
+ }
+
+ if (result.type == AggregatedDataVariants::Type::without_key /*|| block.info.is_overflows*/)
+ mergeWithoutKeyStreamsImpl(block, result);
+
+#define M(NAME) \
+ else if (result.type == AggregatedDataVariants::Type::NAME) \
+ mergeStreamsImpl(block, result.aggregates_pool, *result.NAME, result.NAME->data, result.without_key, no_more_keys);
+
+ APPLY_FOR_AGGREGATED_VARIANTS(M)
+#undef M
+ else if (result.type != AggregatedDataVariants::Type::without_key)
+ throw Exception("Unknown aggregated data variant.");
+
+ size_t result_size = result.sizeWithoutOverflowRow();
+
+ /// Checking the constraints.
+ if (!checkLimits(result_size, no_more_keys))
+ return false;
+
+ return true;
+}
+
+void Aggregator::mergeStream(const BlockInputStreamPtr & stream, AggregatedDataVariants & result)
+{
+#if 0
+ if (isCancelled())
+ return;
+
+ /** If the remote servers used a two-level aggregation method,
+ * then the blocks will contain information about the bucket number.
+ * The calculations can then be parallelized by bucket.
+ * We split the blocks according to the bucket numbers indicated in them.
+ */
+ BucketToBlocks bucket_to_blocks;
+
+ while (Block block = stream->read())
+ {
+ if (isCancelled())
+ return;
+
+ bucket_to_blocks[block.info.bucket_num].emplace_back(std::move(block));
+ }
+
+ mergeBlocks(bucket_to_blocks, result);
+#else
+ BlocksList blocks;
+
+ while (Block block = stream->read())
+ blocks.emplace_back(std::move(block));
+
+ BucketToBlocks bucket_to_blocks;
+ bucket_to_blocks.emplace(-1, std::move(blocks));
+ mergeBlocks(std::move(bucket_to_blocks), result);
+#endif
+}
+
+void Aggregator::mergeBlocks(BucketToBlocks && bucket_to_blocks, AggregatedDataVariants & result)
+{
+ if (bucket_to_blocks.empty())
+ return;
+
+ /** `minus one` means the absence of information about the bucket
+ * - in the case of single-level aggregation, as well as for blocks with "overflowing" values.
+ * If there is at least one block with a bucket number greater than or equal to zero, then there was a two-level aggregation.
+ */
+ //auto max_bucket = bucket_to_blocks.rbegin()->first;
+
+ /// result will destroy the states of aggregate functions in the destructor
+ result.aggregator = this;
+
+ result.init(method_chosen);
+ result.keys_size = params.keys_size;
+ result.key_sizes = key_sizes;
+
+ bool has_blocks_with_unknown_bucket = bucket_to_blocks.count(-1);
+
+ if (has_blocks_with_unknown_bucket)
+ {
+ bool no_more_keys = false;
+
+ BlocksList & blocks = bucket_to_blocks[-1];
+ for (Block & block : blocks)
+ {
+ if (!checkLimits(result.sizeWithoutOverflowRow(), no_more_keys))
+ break;
+
+ if (result.type == AggregatedDataVariants::Type::without_key /*|| block.info.is_overflows*/)
+ mergeWithoutKeyStreamsImpl(block, result);
+
+ #define M(NAME) \
+ else if (result.type == AggregatedDataVariants::Type::NAME) \
+ mergeStreamsImpl(block, result.aggregates_pool, *result.NAME, result.NAME->data, result.without_key, no_more_keys);
+
+ APPLY_FOR_AGGREGATED_VARIANTS(M)
+ #undef M
+ else if (result.type != AggregatedDataVariants::Type::without_key)
+ throw Exception("Unknown aggregated data variant.");
+ }
+ }
+}
+
+
+Block Aggregator::mergeBlocks(BlocksList & blocks, bool final)
+{
+ if (blocks.empty())
+ return {};
+
+#if 0
+ auto bucket_num = blocks.front().info.bucket_num;
+ bool is_overflows = blocks.front().info.is_overflows;
+#endif
+
+ /** If possible, change 'method' to some_hash64. Otherwise, leave it as is.
+ * A better hash function is needed because, during external aggregation,
+ * we may merge partitions of data with a total number of keys far greater than 4 billion.
+ */
+ auto merge_method = method_chosen;
+
+#define APPLY_FOR_VARIANTS_THAT_MAY_USE_BETTER_HASH_FUNCTION(M) \
+ M(key64) \
+ M(key_string) \
+ M(key_fixed_string) \
+ M(keys128) \
+ M(keys256) \
+ M(serialized) \
+
+#define M(NAME) \
+ if (merge_method == AggregatedDataVariants::Type::NAME) \
+ merge_method = AggregatedDataVariants::Type::NAME ## _hash64; \
+
+ APPLY_FOR_VARIANTS_THAT_MAY_USE_BETTER_HASH_FUNCTION(M)
+#undef M
+
+#undef APPLY_FOR_VARIANTS_THAT_MAY_USE_BETTER_HASH_FUNCTION
+
+ /// Temporary data for aggregation.
+ AggregatedDataVariants result;
+
+ /// result will destroy the states of aggregate functions in the destructor
+ result.aggregator = this;
+
+ result.init(merge_method);
+ result.keys_size = params.keys_size;
+ result.key_sizes = key_sizes;
+
+ for (Block & block : blocks)
+ {
+#if 0
+ if (bucket_num >= 0 /*&& block.info.bucket_num != bucket_num*/)
+ bucket_num = -1;
+#endif
+ if (result.type == AggregatedDataVariants::Type::without_key /*|| is_overflows*/)
+ mergeWithoutKeyStreamsImpl(block, result);
+
+ #define M(NAME) \
+ else if (result.type == AggregatedDataVariants::Type::NAME) \
+ mergeStreamsImpl(block, result.aggregates_pool, *result.NAME, result.NAME->data, nullptr, false);
+
+ APPLY_FOR_AGGREGATED_VARIANTS(M)
+ #undef M
+ else if (result.type != AggregatedDataVariants::Type::without_key)
+ throw Exception("Unknown aggregated data variant.");
+ }
+
+ Block block;
+ if (result.type == AggregatedDataVariants::Type::without_key /*|| is_overflows*/)
+ block = prepareBlockAndFillWithoutKey(result, final /*, is_overflows*/);
+ else
+ block = prepareBlockAndFillSingleLevel(result, final);
+ /// NOTE: two-level data is not possible here - chooseAggregationMethod chooses only among single-level methods.
+
+ if (!final)
+ {
+ /// Pass ownership of aggregate function states from result to ColumnAggregateFunction objects in the resulting block.
+ result.aggregator = nullptr;
+ }
+
+ //block.info.bucket_num = bucket_num;
+ return block;
+}
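+
+/* Hedged usage sketch (illustrative, not part of the original code): merging a list of
+ * partially aggregated blocks produced by independent aggregations into one final block.
+ * `aggregator` and `partial_blocks` are hypothetical objects created elsewhere.
+ *
+ *     Block merged = aggregator.mergeBlocks(partial_blocks, true);  // final = true finalizes the states
+ */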
+
+
+template <typename Method, typename Table>
+void NO_INLINE Aggregator::destroyImpl(Table & table) const
+{
+ table.forEachMapped([&](AggregateDataPtr & data)
+ {
+ /** If an exception (usually a lack of memory, thrown by the MemoryTracker) arose
+ * after inserting the key into a hash table, but before creating all states of aggregate functions,
+ * then data will be equal to nullptr.
+ */
+ if (nullptr == data)
+ return;
+
+ for (size_t i = 0; i < params.aggregates_size; ++i)
+ aggregate_functions[i]->destroy(data + offsets_of_aggregate_states[i]);
+
+ data = nullptr;
+ });
+}
+
+
+void Aggregator::destroyWithoutKey(AggregatedDataVariants & result) const
+{
+ AggregatedDataWithoutKey & res_data = result.without_key;
+
+ if (nullptr != res_data)
+ {
+ for (size_t i = 0; i < params.aggregates_size; ++i)
+ aggregate_functions[i]->destroy(res_data + offsets_of_aggregate_states[i]);
+
+ res_data = nullptr;
+ }
+}
+
+
+void Aggregator::destroyAllAggregateStates(AggregatedDataVariants & result) const
+{
+ if (result.empty())
+ return;
+
+ /// In what data structure is the data aggregated?
+ if (result.type == AggregatedDataVariants::Type::without_key || params.overflow_row)
+ destroyWithoutKey(result);
+
+#define M(NAME) \
+ else if (result.type == AggregatedDataVariants::Type::NAME) \
+ destroyImpl<decltype(result.NAME)::element_type>(result.NAME->data);
+
+ if (false) {} // NOLINT
+ APPLY_FOR_AGGREGATED_VARIANTS(M)
+#undef M
+ else if (result.type != AggregatedDataVariants::Type::without_key)
+ throw Exception("Unknown aggregated data variant.");
+}
+
+}
diff --git a/ydb/library/arrow_clickhouse/Aggregator.h b/ydb/library/arrow_clickhouse/Aggregator.h
new file mode 100644
index 00000000000..a83baf531f1
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/Aggregator.h
@@ -0,0 +1,923 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#pragma once
+#include "arrow_clickhouse_types.h"
+
+#include <memory>
+#include <common/StringRef.h>
+
+#include "AggregationCommon.h"
+#include <Common/Arena.h>
+#include <Common/HashTable/FixedHashMap.h>
+#include <Common/HashTable/HashMap.h>
+#include <Common/HashTable/StringHashMap.h>
+#include <Columns/ColumnsHashing.h>
+#include <Columns/ColumnAggregateFunction.h>
+#include <DataStreams/IBlockStream_fwd.h>
+
+namespace CH
+{
+
+struct AggregateDescription
+{
+ AggregateFunctionPtr function;
+ Array parameters; /// Parameters of the (parametric) aggregate function.
+ ColumnNumbers arguments;
+ Names argument_names; /// used if no `arguments` are specified.
+ String column_name; /// What name to use for a column with aggregate function values
+};
+
+using AggregateDescriptions = std::vector<AggregateDescription>;
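+
+/* Hedged sketch (illustrative only): filling one AggregateDescription. The function pointer,
+ * column index and column name below are hypothetical placeholders.
+ *
+ *     AggregateDescription desc;
+ *     desc.function = sum_function;      // an AggregateFunctionPtr created elsewhere
+ *     desc.arguments = {1};              // aggregate over the second source column
+ *     desc.column_name = "sum_value";    // name of the resulting column
+ *     AggregateDescriptions descriptions{desc};
+ */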
+
+/** Different data structures that can be used for aggregation
+ * For efficiency, the aggregation data itself is put into the pool.
+ * Data and pool ownership (states of aggregate functions)
+ * is acquired later - in `convertToBlocks` function, by the ColumnAggregateFunction object.
+ *
+ * Most data structures exist in two versions: normal and two-level (TwoLevel).
+ * A two-level hash table works a little slower with a small number of different keys,
+ * but with a large number of different keys it scales better, because it allows
+ * some operations (merging, post-processing) to be parallelized in a natural way.
+ *
+ * To ensure efficient work over a wide range of conditions,
+ * single-level hash tables are used first,
+ * and when the number of different keys is large enough,
+ * they are converted to two-level ones.
+ *
+ * PS. There are many different approaches to the effective implementation of parallel and distributed aggregation,
+ * best suited for different cases, and this approach is just one of them, chosen for a combination of reasons.
+ */
+
+using AggregateDataPtr = char *;
+using AggregatedDataWithoutKey = AggregateDataPtr;
+
+using AggregatedDataWithUInt8Key = FixedImplicitZeroHashMapWithCalculatedSize<UInt8, AggregateDataPtr>;
+using AggregatedDataWithUInt16Key = FixedImplicitZeroHashMap<UInt16, AggregateDataPtr>;
+
+using AggregatedDataWithUInt32Key = HashMap<UInt32, AggregateDataPtr, HashCRC32<UInt32>>;
+using AggregatedDataWithUInt64Key = HashMap<UInt64, AggregateDataPtr, HashCRC32<UInt64>>;
+
+using AggregatedDataWithShortStringKey = StringHashMap<AggregateDataPtr>;
+
+using AggregatedDataWithStringKey = HashMapWithSavedHash<StringRef, AggregateDataPtr>;
+
+using AggregatedDataWithKeys128 = HashMap<UInt128, AggregateDataPtr, UInt128HashCRC32>;
+using AggregatedDataWithKeys256 = HashMap<UInt256, AggregateDataPtr, UInt256HashCRC32>;
+
+
+/** Variants with a better hash function, using more than 32 bits for the hash.
+ * Used for the merging phase of external aggregation, where the number of keys may be far greater than 4 billion,
+ * but we keep in memory and merge only a sub-partition of them at a time.
+ * TODO We need to switch to a better hash function not only for external aggregation,
+ * but also for huge aggregation results on machines with terabytes of RAM.
+ */
+
+using AggregatedDataWithUInt64KeyHash64 = HashMap<UInt64, AggregateDataPtr, DefaultHash<UInt64>>;
+using AggregatedDataWithStringKeyHash64 = HashMapWithSavedHash<StringRef, AggregateDataPtr, StringRefHash>;
+using AggregatedDataWithKeys128Hash64 = HashMap<UInt128, AggregateDataPtr, UInt128Hash>;
+using AggregatedDataWithKeys256Hash64 = HashMap<UInt256, AggregateDataPtr, UInt256Hash>;
+
+template <typename Base>
+struct AggregationDataWithNullKey : public Base
+{
+ using Base::Base;
+
+ bool & hasNullKeyData() { return has_null_key_data; }
+ AggregateDataPtr & getNullKeyData() { return null_key_data; }
+ bool hasNullKeyData() const { return has_null_key_data; }
+ const AggregateDataPtr & getNullKeyData() const { return null_key_data; }
+ size_t size() const { return Base::size() + (has_null_key_data ? 1 : 0); }
+ bool empty() const { return Base::empty() && !has_null_key_data; }
+ void clear()
+ {
+ Base::clear();
+ has_null_key_data = false;
+ }
+ void clearAndShrink()
+ {
+ Base::clearAndShrink();
+ has_null_key_data = false;
+ }
+
+private:
+ bool has_null_key_data = false;
+ AggregateDataPtr null_key_data = nullptr;
+};
+
+template <typename ... Types>
+using HashTableWithNullKey = AggregationDataWithNullKey<HashMapTable<Types ...>>;
+template <typename ... Types>
+using StringHashTableWithNullKey = AggregationDataWithNullKey<StringHashMap<Types ...>>;
+
+using AggregatedDataWithNullableUInt8Key = AggregationDataWithNullKey<AggregatedDataWithUInt8Key>;
+using AggregatedDataWithNullableUInt16Key = AggregationDataWithNullKey<AggregatedDataWithUInt16Key>;
+
+using AggregatedDataWithNullableUInt64Key = AggregationDataWithNullKey<AggregatedDataWithUInt64Key>;
+using AggregatedDataWithNullableStringKey = AggregationDataWithNullKey<AggregatedDataWithStringKey>;
+
+
+/// For the case where there is one numeric key.
+/// FieldType is UInt8/16/32/64 for any type with corresponding bit width.
+template <typename FieldType, typename TData,
+ bool consecutive_keys_optimization = true>
+struct AggregationMethodOneNumber
+{
+ using Data = TData;
+ using Key = typename Data::key_type;
+ using Mapped = typename Data::mapped_type;
+
+ Data data;
+
+ AggregationMethodOneNumber() = default;
+
+ template <typename Other>
+ AggregationMethodOneNumber(const Other & other) : data(other.data) {}
+
+ /// To use one `Method` in different threads, use different `State`.
+ using State = ColumnsHashing::HashMethodOneNumber<typename Data::value_type,
+ Mapped, FieldType, consecutive_keys_optimization>;
+
+ /// Shuffle key columns before `insertKeyIntoColumns` call if needed.
+ std::optional<Sizes> shuffleKeyColumns(std::vector<IColumn *> &, const Sizes &) { return {}; }
+
+ // Insert the key from the hash table into columns.
+ template <typename ColPtr>
+ static void insertKeyIntoColumns(const Key & key, const std::vector<ColPtr> & key_columns, const Sizes & /*key_sizes*/)
+ {
+ insertNumber(*key_columns[0], key);
+ }
+};
+
+
+/// For the case where there is one string key.
+template <typename TData>
+struct AggregationMethodString
+{
+ using Data = TData;
+ using Key = typename Data::key_type;
+ using Mapped = typename Data::mapped_type;
+
+ Data data;
+
+ AggregationMethodString() = default;
+
+ template <typename Other>
+ AggregationMethodString(const Other & other) : data(other.data) {}
+
+ using State = ColumnsHashing::HashMethodString<typename Data::value_type, Mapped>;
+
+ std::optional<Sizes> shuffleKeyColumns(std::vector<IColumn *> &, const Sizes &) { return {}; }
+
+ template <typename ColPtr>
+ static void insertKeyIntoColumns(const StringRef & key, const std::vector<ColPtr> & key_columns, const Sizes &)
+ {
+ insertString(*key_columns[0], key);
+ }
+};
+
+
+/// Same as above but without cache
+template <typename TData>
+struct AggregationMethodStringNoCache
+{
+ using Data = TData;
+ using Key = typename Data::key_type;
+ using Mapped = typename Data::mapped_type;
+
+ Data data;
+
+ AggregationMethodStringNoCache() = default;
+
+ template <typename Other>
+ AggregationMethodStringNoCache(const Other & other) : data(other.data) {}
+
+ using State = ColumnsHashing::HashMethodString<typename Data::value_type, Mapped, true, false>;
+
+ std::optional<Sizes> shuffleKeyColumns(std::vector<IColumn *> &, const Sizes &) { return {}; }
+
+ template <typename ColPtr>
+ static void insertKeyIntoColumns(const StringRef & key, const std::vector<ColPtr> & key_columns, const Sizes &)
+ {
+ insertString(*key_columns[0], key);
+ }
+};
+
+
+/// For the case where there is one fixed-length string key.
+template <typename TData>
+struct AggregationMethodFixedString
+{
+ using Data = TData;
+ using Key = typename Data::key_type;
+ using Mapped = typename Data::mapped_type;
+
+ Data data;
+
+ AggregationMethodFixedString() = default;
+
+ template <typename Other>
+ AggregationMethodFixedString(const Other & other) : data(other.data) {}
+
+ using State = ColumnsHashing::HashMethodFixedString<typename Data::value_type, Mapped>;
+
+ std::optional<Sizes> shuffleKeyColumns(std::vector<IColumn *> &, const Sizes &) { return {}; }
+
+ template <typename ColPtr>
+ static void insertKeyIntoColumns(const StringRef & key, const std::vector<ColPtr> & key_columns, const Sizes &)
+ {
+ insertFixedString(*key_columns[0], key);
+ }
+};
+
+/// Same as above but without cache
+template <typename TData>
+struct AggregationMethodFixedStringNoCache
+{
+ using Data = TData;
+ using Key = typename Data::key_type;
+ using Mapped = typename Data::mapped_type;
+
+ Data data;
+
+ AggregationMethodFixedStringNoCache() = default;
+
+ template <typename Other>
+ AggregationMethodFixedStringNoCache(const Other & other) : data(other.data) {}
+
+ using State = ColumnsHashing::HashMethodFixedString<typename Data::value_type, Mapped, true, false>;
+
+ std::optional<Sizes> shuffleKeyColumns(std::vector<IColumn *> &, const Sizes &) { return {}; }
+
+ template <typename ColPtr>
+ static void insertKeyIntoColumns(const StringRef & key, const std::vector<ColPtr> & key_columns, const Sizes &)
+ {
+ insertFixedString(*key_columns[0], key);
+ }
+};
+
+
+
+/// For the case where all keys are of fixed length, and they fit in N (for example, 128) bits.
+template <typename TData, bool has_nullable_keys_ = false, bool use_cache = true>
+struct AggregationMethodKeysFixed
+{
+ using Data = TData;
+ using Key = typename Data::key_type;
+ using Mapped = typename Data::mapped_type;
+ static constexpr bool has_nullable_keys = has_nullable_keys_;
+
+ Data data;
+
+ AggregationMethodKeysFixed() = default;
+
+ template <typename Other>
+ AggregationMethodKeysFixed(const Other & other) : data(other.data) {}
+
+ using State = ColumnsHashing::HashMethodKeysFixed<
+ typename Data::value_type,
+ Key,
+ Mapped,
+ has_nullable_keys,
+ false,
+ use_cache>;
+#if 0
+ std::optional<Sizes> shuffleKeyColumns(std::vector<IColumn *> & key_columns, const Sizes & key_sizes)
+ {
+ return State::shuffleKeyColumns(key_columns, key_sizes);
+ }
+#endif
+ template <typename ColPtr>
+ static void insertKeyIntoColumns(const Key & key, const std::vector<ColPtr> & key_columns, const Sizes & key_sizes)
+ {
+ size_t keys_count = key_columns.size();
+
+ static constexpr auto bitmap_size = has_nullable_keys ? std::tuple_size<KeysNullMap<Key>>::value : 0;
+ /// In any hash key value, column values to be read start just after the bitmap, if it exists.
+ const char * key_data = reinterpret_cast<const char *>(&key) + bitmap_size;
+
+ for (size_t i = 0; i < keys_count; ++i)
+ {
+ auto & observed_column = *key_columns[i];
+
+ if constexpr (has_nullable_keys)
+ {
+ const char * null_bitmap = reinterpret_cast<const char *>(&key);
+ size_t bucket = i / 8;
+ size_t offset = i % 8;
+ bool is_null = (null_bitmap[bucket] >> offset) & 1;
+
+ if (is_null)
+ {
+ observed_column.AppendNull().ok();
+ continue;
+ }
+ }
+
+ insertData(observed_column, StringRef(key_data, key_sizes[i]));
+ key_data += key_sizes[i];
+ }
+ }
+};
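+
+/* Worked example of the packed key layout consumed by insertKeyIntoColumns() above
+ * (illustrative): with has_nullable_keys the Key starts with a small null bitmap
+ * (one bit per key column), followed by the key values written back to back:
+ *
+ *     [ null bitmap ][ key 0 bytes ][ key 1 bytes ] ...
+ *
+ * The method skips the bitmap and then reads key_sizes[i] bytes for each key column.
+ */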
+
+
+/** Aggregates by concatenating serialized key values.
+ * The serialized value is such that it can be deserialized unambiguously, given only the position where it starts.
+ * That is, for example, for strings, it contains first the serialized length of the string, and then the bytes.
+ * Therefore, when aggregating by several strings, there is no ambiguity.
+ */
+template <typename TData>
+struct AggregationMethodSerialized
+{
+ using Data = TData;
+ using Key = typename Data::key_type;
+ using Mapped = typename Data::mapped_type;
+
+ Data data;
+
+ AggregationMethodSerialized() = default;
+
+ template <typename Other>
+ AggregationMethodSerialized(const Other & other) : data(other.data) {}
+
+ using State = ColumnsHashing::HashMethodSerialized<typename Data::value_type, Mapped>;
+
+ std::optional<Sizes> shuffleKeyColumns(std::vector<IColumn *> &, const Sizes &) { return {}; }
+
+ template <typename ColPtr>
+ static void insertKeyIntoColumns(const StringRef & key, const std::vector<ColPtr> & key_columns, const Sizes &)
+ {
+ const auto * pos = key.data;
+ for (auto & column : key_columns)
+ pos = deserializeAndInsertFromArena(*column, pos);
+ }
+};
+
+
+class Aggregator;
+
+using ColumnsHashing::HashMethodContext;
+using ColumnsHashing::HashMethodContextPtr;
+
+struct AggregatedDataVariants //: private boost::noncopyable
+{
+ /** Working with states of aggregate functions in the pool is arranged in the following (inconvenient) way:
+ * - when aggregating, states are created in the pool using IAggregateFunction::create (inside - `placement new` of arbitrary structure);
+ * - they must then be destroyed using IAggregateFunction::destroy (inside - calling the destructor of arbitrary structure);
+ * - if aggregation is complete, then, in the Aggregator::convertToBlocks function, pointers to the states of aggregate functions
+ * are written to ColumnAggregateFunction; ColumnAggregateFunction "acquires ownership" of them, that is - calls `destroy` in its destructor.
+ * - if during the aggregation, before call to Aggregator::convertToBlocks, an exception was thrown,
+ * then the states of aggregate functions must still be destroyed,
+ * otherwise, for complex states (e.g., AggregateFunctionUniq), there will be memory leaks;
+ * - in this case, to destroy states, the destructor calls the Aggregator::destroyAggregateStates method,
+ * but only if the variable aggregator (see below) is not nullptr;
+ * - that is, until you transfer ownership of the aggregate function states to the ColumnAggregateFunction, keep the variable `aggregator` set,
+ * so that when an exception occurs, the states are correctly destroyed.
+ *
+ * PS. This can be corrected by making a pool that knows which states of aggregate functions are put in it and in what order, and knows how to destroy them.
+ * But this can hardly be done simply because it is planned to put variable-length strings into the same pool.
+ * In this case, the pool will not be able to know with what offsets objects are stored.
+ */
+ const Aggregator * aggregator = nullptr;
+
+ size_t keys_size{}; /// Number of keys. NOTE do we need this field?
+ Sizes key_sizes; /// Dimensions of keys, if keys of fixed length
+
+ /// Pools for states of aggregate functions. Ownership will be later transferred to ColumnAggregateFunction.
+ Arenas aggregates_pools;
+ Arena * aggregates_pool{}; /// The pool that is currently used for allocation.
+
+ /** Specialization for the case when there are no keys, and for keys that did not fit into max_rows_to_group_by.
+ */
+ AggregatedDataWithoutKey without_key = nullptr;
+
+ // Disable consecutive key optimization for UInt8/16, because they use a FixedHashMap
+ // and the lookup there is almost free, so we don't need to cache the last lookup result
+ std::unique_ptr<AggregationMethodOneNumber<UInt8, AggregatedDataWithUInt8Key, false>> key8;
+ std::unique_ptr<AggregationMethodOneNumber<UInt16, AggregatedDataWithUInt16Key, false>> key16;
+
+ std::unique_ptr<AggregationMethodOneNumber<UInt32, AggregatedDataWithUInt64Key>> key32;
+ std::unique_ptr<AggregationMethodOneNumber<UInt64, AggregatedDataWithUInt64Key>> key64;
+ std::unique_ptr<AggregationMethodStringNoCache<AggregatedDataWithShortStringKey>> key_string;
+ std::unique_ptr<AggregationMethodFixedStringNoCache<AggregatedDataWithShortStringKey>> key_fixed_string;
+ std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithUInt16Key, false, false>> keys16;
+ std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithUInt32Key>> keys32;
+ std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithUInt64Key>> keys64;
+ std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithKeys128>> keys128;
+ std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithKeys256>> keys256;
+ std::unique_ptr<AggregationMethodSerialized<AggregatedDataWithStringKey>> serialized;
+
+ std::unique_ptr<AggregationMethodOneNumber<UInt64, AggregatedDataWithUInt64KeyHash64>> key64_hash64;
+ std::unique_ptr<AggregationMethodString<AggregatedDataWithStringKeyHash64>> key_string_hash64;
+ std::unique_ptr<AggregationMethodFixedString<AggregatedDataWithStringKeyHash64>> key_fixed_string_hash64;
+ std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithKeys128Hash64>> keys128_hash64;
+ std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithKeys256Hash64>> keys256_hash64;
+ std::unique_ptr<AggregationMethodSerialized<AggregatedDataWithStringKeyHash64>> serialized_hash64;
+
+ /// Support for nullable keys.
+ std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithKeys128, true>> nullable_keys128;
+ std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithKeys256, true>> nullable_keys256;
+
+ /// In this and similar macros, the option without_key is not considered.
+ #define APPLY_FOR_AGGREGATED_VARIANTS(M) \
+ M(key8) \
+ M(key16) \
+ M(key32) \
+ M(key64) \
+ M(key_string) \
+ M(key_fixed_string) \
+ M(keys16) \
+ M(keys32) \
+ M(keys64) \
+ M(keys128) \
+ M(keys256) \
+ M(serialized) \
+ M(key64_hash64) \
+ M(key_string_hash64) \
+ M(key_fixed_string_hash64) \
+ M(keys128_hash64) \
+ M(keys256_hash64) \
+ M(serialized_hash64) \
+ M(nullable_keys128) \
+ M(nullable_keys256) \
+
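+ /// Example of how this dispatch macro is used (see init() below): defining
+ ///     #define M(NAME) case Type::NAME: NAME = std::make_unique<decltype(NAME)::element_type>(); break;
+ /// and applying APPLY_FOR_AGGREGATED_VARIANTS(M) yields one case per variant, e.g. for key64:
+ ///     case Type::key64: key64 = std::make_unique<decltype(key64)::element_type>(); break;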
+
+ enum class Type
+ {
+ EMPTY = 0,
+ without_key,
+
+ #define M(NAME) NAME,
+ APPLY_FOR_AGGREGATED_VARIANTS(M)
+ #undef M
+ };
+ Type type = Type::EMPTY;
+
+ AggregatedDataVariants()
+ : aggregates_pools(1, std::make_shared<Arena>())
+ , aggregates_pool(aggregates_pools.back().get())
+ {}
+
+ bool empty() const { return type == Type::EMPTY; }
+ void invalidate() { type = Type::EMPTY; }
+
+ ~AggregatedDataVariants();
+
+ void init(Type type_)
+ {
+ switch (type_)
+ {
+ case Type::EMPTY: break;
+ case Type::without_key: break;
+
+ #define M(NAME) \
+ case Type::NAME: NAME = std::make_unique<decltype(NAME)::element_type>(); break;
+ APPLY_FOR_AGGREGATED_VARIANTS(M)
+ #undef M
+ }
+
+ type = type_;
+ }
+
+ /// Number of rows (different keys).
+ size_t size() const
+ {
+ switch (type)
+ {
+ case Type::EMPTY: return 0;
+ case Type::without_key: return 1;
+
+ #define M(NAME) \
+ case Type::NAME: return NAME->data.size() + (without_key != nullptr);
+ APPLY_FOR_AGGREGATED_VARIANTS(M)
+ #undef M
+ }
+
+ __builtin_unreachable();
+ }
+
+ /// The size without taking into account the row in which data is written for the calculation of TOTALS.
+ size_t sizeWithoutOverflowRow() const
+ {
+ switch (type)
+ {
+ case Type::EMPTY: return 0;
+ case Type::without_key: return 1;
+
+ #define M(NAME) \
+ case Type::NAME: return NAME->data.size();
+ APPLY_FOR_AGGREGATED_VARIANTS(M)
+ #undef M
+ }
+
+ __builtin_unreachable();
+ }
+
+ const char * getMethodName() const
+ {
+ switch (type)
+ {
+ case Type::EMPTY: return "EMPTY";
+ case Type::without_key: return "without_key";
+
+ #define M(NAME) \
+ case Type::NAME: return #NAME;
+ APPLY_FOR_AGGREGATED_VARIANTS(M)
+ #undef M
+ }
+
+ __builtin_unreachable();
+ }
+
+ static HashMethodContextPtr createCache(Type type, const HashMethodContext::Settings & settings)
+ {
+ switch (type)
+ {
+ case Type::without_key: return nullptr;
+
+ #define M(NAME) \
+ case Type::NAME: \
+ { \
+ using TPtr ## NAME = decltype(AggregatedDataVariants::NAME); \
+ using T ## NAME = typename TPtr ## NAME ::element_type; \
+ return T ## NAME ::State::createContext(settings); \
+ }
+
+ APPLY_FOR_AGGREGATED_VARIANTS(M)
+ #undef M
+
+ default:
+ throw Exception("Unknown aggregated data variant.");
+ }
+ }
+};
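+
+/* Hedged sketch of the ownership contract described above (illustrative only; in practice the
+ * Aggregator performs these steps itself, see Aggregator::mergeBlocks in Aggregator.cpp):
+ *
+ *     AggregatedDataVariants variants;
+ *     variants.aggregator = &aggregator;                    // states get destroyed if an exception is thrown
+ *     variants.init(AggregatedDataVariants::Type::key64);   // choose a single-level method
+ *     ... aggregate ...
+ *     variants.aggregator = nullptr;                        // ownership moved to ColumnAggregateFunction
+ */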
+
+using AggregatedDataVariantsPtr = std::shared_ptr<AggregatedDataVariants>;
+using ManyAggregatedDataVariants = std::vector<AggregatedDataVariantsPtr>;
+using ManyAggregatedDataVariantsPtr = std::shared_ptr<ManyAggregatedDataVariants>;
+
+/** How are "total" values calculated with WITH TOTALS?
+ * (For more details, see TotalsHavingTransform.)
+ *
+ * In the absence of group_by_overflow_mode = 'any', the data is aggregated as usual, but the states of the aggregate functions are not finalized.
+ * Later, the aggregate function states for all rows (passed through HAVING) are merged into one - this will be TOTALS.
+ *
+ * If there is group_by_overflow_mode = 'any', the data is aggregated as usual, except for the keys that did not fit in max_rows_to_group_by.
+ * For these keys, the data is aggregated into one additional row - see below under the names `overflow_row`, `overflows`...
+ * Later, the aggregate function states for all rows (passed through HAVING) are merged into one,
+ * also overflow_row is added or not added (depending on the totals_mode setting) also - this will be TOTALS.
+ */
+
+
+/** Aggregates the source of the blocks.
+ */
+class Aggregator final
+{
+public:
+ struct Params
+ {
+ /// Data structure of source blocks.
+ Header src_header;
+ /// Data structure of intermediate blocks before merge.
+ Header intermediate_header;
+
+ /// What to count.
+ const ColumnNumbers keys;
+ const AggregateDescriptions aggregates;
+ const size_t keys_size;
+ const size_t aggregates_size;
+
+ /// The settings of approximate calculation of GROUP BY.
+ const bool overflow_row; /// Do we need to put into AggregatedDataVariants::without_key aggregates for keys that are not in max_rows_to_group_by.
+ const size_t max_rows_to_group_by = 0;
+ const OverflowMode group_by_overflow_mode = OverflowMode::THROW;
+
+ /// Return empty result when aggregating without keys on empty set.
+ bool empty_result_for_aggregation_by_empty_set = false;
+
+ /// This setting is used to determine the cache size. No threads are created.
+ size_t max_threads;
+
+ Params(const Header & src_header_,
+ const Header & intermediate_header_,
+ const ColumnNumbers & keys_,
+ const AggregateDescriptions & aggregates_,
+ bool overflow_row_,
+ size_t max_threads_ = 1)
+ : src_header(src_header_)
+ , intermediate_header(intermediate_header_)
+ , keys(keys_)
+ , aggregates(aggregates_)
+ , keys_size(keys.size())
+ , aggregates_size(aggregates.size())
+ , overflow_row(overflow_row_)
+ , max_threads(max_threads_)
+ {}
+
+ Params(bool is_merge, const Header & header_,
+ const ColumnNumbers & keys_, const AggregateDescriptions & aggregates_, bool overflow_row_, size_t max_threads_ = 1)
+ : Params((is_merge ? Header() : header_), (is_merge ? header_ : Header()), keys_, aggregates_, overflow_row_, max_threads_)
+ {}
+
+ static Header getHeader(
+ const Header & src_header,
+ const Header & intermediate_header,
+ const ColumnNumbers & keys,
+ const AggregateDescriptions & aggregates,
+ bool final);
+
+ Header getHeader(bool final) const
+ {
+ return getHeader(src_header, intermediate_header, keys, aggregates, final);
+ }
+ };
+
+ explicit Aggregator(const Params & params_);
+
+ /// Aggregate the source. Get the result in the form of one of the data structures.
+ void execute(const BlockInputStreamPtr & stream, AggregatedDataVariants & result);
+
+ using AggregateColumns = std::vector<ColumnRawPtrs>;
+ using AggregateFunctionsPlainPtrs = std::vector<const IAggregateFunction *>;
+
+ /// Process one block. Return false if the processing should be aborted (with group_by_overflow_mode = 'break').
+ bool executeOnBlock(const Block & block,
+ AggregatedDataVariants & result,
+ ColumnRawPtrs & key_columns,
+ AggregateColumns & aggregate_columns, /// Passed so that they are not created anew for each block
+ bool & no_more_keys) const;
+
+ bool executeOnBlock(Columns columns,
+ size_t row_begin, size_t row_end,
+ AggregatedDataVariants & result,
+ ColumnRawPtrs & key_columns,
+ AggregateColumns & aggregate_columns, /// Passed so that they are not created anew for each block
+ bool & no_more_keys) const;
+
+ /// Used for aggregate projection.
+ bool mergeOnBlock(Block block, AggregatedDataVariants & result, bool & no_more_keys) const;
+
+ /** Convert the aggregation data structure into a block.
+ * If overflow_row = true, then aggregates for rows that are not included in max_rows_to_group_by are put in the first block.
+ *
+ * If final = false, then ColumnAggregateFunction columns holding the state of the calculations are created as the aggregation columns;
+ * these can then be combined with other states (for distributed query processing).
+ * If final = true, then columns with ready values are created as aggregate columns.
+ */
+ BlocksList convertToBlocks(AggregatedDataVariants & data_variants, bool final) const;
+
+ ManyAggregatedDataVariants prepareVariantsToMerge(ManyAggregatedDataVariants & data_variants) const;
+
+ /** Merge the stream of partially aggregated blocks into one data structure.
+ * (Pre-aggregate several blocks that represent the result of independent aggregations from remote servers.)
+ */
+ void mergeStream(const BlockInputStreamPtr & stream, AggregatedDataVariants & result);
+
+ using BucketToBlocks = std::map<Int32, BlocksList>;
+ /// Merge partially aggregated blocks separated to buckets into one data structure.
+ void mergeBlocks(BucketToBlocks && bucket_to_blocks, AggregatedDataVariants & result);
+
+ /// Merge several partially aggregated blocks into one.
+ /// Precondition: for all blocks block.info.is_overflows flag must be the same.
+ /// (either all blocks are from overflow data or none of them are).
+ /// The resulting block has the same value of is_overflows flag.
+ Block mergeBlocks(BlocksList & blocks, bool final);
+
+ /// Get data structure of the result.
+ Header getHeader(bool final) const;
+
+private:
+ friend struct AggregatedDataVariants;
+ friend class MergingAndConvertingBlockInputStream;
+
+ Params params;
+
+ AggregatedDataVariants::Type method_chosen;
+ Sizes key_sizes;
+
+ HashMethodContextPtr aggregation_state_cache;
+
+ AggregateFunctionsPlainPtrs aggregate_functions;
+
+ /** This array serves two purposes.
+ *
+ * Function arguments are collected side by side, and they do not need to be collected from different places. Also the array is made zero-terminated.
+ * The inner loop (for the case without_key) is almost twice as compact; performance gain of about 30%.
+ */
+ struct AggregateFunctionInstruction
+ {
+ const IAggregateFunction * that{};
+ size_t state_offset{};
+ const IColumn ** arguments{};
+ const IAggregateFunction * batch_that{};
+ const IColumn ** batch_arguments{};
+ };
+
+ using AggregateFunctionInstructions = std::vector<AggregateFunctionInstruction>;
+
+ Sizes offsets_of_aggregate_states; /// The offset to the n-th aggregate function in a row of aggregate functions.
+ size_t total_size_of_aggregate_states = 0; /// The total size of the row from the aggregate functions.
+
+ // add info to track alignment requirement
+ // If there are states whose alignments are v1, ..., vn, align_aggregate_states will be max(v1, ..., vn)
+ size_t align_aggregate_states = 1;
+
+ bool all_aggregates_has_trivial_destructor = false;
+
+ /** Select the aggregation method based on the number and types of keys. */
+ AggregatedDataVariants::Type chooseAggregationMethod();
+
+ /** Create states of aggregate functions for one key.
+ */
+ void createAggregateStates(AggregateDataPtr & aggregate_data) const;
+
+ /** Call `destroy` methods for states of aggregate functions.
+ * Used in the exception handler for aggregation, since RAII in this case is not applicable.
+ */
+ void destroyAllAggregateStates(AggregatedDataVariants & result) const;
+
+
+ /// Process one data block, aggregate the data into a hash table.
+ template <typename Method>
+ void executeImpl(
+ Method & method,
+ Arena * aggregates_pool,
+ size_t row_begin,
+ size_t row_end,
+ ColumnRawPtrs & key_columns,
+ AggregateFunctionInstruction * aggregate_instructions,
+ bool no_more_keys,
+ AggregateDataPtr overflow_row) const;
+
+ /// Specialization for a particular value no_more_keys.
+ template <bool no_more_keys, typename Method>
+ void executeImplBatch(
+ Method & method,
+ typename Method::State & state,
+ Arena * aggregates_pool,
+ size_t row_begin,
+ size_t row_end,
+ AggregateFunctionInstruction * aggregate_instructions,
+ AggregateDataPtr overflow_row) const;
+
+ /// For case when there are no keys (all aggregate into one row).
+ void executeWithoutKeyImpl(
+ AggregatedDataWithoutKey & res,
+ size_t row_begin,
+ size_t row_end,
+ AggregateFunctionInstruction * aggregate_instructions,
+ Arena * arena) const;
+
+ /// Merge NULL key data from hash table `src` into `dst`.
+ template <typename Method, typename Table>
+ void mergeDataNullKey(
+ Table & table_dst,
+ Table & table_src,
+ Arena * arena) const;
+
+ /// Merge data from hash table `src` into `dst`.
+ template <typename Method, typename Table>
+ void mergeDataImpl(
+ Table & table_dst,
+ Table & table_src,
+ Arena * arena) const;
+
+ /// Merge data from hash table `src` into `dst`, but only for keys that already exist in dst. In other cases, merge the data into `overflows`.
+ template <typename Method, typename Table>
+ void mergeDataNoMoreKeysImpl(
+ Table & table_dst,
+ AggregatedDataWithoutKey & overflows,
+ Table & table_src,
+ Arena * arena) const;
+
+ /// Same, but ignores the rest of the keys.
+ template <typename Method, typename Table>
+ void mergeDataOnlyExistingKeysImpl(
+ Table & table_dst,
+ Table & table_src,
+ Arena * arena) const;
+
+ void mergeWithoutKeyDataImpl(
+ ManyAggregatedDataVariants & non_empty_data) const;
+
+ template <typename Method>
+ void mergeSingleLevelDataImpl(
+ ManyAggregatedDataVariants & non_empty_data) const;
+
+ template <typename Method, typename Table>
+ void convertToBlockImpl(
+ Method & method,
+ Table & data,
+ MutableColumns & key_columns,
+ AggregateColumnsData & aggregate_columns,
+ MutableColumns & final_aggregate_columns,
+ Arena * arena,
+ bool final) const;
+
+ template <typename Mapped>
+ void insertAggregatesIntoColumns(
+ Mapped & mapped,
+ MutableColumns & final_aggregate_columns,
+ Arena * arena) const;
+
+ template <typename Method, typename Table>
+ void convertToBlockImplFinal(
+ Method & method,
+ Table & data,
+ const MutableColumns & key_columns,
+ MutableColumns & final_aggregate_columns,
+ Arena * arena) const;
+
+ template <typename Method, typename Table>
+ void convertToBlockImplNotFinal(
+ Method & method,
+ Table & data,
+ const MutableColumns & key_columns,
+ AggregateColumnsData & aggregate_columns) const;
+
+ template <typename Filler>
+ Block prepareBlockAndFill(
+ AggregatedDataVariants & data_variants,
+ bool final,
+ size_t rows,
+ Filler && filler) const;
+
+ template <typename Method>
+ Block convertOneBucketToBlock(
+ AggregatedDataVariants & data_variants,
+ Method & method,
+ Arena * arena,
+ bool final,
+ size_t bucket) const;
+
+ Block prepareBlockAndFillWithoutKey(AggregatedDataVariants & data_variants, bool final, bool is_overflows = false) const;
+ Block prepareBlockAndFillSingleLevel(AggregatedDataVariants & data_variants, bool final) const;
+
+ template <bool no_more_keys, typename Method, typename Table>
+ void mergeStreamsImplCase(
+ Block & block,
+ Arena * aggregates_pool,
+ Method & method,
+ Table & data,
+ AggregateDataPtr overflow_row) const;
+
+ template <typename Method, typename Table>
+ void mergeStreamsImpl(
+ Block & block,
+ Arena * aggregates_pool,
+ Method & method,
+ Table & data,
+ AggregateDataPtr overflow_row,
+ bool no_more_keys) const;
+
+ void mergeWithoutKeyStreamsImpl(
+ Block & block,
+ AggregatedDataVariants & result) const;
+
+ template <typename Method>
+ void mergeBucketImpl(
+ ManyAggregatedDataVariants & data, Int32 bucket, Arena * arena, std::atomic<bool> * is_cancelled = nullptr) const;
+
+ template <typename Method>
+ void convertBlockToTwoLevelImpl(
+ Method & method,
+ Arena * pool,
+ ColumnRawPtrs & key_columns,
+ const Block & source,
+ std::vector<Block> & destinations) const;
+
+ template <typename Method, typename Table>
+ void destroyImpl(Table & table) const;
+
+ void destroyWithoutKey(
+ AggregatedDataVariants & result) const;
+
+
+ /** Checks constraints on the maximum number of keys for aggregation.
+ * If it is exceeded, then, depending on the group_by_overflow_mode, either
+ * - throws an exception;
+ * - returns false, which means that execution must be aborted;
+ * - sets the variable no_more_keys to true.
+ */
+ bool checkLimits(size_t result_size, bool & no_more_keys) const;
+
+ void prepareAggregateInstructions(
+ Columns columns,
+ AggregateColumns & aggregate_columns,
+ Columns & materialized_columns,
+ AggregateFunctionInstructions & instructions) const;
+};
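+
+/* Hedged end-to-end usage sketch (illustrative, not part of the original code). The input header,
+ * input stream, key index and aggregate descriptions are hypothetical placeholders; only the
+ * Aggregator API calls come from the declarations above.
+ *
+ *     Aggregator::Params params(input_header, Header{}, ColumnNumbers{0}, descriptions,
+ *                               false);                                  // overflow_row = false
+ *     Aggregator aggregator(params);
+ *
+ *     AggregatedDataVariants variants;
+ *     aggregator.execute(input_stream, variants);                        // consume the source stream
+ *     BlocksList blocks = aggregator.convertToBlocks(variants, true);    // final = true
+ */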
+
+
+/** Get the aggregation variant by its type. */
+template <typename Method> Method & getDataVariant(AggregatedDataVariants & variants);
+
+#define M(NAME) \
+ template <> inline decltype(AggregatedDataVariants::NAME)::element_type & getDataVariant<decltype(AggregatedDataVariants::NAME)::element_type>(AggregatedDataVariants & variants) { return *variants.NAME; }
+
+APPLY_FOR_AGGREGATED_VARIANTS(M)
+
+#undef M
+
+}
diff --git a/ydb/library/arrow_clickhouse/CMakeLists.txt b/ydb/library/arrow_clickhouse/CMakeLists.txt
new file mode 100644
index 00000000000..97a7a6b939f
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/CMakeLists.txt
@@ -0,0 +1,29 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(ydb-library-arrow_clickhouse)
+target_include_directories(ydb-library-arrow_clickhouse PUBLIC
+ ${CMAKE_SOURCE_DIR}/ydb/library/yql/udfs/common/clickhouse/client/base
+)
+target_include_directories(ydb-library-arrow_clickhouse PRIVATE
+ ${CMAKE_SOURCE_DIR}/ydb/library/arrow_clickhouse
+)
+target_link_libraries(ydb-library-arrow_clickhouse PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+ libs-apache-arrow
+ library-arrow_clickhouse-Common
+ library-arrow_clickhouse-Columns
+ library-arrow_clickhouse-DataStreams
+)
+target_sources(ydb-library-arrow_clickhouse PRIVATE
+ ${CMAKE_SOURCE_DIR}/ydb/library/arrow_clickhouse/Aggregator.cpp
+ ${CMAKE_SOURCE_DIR}/ydb/library/yql/udfs/common/clickhouse/client/base/common/mremap.cpp
+ ${CMAKE_SOURCE_DIR}/ydb/library/yql/udfs/common/clickhouse/client/base/common/getPageSize.cpp
+)
diff --git a/ydb/library/arrow_clickhouse/Columns/CMakeLists.txt b/ydb/library/arrow_clickhouse/Columns/CMakeLists.txt
new file mode 100644
index 00000000000..0f4c153a983
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/Columns/CMakeLists.txt
@@ -0,0 +1,23 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(library-arrow_clickhouse-Columns)
+target_include_directories(library-arrow_clickhouse-Columns PRIVATE
+ ${CMAKE_SOURCE_DIR}/ydb/library/yql/udfs/common/clickhouse/client/base
+ ${CMAKE_SOURCE_DIR}/ydb/library/arrow_clickhouse
+)
+target_link_libraries(library-arrow_clickhouse-Columns PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+ libs-apache-arrow
+)
+target_sources(library-arrow_clickhouse-Columns PRIVATE
+ ${CMAKE_SOURCE_DIR}/ydb/library/arrow_clickhouse/Columns/ColumnsCommon.cpp
+ ${CMAKE_SOURCE_DIR}/ydb/library/arrow_clickhouse/Columns/ColumnAggregateFunction.cpp
+)
diff --git a/ydb/library/arrow_clickhouse/Columns/ColumnAggregateFunction.cpp b/ydb/library/arrow_clickhouse/Columns/ColumnAggregateFunction.cpp
new file mode 100644
index 00000000000..7d854890f3c
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/Columns/ColumnAggregateFunction.cpp
@@ -0,0 +1,31 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#include <sstream>
+
+#include <Columns/ColumnAggregateFunction.h>
+#include <Columns/ColumnsCommon.h>
+#include <Common/Arena.h>
+#include <Common/HashTable/Hash.h>
+
+
+namespace CH
+{
+
+std::shared_ptr<arrow::Array> DataTypeAggregateFunction::MakeArray(std::shared_ptr<arrow::ArrayData> data) const
+{
+ return std::make_shared<ColumnAggregateFunction>(data);
+}
+
+ColumnAggregateFunction::~ColumnAggregateFunction()
+{
+ if (!func->hasTrivialDestructor() && !src)
+ {
+ auto & arr = getData();
+ for (int64_t i = 0; i < arr.length(); ++i)
+ func->destroy(reinterpret_cast<AggregateDataPtr>(arr.Value(i)));
+ }
+}
+
+}
diff --git a/ydb/library/arrow_clickhouse/Columns/ColumnAggregateFunction.h b/ydb/library/arrow_clickhouse/Columns/ColumnAggregateFunction.h
new file mode 100644
index 00000000000..cf126113230
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/Columns/ColumnAggregateFunction.h
@@ -0,0 +1,150 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#pragma once
+#include "arrow_clickhouse_types.h"
+
+#include <AggregateFunctions/IAggregateFunction.h>
+
+#include <common/StringRef.h>
+#include <ranges>
+
+namespace CH
+{
+
+class DataTypeAggregateFunction final : public arrow::ExtensionType
+{
+public:
+ static constexpr const char * FAMILY_NAME = "aggregate_function";
+
+ DataTypeAggregateFunction(const AggregateFunctionPtr & function_,
+ const DataTypes & argument_types_,
+ const Array & parameters_)
+ : arrow::ExtensionType(arrow::uint64())
+ , function(function_)
+ , argument_types(argument_types_)
+ , parameters(parameters_)
+ {}
+
+ std::string extension_name() const override { return FAMILY_NAME; }
+
+ bool ExtensionEquals(const arrow::ExtensionType& other) const override
+ {
+ return extension_name() == other.extension_name(); // TODO: also compare function, argument types and parameters
+ }
+
+ std::shared_ptr<arrow::Array> MakeArray(std::shared_ptr<arrow::ArrayData> data) const override;
+
+ virtual arrow::Result<std::shared_ptr<arrow::DataType>> Deserialize(std::shared_ptr<arrow::DataType> /*storage_type*/,
+ const std::string& /*serialized_data*/) const override
+ {
+ return std::make_shared<DataTypeAggregateFunction>(AggregateFunctionPtr{}, DataTypes{}, Array{}); // TODO
+ }
+
+ std::string Serialize() const override { return {}; } // TODO
+
+ AggregateFunctionPtr getFunction() const { return function; }
+
+private:
+ AggregateFunctionPtr function;
+ DataTypes argument_types;
+ Array parameters;
+};
+
+/** Column of states of aggregate functions.
+ * Presented as an array of pointers to the states of aggregate functions (data).
+ * The states themselves are stored in one of the pools (arenas).
+ *
+ * It can be in two variants:
+ *
+ * 1. Own its values - that is, be responsible for destroying them.
+ * The column consists of values "assigned to it" after the aggregation is performed (see Aggregator, convertToBlocks function),
+ * or of values created by itself (see the `insert` method).
+ * In this case, `src` will be `nullptr`, and the column itself will destroy (call `IAggregateFunction::destroy` on)
+ * the states of aggregate functions in its destructor.
+ *
+ * 2. Do not own its values, but use values taken from another ColumnAggregateFunction column.
+ * For example, this is a column obtained by permutation/filtering or other transformations from another column.
+ * In this case, `src` will be a `shared_ptr` to the source column. Destruction of values will be handled by that source column.
+ *
+ * This solution is somewhat limited:
+ * - the variant in which the column contains a part of "its own" and a part of "another's" values is not supported;
+ * - the option of having multiple source columns is not supported, which may be necessary for a more optimal merge of the two columns.
+ *
+ * These restrictions can be removed if you add an array of flags or even refcount,
+ * specifying which individual values should be destroyed and which ones should not.
+ * Clearly, this method would have a substantially non-zero price.
+ */
+class ColumnAggregateFunction final : public arrow::ExtensionArray
+{
+private:
+#if 0
+ /// Arenas used by function states that are created elsewhere. We own these
+ /// arenas in the sense of extending their lifetime, but do not modify them.
+ /// Even reading these arenas is unsafe, because they may be shared with
+ /// other data blocks and modified by other threads concurrently.
+ ConstArenas foreign_arenas;
+#endif
+ /// Used for destroying states and for finalization of values.
+ AggregateFunctionPtr func;
+
+ /// Source column. Used (holds source from destruction),
+ /// if this column has been constructed from another and uses all or part of its values.
+ ColumnPtr src;
+
+public:
+ ColumnAggregateFunction(const std::shared_ptr<DataTypeAggregateFunction> & data_type)
+ : arrow::ExtensionArray(data_type, *arrow::MakeArrayOfNull(arrow::uint64(), 0))
+ , func(data_type->getFunction())
+ {}
+
+ explicit ColumnAggregateFunction(const std::shared_ptr<arrow::ArrayData>& data)
+ : arrow::ExtensionArray(data)
+ , func(std::static_pointer_cast<DataTypeAggregateFunction>(data->type)->getFunction())
+ {}
+
+ ~ColumnAggregateFunction() override;
+
+ const arrow::UInt64Array & getData() const { return static_cast<arrow::UInt64Array &>(*storage()); }
+ const AggregateDataPtr * rawData() const { return reinterpret_cast<const AggregateDataPtr *>(getData().raw_values()); }
+};
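+
+/* Hedged sketch (illustrative only): reading aggregate state pointers back from a finalized
+ * column. `column` is a hypothetical ColumnAggregateFunction instance; `process` is a placeholder.
+ *
+ *     const AggregateDataPtr * states = column.rawData();
+ *     for (int64_t i = 0; i < column.getData().length(); ++i)
+ *         process(states[i]);    // each entry points into an aggregation arena
+ */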
+
+
+class MutableColumnAggregateFunction final : public arrow::ArrayBuilder
+{
+public:
+ MutableColumnAggregateFunction(const std::shared_ptr<DataTypeAggregateFunction> & data_type_,
+ arrow::MemoryPool* pool = arrow::default_memory_pool())
+ : arrow::ArrayBuilder(pool)
+ , data_type(data_type_)
+ , builder(std::make_shared<arrow::UInt64Builder>(pool))
+ {}
+
+ std::shared_ptr<arrow::DataType> type() const override { return data_type; }
+
+ arrow::Status AppendNull() override { return arrow::Status(arrow::StatusCode::NotImplemented, __FUNCTION__); }
+ arrow::Status AppendNulls(int64_t) override { return arrow::Status(arrow::StatusCode::NotImplemented, __FUNCTION__); }
+ arrow::Status AppendEmptyValue() override { return arrow::Status(arrow::StatusCode::NotImplemented, __FUNCTION__); }
+ arrow::Status AppendEmptyValues(int64_t) override { return arrow::Status(arrow::StatusCode::NotImplemented, __FUNCTION__); }
+
+ arrow::Status FinishInternal(std::shared_ptr<arrow::ArrayData>* out) override
+ {
+ auto array = *builder->Finish();
+ *out = array->data()->Copy();
+ (*out)->type = data_type;
+ // TODO: add arenas
+ return arrow::Status::OK();
+ }
+
+ arrow::UInt64Builder & getData() { return *builder; }
+
+private:
+ std::shared_ptr<DataTypeAggregateFunction> data_type;
+ std::shared_ptr<arrow::UInt64Builder> builder;
+};
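+
+/* Hedged sketch (illustrative only): appending state pointers and finishing the builder.
+ * `agg_type` (a std::shared_ptr<DataTypeAggregateFunction>) and `state` (an AggregateDataPtr) are hypothetical.
+ *
+ *     MutableColumnAggregateFunction builder(agg_type);
+ *     builder.getData().Append(reinterpret_cast<uint64_t>(state)).ok();
+ *     std::shared_ptr<arrow::Array> column;
+ *     builder.Finish(&column).ok();    // yields an array typed as DataTypeAggregateFunction
+ */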
+
+using AggregateColumnsData = std::vector<arrow::UInt64Builder *>;
+using AggregateColumnsConstData = std::vector<const arrow::UInt64Array *>;
+
+}
diff --git a/ydb/library/arrow_clickhouse/Columns/ColumnsCommon.cpp b/ydb/library/arrow_clickhouse/Columns/ColumnsCommon.cpp
new file mode 100644
index 00000000000..3dbc50b22ff
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/Columns/ColumnsCommon.cpp
@@ -0,0 +1,530 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#ifdef __SSE2__
+ #include <emmintrin.h>
+#endif
+
+#include <Columns/ColumnsCommon.h>
+#include <Common/HashTable/HashSet.h>
+#include <Common/PODArray.h>
+
+
+namespace CH
+{
+
+#if defined(__SSE2__) && defined(__POPCNT__)
+/// Transform 64-byte mask to 64-bit mask.
+static UInt64 toBits64(const Int8 * bytes64)
+{
+ static const __m128i zero16 = _mm_setzero_si128();
+ UInt64 res =
+ static_cast<UInt64>(_mm_movemask_epi8(_mm_cmpeq_epi8(
+ _mm_loadu_si128(reinterpret_cast<const __m128i *>(bytes64)), zero16)))
+ | (static_cast<UInt64>(_mm_movemask_epi8(_mm_cmpeq_epi8(
+ _mm_loadu_si128(reinterpret_cast<const __m128i *>(bytes64 + 16)), zero16))) << 16)
+ | (static_cast<UInt64>(_mm_movemask_epi8(_mm_cmpeq_epi8(
+ _mm_loadu_si128(reinterpret_cast<const __m128i *>(bytes64 + 32)), zero16))) << 32)
+ | (static_cast<UInt64>(_mm_movemask_epi8(_mm_cmpeq_epi8(
+ _mm_loadu_si128(reinterpret_cast<const __m128i *>(bytes64 + 48)), zero16))) << 48);
+
+ return ~res;
+}
+#endif
+
+size_t countBytesInFilter(const uint8_t * filt, size_t start, size_t end)
+{
+ size_t count = 0;
+
+ /** NOTE: In theory, `filt` should only contain zeros and ones.
+ * But, just in case, here the condition > 0 (to signed bytes) is used.
+ * It would be better to use != 0, but that would not allow using SSE2.
+ */
+
+ const Int8 * pos = reinterpret_cast<const Int8 *>(filt);
+ pos += start;
+
+ const Int8 * end_pos = pos + (end - start);
+
+#if defined(__SSE2__)
+ const Int8 * end_pos64 = pos + (end - start) / 64 * 64;
+
+ for (; pos < end_pos64; pos += 64)
+ count += std::popcount(toBits64(pos));
+
+ /// TODO Add duff device for tail?
+#endif
+
+ for (; pos < end_pos; ++pos)
+ count += *pos != 0;
+
+ return count;
+}
+
+size_t countBytesInFilterWithNull(const uint8_t * filt, const uint8_t * null_map, size_t start, size_t end)
+{
+ size_t count = 0;
+
+ /** NOTE: In theory, `filt` should only contain zeros and ones.
+ * But, just in case, here the condition > 0 (to signed bytes) is used.
+ * It would be better to use != 0, then this does not allow SSE2.
+ */
+
+ const Int8 * pos = reinterpret_cast<const Int8 *>(filt) + start;
+ const Int8 * pos2 = reinterpret_cast<const Int8 *>(null_map) + start;
+ const Int8 * end_pos = pos + (end - start);
+
+#if defined(__SSE2__)
+ const Int8 * end_pos64 = pos + (end - start) / 64 * 64;
+
+ for (; pos < end_pos64; pos += 64, pos2 += 64)
+ count += std::popcount(toBits64(pos) & ~toBits64(pos2));
+
+ /// TODO Add duff device for tail?
+#endif
+
+ for (; pos < end_pos; ++pos, ++pos2)
+ count += (*pos & ~*pos2) != 0;
+
+ return count;
+}
+
+namespace
+{
+ /// Implementation details of filterArraysImpl function, used as template parameter.
+ /// Allow to build or not to build offsets array.
+
+ struct ResultOffsetsBuilder
+ {
+ PaddedPODArray<UInt64> & res_offsets;
+ XColumn::Offset current_src_offset = 0;
+
+ explicit ResultOffsetsBuilder(PaddedPODArray<UInt64> * res_offsets_) : res_offsets(*res_offsets_) {}
+
+ void reserve(ssize_t result_size_hint, size_t src_size)
+ {
+ res_offsets.reserve(result_size_hint > 0 ? result_size_hint : src_size);
+ }
+
+ void insertOne(size_t array_size)
+ {
+ current_src_offset += array_size;
+ res_offsets.push_back(current_src_offset);
+ }
+
+ template <size_t SIMD_BYTES>
+ void insertChunk(
+ const XColumn::Offset * src_offsets_pos,
+ bool first,
+ XColumn::Offset chunk_offset,
+ size_t chunk_size)
+ {
+ const auto offsets_size_old = res_offsets.size();
+ res_offsets.resize(offsets_size_old + SIMD_BYTES);
+ memcpy(&res_offsets[offsets_size_old], src_offsets_pos, SIMD_BYTES * sizeof(XColumn::Offset));
+
+ if (!first)
+ {
+ /// difference between current and actual offset
+ const auto diff_offset = chunk_offset - current_src_offset;
+
+ if (diff_offset > 0)
+ {
+ auto * res_offsets_pos = &res_offsets[offsets_size_old];
+
+ /// adjust offsets
+ for (size_t i = 0; i < SIMD_BYTES; ++i)
+ res_offsets_pos[i] -= diff_offset;
+ }
+ }
+ current_src_offset += chunk_size;
+ }
+ };
+
+ struct NoResultOffsetsBuilder
+ {
+ explicit NoResultOffsetsBuilder(PaddedPODArray<UInt64> *) {}
+ void reserve(ssize_t, size_t) {}
+ void insertOne(size_t) {}
+
+ template <size_t SIMD_BYTES>
+ void insertChunk(
+ const XColumn::Offset *,
+ bool,
+ XColumn::Offset,
+ size_t)
+ {
+ }
+ };
+
+
+ template <typename T, typename ResultOffsetsBuilder>
+ void filterArraysImplGeneric(
+ const PaddedPODArray<T> & src_elems, const PaddedPODArray<UInt64> & src_offsets,
+ PaddedPODArray<T> & res_elems, PaddedPODArray<UInt64> * res_offsets,
+ const XColumn::Filter & filt, ssize_t result_size_hint)
+ {
+ const size_t size = src_offsets.size();
+ if (size != filt.size())
+ throw Exception("Size of filter doesn't match size of column.");
+
+ ResultOffsetsBuilder result_offsets_builder(res_offsets);
+
+ if (result_size_hint)
+ {
+ result_offsets_builder.reserve(result_size_hint, size);
+
+ if (result_size_hint < 0)
+ res_elems.reserve(src_elems.size());
+ else if (result_size_hint < 1000000000 && src_elems.size() < 1000000000) /// Avoid overflow.
+ res_elems.reserve((result_size_hint * src_elems.size() + size - 1) / size);
+ }
+
+ const UInt8 * filt_pos = filt.data();
+ const auto * filt_end = filt_pos + size;
+
+ const auto * offsets_pos = src_offsets.data();
+ const auto * offsets_begin = offsets_pos;
+
+ /// copy array ending at *end_offset_ptr
+ const auto copy_array = [&] (const XColumn::Offset * offset_ptr)
+ {
+ const auto arr_offset = offset_ptr == offsets_begin ? 0 : offset_ptr[-1];
+ const auto arr_size = *offset_ptr - arr_offset;
+
+ result_offsets_builder.insertOne(arr_size);
+
+ const auto elems_size_old = res_elems.size();
+ res_elems.resize(elems_size_old + arr_size);
+ memcpy(&res_elems[elems_size_old], &src_elems[arr_offset], arr_size * sizeof(T));
+ };
+
+ #ifdef __SSE2__
+ const __m128i zero_vec = _mm_setzero_si128();
+ static constexpr size_t SIMD_BYTES = 16;
+ const auto * filt_end_aligned = filt_pos + size / SIMD_BYTES * SIMD_BYTES;
+
+ while (filt_pos < filt_end_aligned)
+ {
+ UInt16 mask = _mm_movemask_epi8(_mm_cmpeq_epi8(
+ _mm_loadu_si128(reinterpret_cast<const __m128i *>(filt_pos)),
+ zero_vec));
+ mask = ~mask;
+
+ if (mask == 0)
+ {
+ /// SIMD_BYTES consecutive rows do not pass the filter
+ }
+ else if (mask == 0xffff)
+ {
+ /// SIMD_BYTES consecutive rows pass the filter
+ const auto first = offsets_pos == offsets_begin;
+
+ const auto chunk_offset = first ? 0 : offsets_pos[-1];
+ const auto chunk_size = offsets_pos[SIMD_BYTES - 1] - chunk_offset;
+
+ result_offsets_builder.template insertChunk<SIMD_BYTES>(offsets_pos, first, chunk_offset, chunk_size);
+
+ /// copy elements for SIMD_BYTES arrays at once
+ const auto elems_size_old = res_elems.size();
+ res_elems.resize(elems_size_old + chunk_size);
+ memcpy(&res_elems[elems_size_old], &src_elems[chunk_offset], chunk_size * sizeof(T));
+ }
+ else
+ {
+ for (size_t i = 0; i < SIMD_BYTES; ++i)
+ if (filt_pos[i])
+ copy_array(offsets_pos + i);
+ }
+
+ filt_pos += SIMD_BYTES;
+ offsets_pos += SIMD_BYTES;
+ }
+ #endif
+
+ while (filt_pos < filt_end)
+ {
+ if (*filt_pos)
+ copy_array(offsets_pos);
+
+ ++filt_pos;
+ ++offsets_pos;
+ }
+ }
+}
+
+bool insertData(MutableColumn & column, const StringRef & value)
+{
+ switch (column.type()->id())
+ {
+ case arrow::Type::UINT8:
+ return insertNumber(column, *reinterpret_cast<const UInt8 *>(value.data));
+ case arrow::Type::UINT16:
+ return insertNumber(column, *reinterpret_cast<const UInt16 *>(value.data));
+ case arrow::Type::UINT32:
+ return insertNumber(column, *reinterpret_cast<const UInt32 *>(value.data));
+ case arrow::Type::UINT64:
+ return insertNumber(column, *reinterpret_cast<const UInt64 *>(value.data));
+
+ case arrow::Type::INT8:
+ return insertNumber(column, *reinterpret_cast<const Int8 *>(value.data));
+ case arrow::Type::INT16:
+ return insertNumber(column, *reinterpret_cast<const Int16 *>(value.data));
+ case arrow::Type::INT32:
+ return insertNumber(column, *reinterpret_cast<const Int32 *>(value.data));
+ case arrow::Type::INT64:
+ return insertNumber(column, *reinterpret_cast<const Int64 *>(value.data));
+
+ case arrow::Type::FLOAT:
+ return insertNumber(column, *reinterpret_cast<const float *>(value.data));
+ case arrow::Type::DOUBLE:
+ return insertNumber(column, *reinterpret_cast<const double *>(value.data));
+
+ case arrow::Type::FIXED_SIZE_BINARY:
+ return insertFixedString(column, value);
+
+ case arrow::Type::STRING:
+ case arrow::Type::BINARY:
+ return insertString(column, value);
+
+ case arrow::Type::EXTENSION: // AggregateColumn
+ break; // TODO
+
+ default:
+ break;
+ }
+
+ throw Exception(std::string(__FUNCTION__) + " unexpected type " + column.type()->ToString());
+}
+
+StringRef serializeValueIntoArena(const IColumn& column, size_t row, Arena & pool, char const *& begin)
+{
+ switch (column.type_id())
+ {
+ case arrow::Type::UINT8:
+ return serializeNumberIntoArena(assert_cast<const ColumnUInt8 &>(column).Value(row), pool, begin);
+ case arrow::Type::UINT16:
+ return serializeNumberIntoArena(assert_cast<const ColumnUInt16 &>(column).Value(row), pool, begin);
+ case arrow::Type::UINT32:
+ return serializeNumberIntoArena(assert_cast<const ColumnUInt32 &>(column).Value(row), pool, begin);
+ case arrow::Type::UINT64:
+ return serializeNumberIntoArena(assert_cast<const ColumnUInt64 &>(column).Value(row), pool, begin);
+
+ case arrow::Type::INT8:
+ return serializeNumberIntoArena(assert_cast<const ColumnInt8 &>(column).Value(row), pool, begin);
+ case arrow::Type::INT16:
+ return serializeNumberIntoArena(assert_cast<const ColumnInt16 &>(column).Value(row), pool, begin);
+ case arrow::Type::INT32:
+ return serializeNumberIntoArena(assert_cast<const ColumnInt32 &>(column).Value(row), pool, begin);
+ case arrow::Type::INT64:
+ return serializeNumberIntoArena(assert_cast<const ColumnInt64 &>(column).Value(row), pool, begin);
+
+ case arrow::Type::FLOAT:
+ return serializeNumberIntoArena(assert_cast<const ColumnFloat32 &>(column).Value(row), pool, begin);
+ case arrow::Type::DOUBLE:
+ return serializeNumberIntoArena(assert_cast<const ColumnFloat64 &>(column).Value(row), pool, begin);
+
+ case arrow::Type::FIXED_SIZE_BINARY:
+ {
+ auto str = assert_cast<const ColumnFixedString &>(column).GetView(row);
+ return serializeStringIntoArena<true>(StringRef(str.data(), str.size()), pool, begin);
+ }
+ case arrow::Type::STRING:
+ case arrow::Type::BINARY:
+ {
+ auto str = assert_cast<const ColumnBinary &>(column).GetView(row);
+ return serializeStringIntoArena<false>(StringRef(str.data(), str.size()), pool, begin);
+ }
+
+ case arrow::Type::EXTENSION: // AggregateColumn
+ break; // TODO
+
+ default:
+ break;
+ }
+
+ throw Exception(std::string(__FUNCTION__) + " unexpected type " + column.type()->ToString());
+}
+
+const char * deserializeAndInsertFromArena(MutableColumn& column, const char * pos)
+{
+ switch (column.type()->id())
+ {
+ case arrow::Type::UINT8:
+ return deserializeNumberFromArena(assert_cast<MutableColumnUInt8 &>(column), pos);
+ case arrow::Type::UINT16:
+ return deserializeNumberFromArena(assert_cast<MutableColumnUInt16 &>(column), pos);
+ case arrow::Type::UINT32:
+ return deserializeNumberFromArena(assert_cast<MutableColumnUInt32 &>(column), pos);
+ case arrow::Type::UINT64:
+ return deserializeNumberFromArena(assert_cast<MutableColumnUInt64 &>(column), pos);
+
+ case arrow::Type::INT8:
+ return deserializeNumberFromArena(assert_cast<MutableColumnInt8 &>(column), pos);
+ case arrow::Type::INT16:
+ return deserializeNumberFromArena(assert_cast<MutableColumnInt16 &>(column), pos);
+ case arrow::Type::INT32:
+ return deserializeNumberFromArena(assert_cast<MutableColumnInt32 &>(column), pos);
+ case arrow::Type::INT64:
+ return deserializeNumberFromArena(assert_cast<MutableColumnInt64 &>(column), pos);
+
+ case arrow::Type::FLOAT:
+ return deserializeNumberFromArena(assert_cast<MutableColumnFloat32 &>(column), pos);
+ case arrow::Type::DOUBLE:
+ return deserializeNumberFromArena(assert_cast<MutableColumnFloat64 &>(column), pos);
+
+ case arrow::Type::FIXED_SIZE_BINARY:
+ return deserializeStringFromArena(assert_cast<MutableColumnFixedString &>(column), pos);
+
+ case arrow::Type::STRING:
+ case arrow::Type::BINARY:
+ return deserializeStringFromArena(assert_cast<MutableColumnBinary &>(column), pos);
+
+ case arrow::Type::EXTENSION: // AggregateColumn
+ break; // TODO
+
+ default:
+ break;
+ }
+
+ throw Exception(std::string(__FUNCTION__) + " unexpected type " + column.type()->ToString());
+}
+
+void updateHashWithValue(const IColumn& column, size_t row, SipHash & hash)
+{
+ switch (column.type_id())
+ {
+ case arrow::Type::UINT8:
+ return hash.update(assert_cast<const ColumnUInt8 &>(column).Value(row));
+ case arrow::Type::UINT16:
+ return hash.update(assert_cast<const ColumnUInt16 &>(column).Value(row));
+ case arrow::Type::UINT32:
+ return hash.update(assert_cast<const ColumnUInt32 &>(column).Value(row));
+ case arrow::Type::UINT64:
+ return hash.update(assert_cast<const ColumnUInt64 &>(column).Value(row));
+
+ case arrow::Type::INT8:
+ return hash.update(assert_cast<const ColumnInt8 &>(column).Value(row));
+ case arrow::Type::INT16:
+ return hash.update(assert_cast<const ColumnInt16 &>(column).Value(row));
+ case arrow::Type::INT32:
+ return hash.update(assert_cast<const ColumnInt32 &>(column).Value(row));
+ case arrow::Type::INT64:
+ return hash.update(assert_cast<const ColumnInt64 &>(column).Value(row));
+
+ case arrow::Type::FLOAT:
+ return hash.update(assert_cast<const ColumnFloat32 &>(column).Value(row));
+ case arrow::Type::DOUBLE:
+ return hash.update(assert_cast<const ColumnFloat64 &>(column).Value(row));
+
+ case arrow::Type::FIXED_SIZE_BINARY:
+ {
+ auto str = assert_cast<const ColumnFixedString &>(column).GetView(row);
+ return hash.update(str.data(), str.size());
+ }
+ case arrow::Type::STRING:
+ case arrow::Type::BINARY:
+ {
+ auto str = assert_cast<const ColumnBinary &>(column).GetView(row);
+ return hash.update(str.data(), str.size());
+ }
+
+ case arrow::Type::EXTENSION: // AggregateColumn
+ break; // TODO
+
+ default:
+ break;
+ }
+
+ throw Exception(std::string(__FUNCTION__) + " unexpected type " + column.type()->ToString());
+}
+
+MutableColumnPtr createMutableColumn(const DataTypePtr & type)
+{
+ switch (type->id())
+ {
+ case arrow::Type::UINT8:
+ return std::make_shared<MutableColumnUInt8>();
+ case arrow::Type::UINT16:
+ return std::make_shared<MutableColumnUInt16>();
+ case arrow::Type::UINT32:
+ return std::make_shared<MutableColumnUInt32>();
+ case arrow::Type::UINT64:
+ return std::make_shared<MutableColumnUInt64>();
+
+ case arrow::Type::INT8:
+ return std::make_shared<MutableColumnInt8>();
+ case arrow::Type::INT16:
+ return std::make_shared<MutableColumnInt16>();
+ case arrow::Type::INT32:
+ return std::make_shared<MutableColumnInt32>();
+ case arrow::Type::INT64:
+ return std::make_shared<MutableColumnInt64>();
+
+ case arrow::Type::FLOAT:
+ return std::make_shared<MutableColumnFloat32>();
+ case arrow::Type::DOUBLE:
+ return std::make_shared<MutableColumnFloat64>();
+
+ case arrow::Type::FIXED_SIZE_BINARY:
+ return std::make_shared<MutableColumnFixedString>(type);
+
+ case arrow::Type::BINARY:
+ return std::make_shared<MutableColumnBinary>();
+ case arrow::Type::STRING:
+ return std::make_shared<MutableColumnString>();
+
+ case arrow::Type::EXTENSION: // AggregateColumn
+ break; // TODO: do we really need it here?
+
+ default:
+ break;
+ }
+
+ throw Exception(std::string(__FUNCTION__) + " unexpected type " + type->ToString());
+}
+
+uint32_t fixedContiguousSize(const DataTypePtr & type)
+{
+ switch (type->id())
+ {
+ case arrow::Type::UINT8:
+ return 1;
+ case arrow::Type::UINT16:
+ return 2;
+ case arrow::Type::UINT32:
+ return 4;
+ case arrow::Type::UINT64:
+ return 8;
+ case arrow::Type::INT8:
+ return 1;
+ case arrow::Type::INT16:
+ return 2;
+ case arrow::Type::INT32:
+ return 4;
+ case arrow::Type::INT64:
+ return 8;
+ case arrow::Type::FLOAT:
+ return 4;
+ case arrow::Type::DOUBLE:
+ return 8;
+
+ case arrow::Type::FIXED_SIZE_BINARY:
+ return std::static_pointer_cast<DataTypeFixedString>(type)->byte_width();
+
+ case arrow::Type::STRING:
+ case arrow::Type::BINARY:
+ break;
+
+ case arrow::Type::EXTENSION: // AggregateColumn
+ break;
+
+ default:
+ break;
+ }
+
+ return 0;
+}
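+// Illustrative sketch of how these helpers compose (the uint64 column type is an
+// arbitrary example; MutableColumnUInt64 is assumed to be the matching Arrow builder):
+//
+//     auto col = createMutableColumn(arrow::uint64());
+//     UInt64 value = 42;
+//     insertData(*col, StringRef(reinterpret_cast<const char *>(&value), sizeof(value)));
+//     // fixedContiguousSize(arrow::uint64()) == 8, so such keys can be packed into fixed-size key rows.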
+
+}
diff --git a/ydb/library/arrow_clickhouse/Columns/ColumnsCommon.h b/ydb/library/arrow_clickhouse/Columns/ColumnsCommon.h
new file mode 100644
index 00000000000..5eb633d15a0
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/Columns/ColumnsCommon.h
@@ -0,0 +1,120 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#pragma once
+#include "arrow_clickhouse_types.h"
+
+#include <Common/SipHash.h>
+#include <Common/Arena.h>
+
+#include <common/StringRef.h>
+
+/// Common helper methods for implementation of different columns.
+
+namespace CH
+{
+
+/// Counts how many bytes of `filt` are greater than zero.
+size_t countBytesInFilter(const uint8_t * filt, size_t start, size_t end);
+size_t countBytesInFilterWithNull(const uint8_t * filt, const uint8_t * null_map, size_t start, size_t end);
+
+template <typename T>
+inline StringRef serializeNumberIntoArena(T value, Arena & arena, char const *& begin)
+{
+ auto * pos = arena.allocContinue(sizeof(T), begin);
+ unalignedStore<T>(pos, value);
+ return StringRef(pos, sizeof(T));
+}
+
+template <bool fixed>
+inline StringRef serializeStringIntoArena(const StringRef & str, Arena & arena, char const *& begin)
+{
+ if constexpr (fixed)
+ {
+ auto * pos = arena.allocContinue(str.size, begin);
+ memcpy(pos, str.data, str.size);
+ return StringRef(pos, str.size);
+ }
+ else
+ {
+ StringRef res;
+ res.size = sizeof(str.size) + str.size;
+ char * pos = arena.allocContinue(res.size, begin);
+ memcpy(pos, &str.size, sizeof(str.size));
+ memcpy(pos + sizeof(str.size), str.data, str.size);
+ res.data = pos;
+ return res;
+ }
+}
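+/// A minimal sketch of building one contiguous composite key with the helpers above
+/// (values are arbitrary):
+///
+///     Arena pool;
+///     char const * begin = nullptr;
+///     StringRef part1 = serializeNumberIntoArena<UInt64>(42, pool, begin);
+///     StringRef part2 = serializeStringIntoArena<false>(StringRef("abc", 3), pool, begin);
+///     // 'begin' now points at one contiguous region: 8 bytes of the number,
+///     // then a size_t length prefix, then the bytes "abc".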
+
+template <typename T>
+inline bool insertNumber(MutableColumn & column, T value)
+{
+ if constexpr (std::is_same_v<T, UInt8>)
+ return assert_cast<MutableColumnUInt8 &>(column).Append(value).ok();
+ else if constexpr (std::is_same_v<T, UInt16>)
+ return assert_cast<MutableColumnUInt16 &>(column).Append(value).ok();
+ else if constexpr (std::is_same_v<T, UInt32>)
+ return assert_cast<MutableColumnUInt32 &>(column).Append(value).ok();
+ else if constexpr (std::is_same_v<T, UInt64>)
+ return assert_cast<MutableColumnUInt64 &>(column).Append(value).ok();
+ else if constexpr (std::is_same_v<T, Int8>)
+ return assert_cast<MutableColumnInt8 &>(column).Append(value).ok();
+ else if constexpr (std::is_same_v<T, Int16>)
+ return assert_cast<MutableColumnInt16 &>(column).Append(value).ok();
+ else if constexpr (std::is_same_v<T, Int32>)
+ return assert_cast<MutableColumnInt32 &>(column).Append(value).ok();
+ else if constexpr (std::is_same_v<T, Int64>)
+ return assert_cast<MutableColumnInt64 &>(column).Append(value).ok();
+ else if constexpr (std::is_same_v<T, float>)
+ return assert_cast<MutableColumnFloat32 &>(column).Append(value).ok();
+ else if constexpr (std::is_same_v<T, double>)
+ return assert_cast<MutableColumnFloat64 &>(column).Append(value).ok();
+
+ throw Exception("unexpected type");
+}
+
+inline bool insertString(MutableColumn & column, const StringRef & value)
+{
+ return assert_cast<MutableColumnBinary &>(column).Append(arrow::util::string_view{value.data, value.size}).ok();
+}
+
+inline bool insertFixedString(MutableColumn & column, const StringRef & value)
+{
+ return assert_cast<MutableColumnFixedString &>(column).Append(arrow::util::string_view{value.data, value.size}).ok();
+}
+
+template <typename DataType>
+inline const char * deserializeNumberFromArena(arrow::NumericBuilder<DataType> & column, const char * pos)
+{
+ using T = typename arrow::TypeTraits<DataType>::CType;
+
+ T value = unalignedLoad<T>(pos);
+ column.Append(value).ok();
+ return pos + sizeof(T);
+}
+
+inline const char * deserializeStringFromArena(MutableColumnBinary & column, const char * pos)
+{
+ const size_t string_size = unalignedLoad<size_t>(pos);
+ pos += sizeof(string_size);
+
+ column.Append(pos, string_size).ok();
+ return pos + string_size;
+}
+
+inline const char * deserializeStringFromArena(MutableColumnFixedString & column, const char * pos)
+{
+ column.Append(pos).ok();
+ return pos + column.byte_width();
+}
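+/// Illustrative round trip through the dispatching helpers declared below, assuming
+/// 'src' is a string IColumn and 'dst' a MutableColumn of the same type:
+///
+///     Arena pool;
+///     char const * begin = nullptr;
+///     StringRef key = serializeValueIntoArena(src, /*row=*/0, pool, begin);
+///     deserializeAndInsertFromArena(dst, key.data);   // appends row 0 of 'src' to 'dst'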
+
+bool insertData(MutableColumn & column, const StringRef & value);
+StringRef serializeValueIntoArena(const IColumn& column, size_t row, Arena & pool, char const *& begin);
+const char * deserializeAndInsertFromArena(MutableColumn& column, const char * pos);
+void updateHashWithValue(const IColumn& column, size_t row, SipHash & hash);
+MutableColumnPtr createMutableColumn(const DataTypePtr & type);
+uint32_t fixedContiguousSize(const DataTypePtr & type);
+
+}
diff --git a/ydb/library/arrow_clickhouse/Columns/ColumnsHashing.h b/ydb/library/arrow_clickhouse/Columns/ColumnsHashing.h
new file mode 100644
index 00000000000..dd3ce650f07
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/Columns/ColumnsHashing.h
@@ -0,0 +1,293 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#pragma once
+#include "arrow_clickhouse_types.h"
+
+#include <Common/Arena.h>
+#include <Common/PODArray.h>
+#include <Common/HashTable/HashTable.h>
+#include <Common/HashTable/HashTableKeyHolder.h>
+#include <Columns/ColumnsHashingImpl.h>
+
+#include <common/unaligned.h>
+
+#include <memory>
+#include <cassert>
+
+
+namespace CH
+{
+
+namespace ColumnsHashing
+{
+
+/// For the case when there is one numeric key.
+/// UInt8/16/32/64 for any type with corresponding bit width.
+template <typename Value, typename Mapped, typename FieldType, bool use_cache = true, bool need_offset = false>
+struct HashMethodOneNumber
+ : public columns_hashing_impl::HashMethodBase<HashMethodOneNumber<Value, Mapped, FieldType, use_cache, need_offset>, Value, Mapped, use_cache, need_offset>
+{
+ using Self = HashMethodOneNumber<Value, Mapped, FieldType, use_cache, need_offset>;
+ using Base = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache, need_offset>;
+
+ const uint8_t * vec{};
+
+ /// If the keys are of fixed length, key_sizes contains their lengths; otherwise it is empty.
+ HashMethodOneNumber(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const HashMethodContextPtr &)
+ {
+ vec = assert_cast<const ColumnUInt8 *>(key_columns[0])->raw_values();
+ }
+
+ HashMethodOneNumber(const IColumn * column)
+ {
+ vec = assert_cast<const ColumnUInt8 *>(column)->raw_values();
+ }
+
+ /// Creates context. Method is called once and result context is used in all threads.
+ using Base::createContext; /// (const HashMethodContext::Settings &) -> HashMethodContextPtr
+
+ /// Emplace key into HashTable or HashMap. If Data is HashMap, returns ptr to value, otherwise nullptr.
+ /// Data is the HashTable into which the key from the column's row is inserted.
+ /// For Serialized method, key may be placed in pool.
+ using Base::emplaceKey; /// (Data & data, size_t row, Arena & pool) -> EmplaceResult
+
+ /// Find key in HashTable or HashMap. If Data is a HashMap and the key was found, returns a ptr to the value, otherwise nullptr.
+ using Base::findKey; /// (Data & data, size_t row, Arena & pool) -> FindResult
+
+ /// Get hash value of row.
+ using Base::getHash; /// (const Data & data, size_t row, Arena & pool) -> size_t
+
+ /// Is used for default implementation in HashMethodBase.
+ FieldType getKeyHolder(size_t row, Arena &) const { return unalignedLoad<FieldType>(vec + row * sizeof(FieldType)); }
+
+ const FieldType * getKeyData() const { return reinterpret_cast<const FieldType *>(vec); }
+};
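+/// Sketch of the intended usage with an aggregation hash map; HashMap<UInt64, UInt64>
+/// and 'key_column' (an IColumn holding the UInt64 key values) are assumed placeholders:
+///
+///     HashMap<UInt64, UInt64> map;
+///     Arena pool;
+///     HashMethodOneNumber<HashMap<UInt64, UInt64>::value_type, UInt64, UInt64> method(&key_column);
+///     auto result = method.emplaceKey(map, /*row=*/0, pool);
+///     if (result.isInserted())
+///         result.getMapped() = 0;
+///     ++result.getMapped();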
+
+
+/// For the case when there is one string key.
+template <typename Value, typename Mapped, bool place_string_to_arena = true, bool use_cache = true, bool need_offset = false>
+struct HashMethodString
+ : public columns_hashing_impl::HashMethodBase<HashMethodString<Value, Mapped, place_string_to_arena, use_cache, need_offset>, Value, Mapped, use_cache, need_offset>
+{
+ using Self = HashMethodString<Value, Mapped, place_string_to_arena, use_cache, need_offset>;
+ using Base = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache, need_offset>;
+
+ const int * offsets{};
+ const uint8_t * chars{};
+
+ HashMethodString(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const HashMethodContextPtr &)
+ {
+ const IColumn & column = *key_columns[0];
+ const auto & column_string = assert_cast<const ColumnBinary &>(column);
+ offsets = column_string.raw_value_offsets();
+ chars = column_string.raw_data();
+ }
+
+ auto getKeyHolder(ssize_t row, [[maybe_unused]] Arena & pool) const
+ {
+ StringRef key(chars + offsets[row - 1], offsets[row] - offsets[row - 1] - 1);
+
+ if constexpr (place_string_to_arena)
+ {
+ return ArenaKeyHolder{key, pool};
+ }
+ else
+ {
+ return key;
+ }
+ }
+
+protected:
+ friend class columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache>;
+};
+
+
+/// For the case when there is one fixed-length string key.
+template <typename Value, typename Mapped, bool place_string_to_arena = true, bool use_cache = true, bool need_offset = false>
+struct HashMethodFixedString
+ : public columns_hashing_impl::
+ HashMethodBase<HashMethodFixedString<Value, Mapped, place_string_to_arena, use_cache, need_offset>, Value, Mapped, use_cache, need_offset>
+{
+ using Self = HashMethodFixedString<Value, Mapped, place_string_to_arena, use_cache, need_offset>;
+ using Base = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache, need_offset>;
+
+ size_t n{};
+ const uint8_t * chars{};
+
+ HashMethodFixedString(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const HashMethodContextPtr &)
+ {
+ const IColumn & column = *key_columns[0];
+ const ColumnFixedString & column_string = assert_cast<const ColumnFixedString &>(column);
+ n = column_string.byte_width();
+ chars = column_string.raw_values();
+ }
+
+ auto getKeyHolder(size_t row, [[maybe_unused]] Arena & pool) const
+ {
+ StringRef key(&chars[row * n], n);
+
+ if constexpr (place_string_to_arena)
+ {
+ return ArenaKeyHolder{key, pool};
+ }
+ else
+ {
+ return key;
+ }
+ }
+
+protected:
+ friend class columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache>;
+};
+
+
+/// For the case when all keys are of fixed length, and they fit in N (for example, 128) bits.
+template <
+ typename Value,
+ typename Key,
+ typename Mapped,
+ bool has_nullable_keys_ = false,
+ bool has_low_cardinality_ = false,
+ bool use_cache = true,
+ bool need_offset = false>
+struct HashMethodKeysFixed
+ : private columns_hashing_impl::BaseStateKeysFixed<Key, has_nullable_keys_>
+ , public columns_hashing_impl::HashMethodBase<HashMethodKeysFixed<Value, Key, Mapped, has_nullable_keys_, has_low_cardinality_, use_cache, need_offset>, Value, Mapped, use_cache, need_offset>
+{
+ using Self = HashMethodKeysFixed<Value, Key, Mapped, has_nullable_keys_, has_low_cardinality_, use_cache, need_offset>;
+ using BaseHashed = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache, need_offset>;
+ using Base = columns_hashing_impl::BaseStateKeysFixed<Key, has_nullable_keys_>;
+
+ static constexpr bool has_nullable_keys = has_nullable_keys_;
+
+ Sizes key_sizes;
+ size_t keys_size;
+
+#if 0 // shuffleKeyColumns disabled
+ PaddedPODArray<Key> prepared_keys;
+
+ static bool usePreparedKeys(const Sizes & key_sizes)
+ {
+ if (has_nullable_keys || sizeof(Key) > 16)
+ return false;
+
+ for (auto size : key_sizes)
+ if (size != 1 && size != 2 && size != 4 && size != 8 && size != 16)
+ return false;
+
+ return true;
+ }
+#endif
+
+ HashMethodKeysFixed(const ColumnRawPtrs & key_columns, const Sizes & key_sizes_, const HashMethodContextPtr &)
+ : Base(key_columns), key_sizes(std::move(key_sizes_)), keys_size(key_columns.size())
+ {
+#if 0
+ if (usePreparedKeys(key_sizes))
+ packFixedBatch(keys_size, Base::getActualColumns(), key_sizes, prepared_keys);
+#endif
+ }
+
+ ALWAYS_INLINE Key getKeyHolder(size_t row, Arena &) const
+ {
+ if constexpr (has_nullable_keys)
+ {
+ auto bitmap = Base::createBitmap(row);
+ return packFixed<Key>(row, keys_size, Base::getActualColumns(), key_sizes, bitmap);
+ }
+ else
+ {
+#if 0
+ if (!prepared_keys.empty())
+ return prepared_keys[row];
+#endif
+ return packFixed<Key>(row, keys_size, Base::getActualColumns(), key_sizes);
+ }
+ }
+#if 0
+ static std::optional<Sizes> shuffleKeyColumns(std::vector<IColumn *> & key_columns, const Sizes & key_sizes)
+ {
+ if (!usePreparedKeys(key_sizes))
+ return {};
+
+ std::vector<IColumn *> new_columns;
+ new_columns.reserve(key_columns.size());
+
+ Sizes new_sizes;
+ auto fill_size = [&](size_t size)
+ {
+ for (size_t i = 0; i < key_sizes.size(); ++i)
+ {
+ if (key_sizes[i] == size)
+ {
+ new_columns.push_back(key_columns[i]);
+ new_sizes.push_back(size);
+ }
+ }
+ };
+
+ fill_size(16);
+ fill_size(8);
+ fill_size(4);
+ fill_size(2);
+ fill_size(1);
+
+ key_columns.swap(new_columns);
+ return new_sizes;
+ }
+#endif
+};
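+/// Illustrative key layout (assuming packFixed concatenates the raw key bytes, as in
+/// upstream ClickHouse): with key_sizes {4, 2} and Key = UInt64, the row values
+/// (UInt32)0x11223344, (UInt16)0x5566 occupy the first six bytes of the packed key;
+/// for nullable keys the bitmap from createBitmap() is packed into the key as well.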
+
+/** Hash by concatenating serialized key values.
+ * The serialized value differs in that it can be deserialized unambiguously given only the position at which it starts.
+ * That is, for example, for strings, it contains first the serialized length of the string, and then the bytes.
+ * Therefore, when aggregating by several strings, there is no ambiguity.
+ */
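+/** For example, the two-column string keys ("ab", "c") and ("a", "bc") serialize to
+ * different byte sequences, because each string is prefixed with its length.
+ */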
+template <typename Value, typename Mapped>
+struct HashMethodSerialized
+ : public columns_hashing_impl::HashMethodBase<HashMethodSerialized<Value, Mapped>, Value, Mapped, false>
+{
+ using Self = HashMethodSerialized<Value, Mapped>;
+ using Base = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, false>;
+
+ ColumnRawPtrs key_columns;
+ size_t keys_size;
+
+ HashMethodSerialized(const ColumnRawPtrs & key_columns_, const Sizes & /*key_sizes*/, const HashMethodContextPtr &)
+ : key_columns(key_columns_), keys_size(key_columns_.size()) {}
+
+protected:
+ friend class columns_hashing_impl::HashMethodBase<Self, Value, Mapped, false>;
+
+ ALWAYS_INLINE SerializedKeyHolder getKeyHolder(size_t row, Arena & pool) const
+ {
+ return SerializedKeyHolder{
+ serializeKeysToPoolContiguous(row, keys_size, key_columns, pool),
+ pool};
+ }
+};
+
+/// For the case when the key is a 128-bit hash of all key columns.
+template <typename Value, typename Mapped, bool use_cache = true, bool need_offset = false>
+struct HashMethodHashed
+ : public columns_hashing_impl::HashMethodBase<HashMethodHashed<Value, Mapped, use_cache, need_offset>, Value, Mapped, use_cache, need_offset>
+{
+ using Key = UInt128;
+ using Self = HashMethodHashed<Value, Mapped, use_cache, need_offset>;
+ using Base = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache, need_offset>;
+
+ ColumnRawPtrs key_columns;
+
+ HashMethodHashed(ColumnRawPtrs key_columns_, const Sizes &, const HashMethodContextPtr &)
+ : key_columns(std::move(key_columns_)) {}
+
+ ALWAYS_INLINE Key getKeyHolder(size_t row, Arena &) const
+ {
+ return hash128(row, key_columns.size(), key_columns);
+ }
+};
+
+}
+}
diff --git a/ydb/library/arrow_clickhouse/Columns/ColumnsHashingImpl.h b/ydb/library/arrow_clickhouse/Columns/ColumnsHashingImpl.h
new file mode 100644
index 00000000000..b3f2adf2006
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/Columns/ColumnsHashingImpl.h
@@ -0,0 +1,375 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#pragma once
+#include "arrow_clickhouse_types.h"
+
+#include "AggregationCommon.h"
+#include <Common/HashTable/HashTableKeyHolder.h>
+
+namespace CH
+{
+
+namespace ColumnsHashing
+{
+
+/// Generic context for HashMethod. Context is shared between multiple threads, all methods must be thread-safe.
+/// Is used for caching.
+class HashMethodContext
+{
+public:
+ virtual ~HashMethodContext() = default;
+
+ struct Settings
+ {
+ size_t max_threads;
+ };
+};
+
+using HashMethodContextPtr = std::shared_ptr<HashMethodContext>;
+
+
+namespace columns_hashing_impl
+{
+
+template <typename Value, bool consecutive_keys_optimization_>
+struct LastElementCache
+{
+ static constexpr bool consecutive_keys_optimization = consecutive_keys_optimization_;
+ Value value;
+ bool empty = true;
+ bool found = false;
+
+ bool check(const Value & value_) { return !empty && value == value_; }
+
+ template <typename Key>
+ bool check(const Key & key) { return !empty && value.first == key; }
+};
+
+template <typename Data>
+struct LastElementCache<Data, false>
+{
+ static constexpr bool consecutive_keys_optimization = false;
+};
+
+template <typename Mapped>
+class EmplaceResultImpl
+{
+ Mapped & value;
+ Mapped & cached_value;
+ bool inserted;
+
+public:
+ EmplaceResultImpl(Mapped & value_, Mapped & cached_value_, bool inserted_)
+ : value(value_), cached_value(cached_value_), inserted(inserted_) {}
+
+ bool isInserted() const { return inserted; }
+ auto & getMapped() const { return value; }
+
+ void setMapped(const Mapped & mapped)
+ {
+ cached_value = mapped;
+ value = mapped;
+ }
+};
+
+template <>
+class EmplaceResultImpl<void>
+{
+ bool inserted;
+
+public:
+ explicit EmplaceResultImpl(bool inserted_) : inserted(inserted_) {}
+ bool isInserted() const { return inserted; }
+};
+
+/// FindResult may optionally contain a pointer to the value and its offset in the hash table buffer.
+/// Only the bool `found` is required,
+/// so there are 4 different specializations of FindResultImpl.
+class FindResultImplBase
+{
+ bool found;
+
+public:
+ explicit FindResultImplBase(bool found_) : found(found_) {}
+ bool isFound() const { return found; }
+};
+
+template <bool need_offset = false>
+class FindResultImplOffsetBase
+{
+public:
+ constexpr static bool has_offset = need_offset;
+ explicit FindResultImplOffsetBase(size_t /* off */) {}
+};
+
+template <>
+class FindResultImplOffsetBase<true>
+{
+ size_t offset;
+public:
+ constexpr static bool has_offset = true;
+
+ explicit FindResultImplOffsetBase(size_t off) : offset(off) {}
+ ALWAYS_INLINE size_t getOffset() const { return offset; }
+};
+
+template <typename Mapped, bool need_offset = false>
+class FindResultImpl : public FindResultImplBase, public FindResultImplOffsetBase<need_offset>
+{
+ Mapped * value;
+
+public:
+ FindResultImpl()
+ : FindResultImplBase(false), FindResultImplOffsetBase<need_offset>(0)
+ {}
+
+ FindResultImpl(Mapped * value_, bool found_, size_t off)
+ : FindResultImplBase(found_), FindResultImplOffsetBase<need_offset>(off), value(value_) {}
+ Mapped & getMapped() const { return *value; }
+};
+
+template <bool need_offset>
+class FindResultImpl<void, need_offset> : public FindResultImplBase, public FindResultImplOffsetBase<need_offset>
+{
+public:
+ FindResultImpl(bool found_, size_t off) : FindResultImplBase(found_), FindResultImplOffsetBase<need_offset>(off) {}
+};
+
+template <typename Derived, typename Value, typename Mapped, bool consecutive_keys_optimization, bool need_offset = false>
+class HashMethodBase
+{
+public:
+ using EmplaceResult = EmplaceResultImpl<Mapped>;
+ using FindResult = FindResultImpl<Mapped, need_offset>;
+ static constexpr bool has_mapped = !std::is_same<Mapped, void>::value;
+ using Cache = LastElementCache<Value, consecutive_keys_optimization>;
+
+ static HashMethodContextPtr createContext(const HashMethodContext::Settings &) { return nullptr; }
+
+ template <typename Data>
+ ALWAYS_INLINE EmplaceResult emplaceKey(Data & data, size_t row, Arena & pool)
+ {
+ auto key_holder = static_cast<Derived &>(*this).getKeyHolder(row, pool);
+ return emplaceImpl(key_holder, data);
+ }
+
+ template <typename Data>
+ ALWAYS_INLINE FindResult findKey(Data & data, size_t row, Arena & pool)
+ {
+ auto key_holder = static_cast<Derived &>(*this).getKeyHolder(row, pool);
+ return findKeyImpl(keyHolderGetKey(key_holder), data);
+ }
+
+ template <typename Data>
+ ALWAYS_INLINE size_t getHash(const Data & data, size_t row, Arena & pool)
+ {
+ auto key_holder = static_cast<Derived &>(*this).getKeyHolder(row, pool);
+ return data.hash(keyHolderGetKey(key_holder));
+ }
+
+protected:
+ Cache cache;
+
+ HashMethodBase()
+ {
+ if constexpr (consecutive_keys_optimization)
+ {
+ if constexpr (has_mapped)
+ {
+ /// Init PairNoInit elements.
+ cache.value.second = Mapped();
+ cache.value.first = {};
+ }
+ else
+ cache.value = Value();
+ }
+ }
+
+ template <typename Data, typename KeyHolder>
+ ALWAYS_INLINE EmplaceResult emplaceImpl(KeyHolder & key_holder, Data & data)
+ {
+ if constexpr (Cache::consecutive_keys_optimization)
+ {
+ if (cache.found && cache.check(keyHolderGetKey(key_holder)))
+ {
+ if constexpr (has_mapped)
+ return EmplaceResult(cache.value.second, cache.value.second, false);
+ else
+ return EmplaceResult(false);
+ }
+ }
+
+ typename Data::LookupResult it;
+ bool inserted = false;
+ data.emplace(key_holder, it, inserted);
+
+ [[maybe_unused]] Mapped * cached = nullptr;
+ if constexpr (has_mapped)
+ cached = &it->getMapped();
+
+ if (inserted)
+ {
+ if constexpr (has_mapped)
+ {
+ new (&it->getMapped()) Mapped();
+ }
+ }
+
+ if constexpr (consecutive_keys_optimization)
+ {
+ cache.found = true;
+ cache.empty = false;
+
+ if constexpr (has_mapped)
+ {
+ cache.value.first = it->getKey();
+ cache.value.second = it->getMapped();
+ cached = &cache.value.second;
+ }
+ else
+ {
+ cache.value = it->getKey();
+ }
+ }
+
+ if constexpr (has_mapped)
+ return EmplaceResult(it->getMapped(), *cached, inserted);
+ else
+ return EmplaceResult(inserted);
+ }
+
+ template <typename Data, typename Key>
+ ALWAYS_INLINE FindResult findKeyImpl(Key key, Data & data)
+ {
+ if constexpr (Cache::consecutive_keys_optimization)
+ {
+ /// It's possible to support such a combination, but the code would become more complex.
+ /// For now there is no place where we need these options enabled together.
+ static_assert(!FindResult::has_offset, "`consecutive_keys_optimization` and `has_offset` are conflicting options");
+ if (cache.check(key))
+ {
+ if constexpr (has_mapped)
+ return FindResult(&cache.value.second, cache.found, 0);
+ else
+ return FindResult(cache.found, 0);
+ }
+ }
+
+ auto it = data.find(key);
+
+ if constexpr (consecutive_keys_optimization)
+ {
+ cache.found = it != nullptr;
+ cache.empty = false;
+
+ if constexpr (has_mapped)
+ {
+ cache.value.first = key;
+ if (it)
+ {
+ cache.value.second = it->getMapped();
+ }
+ }
+ else
+ {
+ cache.value = key;
+ }
+ }
+
+ size_t offset = 0;
+ if constexpr (FindResult::has_offset)
+ {
+ offset = it ? data.offsetInternal(it) : 0;
+ }
+ if constexpr (has_mapped)
+ return FindResult(it ? &it->getMapped() : nullptr, it != nullptr, offset);
+ else
+ return FindResult(it != nullptr, offset);
+ }
+};
+
+
+template <typename T>
+struct MappedCache : public PaddedPODArray<T> {};
+
+template <>
+struct MappedCache<void> {};
+
+
+/// This class is designed to provide the functionality that is required for
+/// supporting nullable keys in HashMethodKeysFixed. If there are
+/// no nullable keys, this class is merely implemented as an empty shell.
+template <typename Key, bool has_nullable_keys>
+class BaseStateKeysFixed;
+
+/// Case where nullable keys are supported.
+template <typename Key>
+class BaseStateKeysFixed<Key, true>
+{
+protected:
+ BaseStateKeysFixed(const ColumnRawPtrs & key_columns)
+ {
+ actual_columns.reserve(key_columns.size());
+ for (const auto & col : key_columns)
+ {
+ actual_columns.push_back(col);
+ }
+ }
+
+ /// Return the columns which actually contain the values of the keys.
+ /// For a given key column, if it is nullable, we return its nested
+ /// column. Otherwise we return the key column itself.
+ inline const ColumnRawPtrs & getActualColumns() const
+ {
+ return actual_columns;
+ }
+
+ /// Create a bitmap that indicates whether, for a particular row,
+ /// a key column bears a null value or not.
+ KeysNullMap<Key> createBitmap(size_t row) const
+ {
+ KeysNullMap<Key> bitmap{};
+
+ for (size_t k = 0; k < actual_columns.size(); ++k)
+ {
+ bool is_null = actual_columns[k]->IsNull(row);
+ if (is_null)
+ {
+ size_t bucket = k / 8;
+ size_t offset = k % 8;
+ bitmap[bucket] |= UInt8(1) << offset;
+ }
+ }
+
+ return bitmap;
+ }
+
+private:
+ ColumnRawPtrs actual_columns;
+};
+
+/// Case where nullable keys are not supported.
+template <typename Key>
+class BaseStateKeysFixed<Key, false>
+{
+protected:
+ BaseStateKeysFixed(const ColumnRawPtrs & columns) : actual_columns(columns) {}
+
+ const ColumnRawPtrs & getActualColumns() const { return actual_columns; }
+
+ KeysNullMap<Key> createBitmap(size_t) const
+ {
+ throw Exception{"Internal error: calling createBitmap() for non-nullable keys is forbidden"};
+ }
+
+private:
+ ColumnRawPtrs actual_columns;
+};
+
+}
+
+}
+
+}
diff --git a/ydb/library/arrow_clickhouse/Common/Allocator.cpp b/ydb/library/arrow_clickhouse/Common/Allocator.cpp
new file mode 100644
index 00000000000..96a2ae6ad0a
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/Common/Allocator.cpp
@@ -0,0 +1,35 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#include <Common/Allocator.h>
+
+namespace CH
+{
+
+/** Keep definition of this constant in cpp file; otherwise its value
+ * is inlined into allocator code making it impossible to override it
+ * in third-party code.
+ *
+ * Note: extern may seem redundant, but is actually needed due to bug in GCC.
+ * See also: https://gcc.gnu.org/legacy-ml/gcc-help/2017-12/msg00021.html
+ */
+#ifdef NDEBUG
+ __attribute__((__weak__)) extern const size_t MMAP_THRESHOLD = 64 * (1ULL << 20);
+#else
+ /**
+ * In debug build, use small mmap threshold to reproduce more memory
+ * stomping bugs. Along with ASLR it will hopefully detect more issues than
+ * ASan. The program may fail due to the limit on number of memory mappings.
+ *
+ * Not too small to avoid too quick exhaust of memory mappings.
+ */
+ __attribute__((__weak__)) extern const size_t MMAP_THRESHOLD = 16384;
+#endif
+
+template class Allocator<false, false>;
+template class Allocator<true, false>;
+template class Allocator<false, true>;
+template class Allocator<true, true>;
+
+}
diff --git a/ydb/library/arrow_clickhouse/Common/Allocator.h b/ydb/library/arrow_clickhouse/Common/Allocator.h
new file mode 100644
index 00000000000..b9c1aa1247c
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/Common/Allocator.h
@@ -0,0 +1,344 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#pragma once
+
+#include <string.h>
+
+#ifdef NDEBUG
+ #define ALLOCATOR_ASLR 0
+#else
+ #define ALLOCATOR_ASLR 1
+#endif
+
+#if !defined(__APPLE__) && !defined(__FreeBSD__)
+#include <malloc.h>
+#endif
+
+#include <cstdlib>
+#include <algorithm>
+#include <sys/mman.h>
+
+
+#include <common/mremap.h>
+#include <common/getPageSize.h>
+
+#include <Common/Allocator_fwd.h>
+
+/// Required for older Darwin builds, that lack definition of MAP_ANONYMOUS
+#ifndef MAP_ANONYMOUS
+#define MAP_ANONYMOUS MAP_ANON
+#endif
+
+namespace CH
+{
+
+/**
+ * Many modern allocators (for example, tcmalloc) do not do a mremap for
+ * realloc, even for large enough chunks of memory, although doing so would
+ * improve performance and reduce memory consumption during realloc.
+ * To fix this, we do mremap manually if the chunk of memory is large enough.
+ * The threshold (64 MB) is chosen quite large, since changing the address
+ * space is very slow, especially in the case of a large number of threads. We
+ * expect that the set of operations mmap/something to do/mremap can only be
+ * performed about 1000 times per second.
+ *
+ * P.S. This is also required, because tcmalloc can not allocate a chunk of
+ * memory greater than 16 GB.
+ *
+ * P.P.S. Note that MMAP_THRESHOLD symbol is intentionally made weak. It allows
+ * to override it during linkage when using ClickHouse as a library in
+ * third-party applications which may already use own allocator doing mmaps
+ * in the implementation of alloc/realloc.
+ */
+extern const size_t MMAP_THRESHOLD;
+
+static constexpr size_t MALLOC_MIN_ALIGNMENT = 8;
+
+
+/** Responsible for allocating / freeing memory. Used, for example, in PODArray, Arena.
+ * Also used in hash tables.
+ * The interface is different from std::allocator
+ * - the presence of the method realloc, which for large chunks of memory uses mremap;
+ * - passing the size into the `free` method;
+ * - by the presence of the `alignment` argument;
+ * - the possibility of zeroing memory (used in hash tables);
+ * - random hint address for mmap
+ * - mmap_threshold for using mmap less or more
+ */
+template <bool clear_memory_, bool mmap_populate>
+class Allocator
+{
+public:
+ /// Allocate memory range.
+ void * alloc(size_t size, size_t alignment = 0)
+ {
+ checkSize(size);
+ return allocNoTrack(size, alignment);
+ }
+
+ /// Free memory range.
+ void free(void * buf, size_t size)
+ {
+ try
+ {
+ checkSize(size);
+ freeNoTrack(buf, size);
+ }
+ catch (...)
+ {
+ //DB::tryLogCurrentException("Allocator::free");
+ throw;
+ }
+ }
+
+ /** Enlarge memory range.
+ * Data from old range is moved to the beginning of new range.
+ * Address of memory range could change.
+ */
+ void * realloc(void * buf, size_t old_size, size_t new_size, size_t alignment = 0)
+ {
+ checkSize(new_size);
+
+ if (old_size == new_size)
+ {
+ /// nothing to do.
+ /// BTW, it's not possible to change alignment while doing realloc.
+ }
+ else if (old_size < MMAP_THRESHOLD && new_size < MMAP_THRESHOLD
+ && alignment <= MALLOC_MIN_ALIGNMENT)
+ {
+ void * new_buf = ::realloc(buf, new_size);
+ if (nullptr == new_buf)
+ throw std::runtime_error("Allocator: Cannot realloc");
+
+ buf = new_buf;
+ if constexpr (clear_memory)
+ if (new_size > old_size)
+ memset(reinterpret_cast<char *>(buf) + old_size, 0, new_size - old_size);
+ }
+ else if (old_size >= MMAP_THRESHOLD && new_size >= MMAP_THRESHOLD)
+ {
+ // On Apple and FreeBSD, a self-implemented mremap is used (common/mremap.h).
+ buf = clickhouse_mremap(buf, old_size, new_size, MREMAP_MAYMOVE,
+ PROT_READ | PROT_WRITE, mmap_flags, -1, 0);
+ if (MAP_FAILED == buf)
+ throw std::runtime_error("Allocator: Cannot mremap memory");
+
+ /// No need for zero-fill, because mmap guarantees it.
+ }
+ else if (new_size < MMAP_THRESHOLD)
+ {
+ void * new_buf = allocNoTrack(new_size, alignment);
+ memcpy(new_buf, buf, std::min(old_size, new_size));
+ freeNoTrack(buf, old_size);
+ buf = new_buf;
+ }
+ else
+ {
+ /// Big allocations that require a copy. MemoryTracker is called inside the 'alloc' and 'free' methods.
+
+ void * new_buf = alloc(new_size, alignment);
+ memcpy(new_buf, buf, std::min(old_size, new_size));
+ free(buf, old_size);
+ buf = new_buf;
+ }
+
+ return buf;
+ }
+
+protected:
+ static constexpr size_t getStackThreshold()
+ {
+ return 0;
+ }
+
+ static constexpr bool clear_memory = clear_memory_;
+
+ // Freshly mmapped pages are copy-on-write references to a global zero page.
+ // On the first write, a page fault occurs, and an actual writable page is
+ // allocated. If we are going to use this memory soon, such as when resizing
+ // hash tables, it makes sense to pre-fault the pages by passing
+ // MAP_POPULATE to mmap(). This takes some time, but should be faster
+ // overall than having a hot loop interrupted by page faults.
+ // It is only supported on Linux.
+ static constexpr int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS
+#if defined(OS_LINUX)
+ | (mmap_populate ? MAP_POPULATE : 0)
+#endif
+ ;
+
+private:
+ void * allocNoTrack(size_t size, size_t alignment)
+ {
+ void * buf;
+ size_t mmap_min_alignment = ::getPageSize();
+
+ if (size >= MMAP_THRESHOLD)
+ {
+ if (alignment > mmap_min_alignment)
+ throw std::runtime_error("Too large alignment: more than page size when allocating");
+
+ buf = mmap(getMmapHint(), size, PROT_READ | PROT_WRITE,
+ mmap_flags, -1, 0);
+ if (MAP_FAILED == buf)
+ throw std::runtime_error("Allocator: Cannot mmap");
+
+ /// No need for zero-fill, because mmap guarantees it.
+ }
+ else
+ {
+ if (alignment <= MALLOC_MIN_ALIGNMENT)
+ {
+ if constexpr (clear_memory)
+ buf = ::calloc(size, 1);
+ else
+ buf = ::malloc(size);
+
+ if (nullptr == buf)
+ throw std::runtime_error("Allocator: Cannot malloc");
+ }
+ else
+ {
+ buf = nullptr;
+ int res = posix_memalign(&buf, alignment, size);
+
+ if (0 != res)
+ throw std::runtime_error("Cannot allocate memory (posix_memalign)");
+
+ if constexpr (clear_memory)
+ memset(buf, 0, size);
+ }
+ }
+ return buf;
+ }
+
+ void freeNoTrack(void * buf, size_t size)
+ {
+ if (size >= MMAP_THRESHOLD)
+ {
+ if (0 != munmap(buf, size))
+ throw std::runtime_error("Allocator: Cannot munmap");
+ }
+ else
+ {
+ ::free(buf);
+ }
+ }
+
+ void checkSize(size_t size)
+ {
+ /// More obvious exception in case of possible overflow (instead of just "Cannot mmap").
+ if (size >= 0x8000000000000000ULL)
+ throw std::runtime_error("Too large size passed to allocator. It indicates an error.");
+ }
+
+#ifndef NDEBUG
+ /// In debug builds, request mmap() at random addresses (a kind of ASLR), to
+ /// reproduce more memory stomping bugs. Note that Linux doesn't do it by
+ /// default. This may lead to worse TLB performance.
+ void * getMmapHint()
+ {
+ //return reinterpret_cast<void *>(std::uniform_int_distribution<intptr_t>(0x100000000000UL, 0x700000000000UL)(thread_local_rng));
+ return nullptr;
+ }
+#else
+ void * getMmapHint()
+ {
+ return nullptr;
+ }
+#endif
+};
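+/** Illustrative usage sketch (sizes are arbitrary):
+ *
+ *     Allocator<false> allocator;
+ *     void * buf = allocator.alloc(1024);                        // malloc path (below MMAP_THRESHOLD)
+ *     buf = allocator.realloc(buf, 1024, 128 * (1ULL << 20));    // crosses the threshold: copied into an mmap'ed region
+ *     allocator.free(buf, 128 * (1ULL << 20));                   // the size must match the current allocation
+ */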
+
+/** When using AllocatorWithStackMemory, located on the stack,
+ * GCC 4.9 mistakenly assumes that we can call `free` from a pointer to the stack.
+ * In fact, the combination of conditions inside AllocatorWithStackMemory does not allow this.
+ */
+#if !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wfree-nonheap-object"
+#endif
+
+/** Allocator with optimization to place small memory ranges in automatic memory.
+ */
+template <typename Base, size_t _initial_bytes, size_t Alignment>
+class AllocatorWithStackMemory : private Base
+{
+private:
+ alignas(Alignment) char stack_memory[_initial_bytes];
+
+public:
+ static constexpr size_t initial_bytes = _initial_bytes;
+
+ /// Do not use boost::noncopyable to avoid the warning about direct base
+ /// being inaccessible due to ambiguity, when derived classes are also
+ /// noncopyable (-Winaccessible-base).
+ AllocatorWithStackMemory(const AllocatorWithStackMemory&) = delete;
+ AllocatorWithStackMemory & operator = (const AllocatorWithStackMemory&) = delete;
+ AllocatorWithStackMemory() = default;
+ ~AllocatorWithStackMemory() = default;
+
+ void * alloc(size_t size)
+ {
+ if (size <= initial_bytes)
+ {
+ if constexpr (Base::clear_memory)
+ memset(stack_memory, 0, initial_bytes);
+ return stack_memory;
+ }
+
+ return Base::alloc(size, Alignment);
+ }
+
+ void free(void * buf, size_t size)
+ {
+ if (size > initial_bytes)
+ Base::free(buf, size);
+ }
+
+ void * realloc(void * buf, size_t old_size, size_t new_size)
+ {
+ /// Was in stack_memory, will remain there.
+ if (new_size <= initial_bytes)
+ return buf;
+
+ /// Already was big enough to not fit in stack_memory.
+ if (old_size > initial_bytes)
+ return Base::realloc(buf, old_size, new_size, Alignment);
+
+ /// Was in stack memory, but now will not fit there.
+ void * new_buf = Base::alloc(new_size, Alignment);
+ memcpy(new_buf, buf, old_size);
+ return new_buf;
+ }
+
+protected:
+ static constexpr size_t getStackThreshold()
+ {
+ return initial_bytes;
+ }
+};
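+/** Illustrative sketch: with AllocatorWithStackMemory<Allocator<false>, 64, 8>, requests of
+ * up to 64 bytes are served from the embedded stack buffer (and free() is a no-op for them),
+ * while larger requests fall through to the heap Base allocator:
+ *
+ *     AllocatorWithStackMemory<Allocator<false>, 64, 8> a;
+ *     void * p = a.alloc(48);        // served from the stack buffer
+ *     p = a.realloc(p, 48, 256);     // no longer fits: copied out to a heap allocation
+ *     a.free(p, 256);
+ */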
+
+// A constant that gives the number of initially available bytes in
+// the allocator. Used to check that this number is in sync with the
+// initial size of array or hash table that uses the allocator.
+template<typename TAllocator>
+constexpr size_t allocatorInitialBytes = 0;
+
+template<typename Base, size_t initial_bytes, size_t Alignment>
+constexpr size_t allocatorInitialBytes<AllocatorWithStackMemory<
+ Base, initial_bytes, Alignment>> = initial_bytes;
+
+/// Prevent implicit template instantiation of Allocator
+
+extern template class Allocator<false, false>;
+extern template class Allocator<true, false>;
+extern template class Allocator<false, true>;
+extern template class Allocator<true, true>;
+
+}
+
+#if !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
diff --git a/ydb/library/arrow_clickhouse/Common/Allocator_fwd.h b/ydb/library/arrow_clickhouse/Common/Allocator_fwd.h
new file mode 100644
index 00000000000..6e1b6b62257
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/Common/Allocator_fwd.h
@@ -0,0 +1,19 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#pragma once
+/**
+ * This file provides forward declarations for Allocator.
+ */
+
+namespace CH
+{
+
+template <bool clear_memory_, bool mmap_populate = false>
+class Allocator;
+
+template <typename Base, size_t N = 64, size_t Alignment = 1>
+class AllocatorWithStackMemory;
+
+}
diff --git a/ydb/library/arrow_clickhouse/Common/Arena.h b/ydb/library/arrow_clickhouse/Common/Arena.h
new file mode 100644
index 00000000000..7f86e70e5ae
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/Common/Arena.h
@@ -0,0 +1,311 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#pragma once
+
+#include <string.h>
+#include <memory>
+#include <vector>
+
+#include <Common/memcpySmall.h>
+#include <Common/Allocator.h>
+
+namespace CH
+{
+
+/** Memory pool to append something. For example, short strings.
+ * Usage scenario:
+ * - put a lot of strings into the pool and keep their addresses;
+ * - addresses remain valid during the lifetime of the pool;
+ * - at destruction of the pool, all memory is freed;
+ * - memory is allocated and freed in large MemoryChunks;
+ * - freeing parts of the data is not possible (see ArenaWithFreeLists if you need that);
+ */
+class Arena //: private boost::noncopyable
+{
+private:
+ /// Padding allows to use 'memcpySmallAllowReadWriteOverflow15' instead of 'memcpy'.
+ static constexpr size_t pad_right = 15;
+
+ /// Contiguous MemoryChunk of memory and pointer to free space inside it. Member of single-linked list.
+ struct alignas(16) MemoryChunk : private Allocator<false> /// empty base optimization
+ {
+ char * begin;
+ char * pos;
+ char * end; /// does not include padding.
+
+ MemoryChunk * prev;
+
+ MemoryChunk(size_t size_, MemoryChunk * prev_)
+ {
+ begin = reinterpret_cast<char *>(Allocator<false>::alloc(size_));
+ pos = begin;
+ end = begin + size_ - pad_right;
+ prev = prev_;
+
+ //ASAN_POISON_MEMORY_REGION(begin, size_);
+ }
+
+ ~MemoryChunk()
+ {
+ /// We must unpoison the memory before returning to the allocator,
+ /// because the allocator might not have asan integration, and the
+ /// memory would stay poisoned forever. If the allocator supports
+ /// asan, it will correctly poison the memory by itself.
+ //ASAN_UNPOISON_MEMORY_REGION(begin, size());
+
+ Allocator<false>::free(begin, size());
+
+ if (prev)
+ delete prev;
+ }
+
+ size_t size() const { return end + pad_right - begin; }
+ size_t remaining() const { return end - pos; }
+ };
+
+ size_t growth_factor;
+ size_t linear_growth_threshold;
+
+ /// Last contiguous MemoryChunk of memory.
+ MemoryChunk * head;
+ size_t size_in_bytes;
+ size_t page_size;
+
+ static size_t roundUpToPageSize(size_t s, size_t page_size)
+ {
+ return (s + page_size - 1) / page_size * page_size;
+ }
+
+ /// If the MemoryChunk's size is less than 'linear_growth_threshold', use exponential growth; otherwise use linear growth
+ /// (to avoid allocating too much excessive memory).
+ size_t nextSize(size_t min_next_size) const
+ {
+ size_t size_after_grow = 0;
+
+ if (head->size() < linear_growth_threshold)
+ {
+ size_after_grow = std::max(min_next_size, head->size() * growth_factor);
+ }
+ else
+ {
+ // allocContinue() combined with linear growth results in quadratic
+ // behavior: we append the data by small amounts, and when it
+ // doesn't fit, we create a new MemoryChunk and copy all the previous data
+ // into it. The number of times we do this is directly proportional
+ // to the total size of data that is going to be serialized. To make
+ // the copying happen less often, round the next size up to the
+ // linear_growth_threshold.
+ size_after_grow = ((min_next_size + linear_growth_threshold - 1)
+ / linear_growth_threshold) * linear_growth_threshold;
+ }
+
+ assert(size_after_grow >= min_next_size);
+ return roundUpToPageSize(size_after_grow, page_size);
+ }
+
+ /// Add next contiguous MemoryChunk of memory with size not less than specified.
+ void NO_INLINE addMemoryChunk(size_t min_size)
+ {
+ head = new MemoryChunk(nextSize(min_size + pad_right), head);
+ size_in_bytes += head->size();
+ }
+
+ friend class ArenaAllocator;
+ template <size_t> friend class AlignedArenaAllocator;
+
+public:
+ explicit Arena(size_t initial_size_ = 4096, size_t growth_factor_ = 2, size_t linear_growth_threshold_ = 128 * 1024 * 1024)
+ : growth_factor(growth_factor_), linear_growth_threshold(linear_growth_threshold_),
+ head(new MemoryChunk(initial_size_, nullptr)), size_in_bytes(head->size()),
+ page_size(static_cast<size_t>(::getPageSize()))
+ {
+ }
+
+ ~Arena()
+ {
+ delete head;
+ }
+
+ /// Get piece of memory, without alignment.
+ char * alloc(size_t size)
+ {
+ if (unlikely(head->pos + size > head->end))
+ addMemoryChunk(size);
+
+ char * res = head->pos;
+ head->pos += size;
+ //ASAN_UNPOISON_MEMORY_REGION(res, size + pad_right);
+ return res;
+ }
+
+ /// Get piece of memory with alignment
+ char * alignedAlloc(size_t size, size_t alignment)
+ {
+ do
+ {
+ void * head_pos = head->pos;
+ size_t space = head->end - head->pos;
+
+ auto * res = static_cast<char *>(std::align(alignment, size, head_pos, space));
+ if (res)
+ {
+ head->pos = static_cast<char *>(head_pos);
+ head->pos += size;
+ //ASAN_UNPOISON_MEMORY_REGION(res, size + pad_right);
+ return res;
+ }
+
+ addMemoryChunk(size + alignment);
+ } while (true);
+ }
+
+ template <typename T>
+ T * alloc()
+ {
+ return reinterpret_cast<T *>(alignedAlloc(sizeof(T), alignof(T)));
+ }
+
+ /** Rollback just performed allocation.
+ * Must pass a size no greater than what was just allocated.
+ * Return the resulting head pointer, so that the caller can assert that
+ * the allocation it intended to roll back was indeed the last one.
+ */
+ void * rollback(size_t size)
+ {
+ head->pos -= size;
+ //ASAN_POISON_MEMORY_REGION(head->pos, size + pad_right);
+ return head->pos;
+ }
+
+ /** Begin or expand a contiguous range of memory.
+ * 'range_start' is the start of range. If nullptr, a new range is
+ * allocated.
+ * If there is no space in the current MemoryChunk to expand the range,
+ * the entire range is copied to a new, bigger memory MemoryChunk, and the value
+ * of 'range_start' is updated.
+ * If the optional 'start_alignment' is specified, the start of range is
+ * kept aligned to this value.
+ *
+ * NOTE This method is usable only for the last allocation made on this
+ * Arena. For earlier allocations, see 'realloc' method.
+ */
+ char * allocContinue(size_t additional_bytes, char const *& range_start,
+ size_t start_alignment = 0)
+ {
+ /*
+ * Allocating zero bytes doesn't make much sense. Also, a zero-sized
+ * range might break the invariant that the range begins at least before
+ * the current MemoryChunk end.
+ */
+ assert(additional_bytes > 0);
+
+ if (!range_start)
+ {
+ // Start a new memory range.
+ char * result = start_alignment
+ ? alignedAlloc(additional_bytes, start_alignment)
+ : alloc(additional_bytes);
+
+ range_start = result;
+ return result;
+ }
+
+ // Extend an existing memory range with 'additional_bytes'.
+
+ // This method only works for extending the last allocation. For lack of
+ // original size, check a weaker condition: that 'begin' is at least in
+ // the current MemoryChunk.
+ assert(range_start >= head->begin);
+ assert(range_start < head->end);
+
+ if (head->pos + additional_bytes <= head->end)
+ {
+ // The new size fits into the last MemoryChunk, so just alloc the
+ // additional size. We can alloc without alignment here, because it
+ // only applies to the start of the range, and we don't change it.
+ return alloc(additional_bytes);
+ }
+
+ // New range doesn't fit into this MemoryChunk, will copy to a new one.
+ //
+ // Note: among other things, this method is used to provide a hack-ish
+ // implementation of realloc over Arenas in ArenaAllocators. It wastes a
+ // lot of memory -- quadratically so when we reach the linear allocation
+ // threshold. This deficiency is intentionally left as is, and should be
+ // solved not by complicating this method, but by rethinking the
+ // approach to memory management for aggregate function states, so that
+ // we can provide a proper realloc().
+ const size_t existing_bytes = head->pos - range_start;
+ const size_t new_bytes = existing_bytes + additional_bytes;
+ const char * old_range = range_start;
+
+ char * new_range = start_alignment
+ ? alignedAlloc(new_bytes, start_alignment)
+ : alloc(new_bytes);
+
+ memcpy(new_range, old_range, existing_bytes);
+
+ range_start = new_range;
+ return new_range + existing_bytes;
+ }
+
+ /// NOTE Old memory region is wasted.
+ char * realloc(const char * old_data, size_t old_size, size_t new_size)
+ {
+ char * res = alloc(new_size);
+ if (old_data)
+ {
+ memcpy(res, old_data, old_size);
+ //ASAN_POISON_MEMORY_REGION(old_data, old_size);
+ }
+ return res;
+ }
+
+ char * alignedRealloc(const char * old_data, size_t old_size, size_t new_size, size_t alignment)
+ {
+ char * res = alignedAlloc(new_size, alignment);
+ if (old_data)
+ {
+ memcpy(res, old_data, old_size);
+ //ASAN_POISON_MEMORY_REGION(old_data, old_size);
+ }
+ return res;
+ }
+
+ /// Insert string without alignment.
+ const char * insert(const char * data, size_t size)
+ {
+ char * res = alloc(size);
+ memcpy(res, data, size);
+ return res;
+ }
+
+ const char * alignedInsert(const char * data, size_t size, size_t alignment)
+ {
+ char * res = alignedAlloc(size, alignment);
+ memcpy(res, data, size);
+ return res;
+ }
+
+ /// Size of MemoryChunks in bytes.
+ size_t size() const
+ {
+ return size_in_bytes;
+ }
+
+ /// Bad method, don't use it -- the MemoryChunks are not your business, the entire
+ /// purpose of the arena code is to manage them for you, so if you find
+ /// yourself having to use this method, probably you're doing something wrong.
+ size_t remainingSpaceInCurrentMemoryChunk() const
+ {
+ return head->remaining();
+ }
+};
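+/** Illustrative usage sketch (sizes are arbitrary):
+ *
+ *     Arena arena;
+ *     const char * copy = arena.insert("hello", 5);      // stays valid for the arena's lifetime
+ *     char * state = arena.alignedAlloc(64, 16);          // e.g. an aggregate function state
+ *
+ *     // Building one contiguous range piece by piece (see allocContinue above):
+ *     char const * begin = nullptr;
+ *     arena.allocContinue(8, begin);
+ *     arena.allocContinue(16, begin);                     // 'begin' points at the start of the 24-byte range
+ */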
+
+using ArenaPtr = std::shared_ptr<Arena>;
+using Arenas = std::vector<ArenaPtr>;
+
+
+}
diff --git a/ydb/library/arrow_clickhouse/Common/CMakeLists.txt b/ydb/library/arrow_clickhouse/Common/CMakeLists.txt
new file mode 100644
index 00000000000..a455b526c0b
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/Common/CMakeLists.txt
@@ -0,0 +1,23 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(library-arrow_clickhouse-Common)
+target_include_directories(library-arrow_clickhouse-Common PRIVATE
+ ${CMAKE_SOURCE_DIR}/ydb/library/yql/udfs/common/clickhouse/client/base
+ ${CMAKE_SOURCE_DIR}/ydb/library/arrow_clickhouse
+)
+target_link_libraries(library-arrow_clickhouse-Common PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+ libs-apache-arrow
+)
+target_sources(library-arrow_clickhouse-Common PRIVATE
+ ${CMAKE_SOURCE_DIR}/ydb/library/arrow_clickhouse/Common/Allocator.cpp
+ ${CMAKE_SOURCE_DIR}/ydb/library/arrow_clickhouse/Common/PODArray.cpp
+)
diff --git a/ydb/library/arrow_clickhouse/Common/HashTable/FixedHashMap.h b/ydb/library/arrow_clickhouse/Common/HashTable/FixedHashMap.h
new file mode 100644
index 00000000000..7358fc0ea82
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/Common/HashTable/FixedHashMap.h
@@ -0,0 +1,186 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#pragma once
+
+#include <Common/HashTable/FixedHashTable.h>
+#include <Common/HashTable/HashMap.h>
+
+namespace CH
+{
+
+template <typename Key, typename TMapped, typename TState = HashTableNoState>
+struct FixedHashMapCell
+{
+ using Mapped = TMapped;
+ using State = TState;
+
+ using value_type = PairNoInit<Key, Mapped>;
+ using mapped_type = TMapped;
+
+ bool full;
+ Mapped mapped;
+
+ FixedHashMapCell() {} //-V730
+ FixedHashMapCell(const Key &, const State &) : full(true) {}
+ FixedHashMapCell(const value_type & value_, const State &) : full(true), mapped(value_.second) {}
+
+ const VoidKey getKey() const { return {}; }
+ Mapped & getMapped() { return mapped; }
+ const Mapped & getMapped() const { return mapped; }
+
+ bool isZero(const State &) const { return !full; }
+ void setZero() { full = false; }
+
+ /// Similar to FixedHashSetCell except that we need to contain a pointer to the Mapped field.
+ /// Note that we have to assemble a continuous layout for the value_type on each call of getValue().
+ struct CellExt
+ {
+ CellExt() {} //-V730
+ CellExt(Key && key_, const FixedHashMapCell * ptr_) : key(key_), ptr(const_cast<FixedHashMapCell *>(ptr_)) {}
+ void update(Key && key_, const FixedHashMapCell * ptr_)
+ {
+ key = key_;
+ ptr = const_cast<FixedHashMapCell *>(ptr_);
+ }
+ Key key;
+ FixedHashMapCell * ptr;
+
+ const Key & getKey() const { return key; }
+ Mapped & getMapped() { return ptr->mapped; }
+ const Mapped & getMapped() const { return ptr->mapped; }
+ const value_type getValue() const { return {key, ptr->mapped}; }
+ };
+};
+
+
+/// In case when we can encode empty cells with zero mapped values.
+template <typename Key, typename TMapped, typename TState = HashTableNoState>
+struct FixedHashMapImplicitZeroCell
+{
+ using Mapped = TMapped;
+ using State = TState;
+
+ using value_type = PairNoInit<Key, Mapped>;
+ using mapped_type = TMapped;
+
+ Mapped mapped;
+
+ FixedHashMapImplicitZeroCell() {}
+ FixedHashMapImplicitZeroCell(const Key &, const State &) {}
+ FixedHashMapImplicitZeroCell(const value_type & value_, const State &) : mapped(value_.second) {}
+
+ const VoidKey getKey() const { return {}; }
+ Mapped & getMapped() { return mapped; }
+ const Mapped & getMapped() const { return mapped; }
+
+ bool isZero(const State &) const { return !mapped; }
+ void setZero() { mapped = {}; }
+
+ /// Similar to FixedHashSetCell except that we need to contain a pointer to the Mapped field.
+ /// Note that we have to assemble a continuous layout for the value_type on each call of getValue().
+ struct CellExt
+ {
+ CellExt() {} //-V730
+ CellExt(Key && key_, const FixedHashMapImplicitZeroCell * ptr_) : key(key_), ptr(const_cast<FixedHashMapImplicitZeroCell *>(ptr_)) {}
+ void update(Key && key_, const FixedHashMapImplicitZeroCell * ptr_)
+ {
+ key = key_;
+ ptr = const_cast<FixedHashMapImplicitZeroCell *>(ptr_);
+ }
+ Key key;
+ FixedHashMapImplicitZeroCell * ptr;
+
+ const Key & getKey() const { return key; }
+ Mapped & getMapped() { return ptr->mapped; }
+ const Mapped & getMapped() const { return ptr->mapped; }
+ const value_type getValue() const { return {key, ptr->mapped}; }
+ };
+};
+
+
+template <
+ typename Key,
+ typename Mapped,
+ typename Cell = FixedHashMapCell<Key, Mapped>,
+ typename Size = FixedHashTableStoredSize<Cell>,
+ typename Allocator = HashTableAllocator>
+class FixedHashMap : public FixedHashTable<Key, Cell, Size, Allocator>
+{
+public:
+ using Base = FixedHashTable<Key, Cell, Size, Allocator>;
+ using Self = FixedHashMap;
+ using LookupResult = typename Base::LookupResult;
+
+ using Base::Base;
+
+ template <typename Func>
+ void ALWAYS_INLINE mergeToViaEmplace(Self & that, Func && func)
+ {
+ for (auto it = this->begin(), end = this->end(); it != end; ++it)
+ {
+ typename Self::LookupResult res_it;
+ bool inserted;
+ that.emplace(it->getKey(), res_it, inserted, it.getHash());
+ func(res_it->getMapped(), it->getMapped(), inserted);
+ }
+ }
+
+ template <typename Func>
+ void ALWAYS_INLINE mergeToViaFind(Self & that, Func && func)
+ {
+ for (auto it = this->begin(), end = this->end(); it != end; ++it)
+ {
+ auto res_it = that.find(it->getKey(), it.getHash());
+ if (!res_it)
+ func(it->getMapped(), it->getMapped(), false);
+ else
+ func(res_it->getMapped(), it->getMapped(), true);
+ }
+ }
+
+ template <typename Func>
+ void forEachValue(Func && func)
+ {
+ for (auto & v : *this)
+ func(v.getKey(), v.getMapped());
+ }
+
+ template <typename Func>
+ void forEachMapped(Func && func)
+ {
+ for (auto & v : *this)
+ func(v.getMapped());
+ }
+
+ Mapped & ALWAYS_INLINE operator[](const Key & x)
+ {
+ LookupResult it;
+ bool inserted;
+ this->emplace(x, it, inserted);
+ if (inserted)
+ new (&it->getMapped()) Mapped();
+
+ return it->getMapped();
+ }
+};
+
+
+template <typename Key, typename Mapped, typename Allocator = HashTableAllocator>
+using FixedImplicitZeroHashMap = FixedHashMap<
+ Key,
+ Mapped,
+ FixedHashMapImplicitZeroCell<Key, Mapped>,
+ FixedHashTableStoredSize<FixedHashMapImplicitZeroCell<Key, Mapped>>,
+ Allocator>;
+
+template <typename Key, typename Mapped, typename Allocator = HashTableAllocator>
+using FixedImplicitZeroHashMapWithCalculatedSize = FixedHashMap<
+ Key,
+ Mapped,
+ FixedHashMapImplicitZeroCell<Key, Mapped>,
+ FixedHashTableCalculatedSize<FixedHashMapImplicitZeroCell<Key, Mapped>>,
+ Allocator>;
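+
+/// Illustrative sketch (not part of the original ClickHouse sources): the implicit-zero maps above
+/// rely on the mapped value itself to mark empty cells, so they only fit mapped types for which the
+/// zero value means "absent" (e.g. a counter). A hypothetical counting loop:
+///
+///     FixedImplicitZeroHashMap<UInt8, UInt64> counts;
+///     ++counts[0x2A];     /// operator[] value-initializes the mapped value on first use, then we increment it
+///
+///     UInt64 total = 0;
+///     counts.forEachMapped([&](UInt64 & n) { total += n; });     /// cells whose mapped value is 0 are skipped as empty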
+
+}
diff --git a/ydb/library/arrow_clickhouse/Common/HashTable/FixedHashTable.h b/ydb/library/arrow_clickhouse/Common/HashTable/FixedHashTable.h
new file mode 100644
index 00000000000..2e8d781fc7a
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/Common/HashTable/FixedHashTable.h
@@ -0,0 +1,497 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#pragma once
+
+#include <Common/HashTable/HashTable.h>
+
+namespace CH
+{
+
+template <typename Key, typename TState = HashTableNoState>
+struct FixedHashTableCell
+{
+ using State = TState;
+
+ using value_type = Key;
+ using mapped_type = VoidMapped;
+ bool full;
+
+ FixedHashTableCell() {} //-V730
+ FixedHashTableCell(const Key &, const State &) : full(true) {}
+
+ const VoidKey getKey() const { return {}; }
+ VoidMapped getMapped() const { return {}; }
+
+ bool isZero(const State &) const { return !full; }
+ void setZero() { full = false; }
+ static constexpr bool need_zero_value_storage = false;
+
+    /// This Cell is only stored inside an iterator. It's used to accommodate the fact
+    /// that the iterator-based API always provides a reference to contiguous memory
+    /// containing the Key. As a result, we have to instantiate a real Key field.
+    /// All methods that return a mutable reference to the Key field are named with
+    /// a -Mutable suffix, indicating this is uncommon usage. As this is only for lookup
+    /// tables, it's totally fine to discard the Key mutations.
+ struct CellExt
+ {
+ Key key;
+
+ const VoidKey getKey() const { return {}; }
+ VoidMapped getMapped() const { return {}; }
+ const value_type & getValue() const { return key; }
+ void update(Key && key_, FixedHashTableCell *) { key = key_; }
+ };
+};
+
+
+/// How to obtain the size of the table.
+
+template <typename Cell>
+struct FixedHashTableStoredSize
+{
+ size_t m_size = 0;
+
+ size_t getSize(const Cell *, const typename Cell::State &, size_t) const { return m_size; }
+ bool isEmpty(const Cell *, const typename Cell::State &, size_t) const { return m_size == 0; }
+
+ void increaseSize() { ++m_size; }
+ void clearSize() { m_size = 0; }
+ void setSize(size_t to) { m_size = to; }
+};
+
+template <typename Cell>
+struct FixedHashTableCalculatedSize
+{
+ size_t getSize(const Cell * buf, const typename Cell::State & state, size_t num_cells) const
+ {
+ size_t res = 0;
+ for (const Cell * end = buf + num_cells; buf != end; ++buf)
+ if (!buf->isZero(state))
+ ++res;
+ return res;
+ }
+
+ bool isEmpty(const Cell * buf, const typename Cell::State & state, size_t num_cells) const
+ {
+ for (const Cell * end = buf + num_cells; buf != end; ++buf)
+ if (!buf->isZero(state))
+ return false;
+ return true;
+ }
+
+ void increaseSize() {}
+ void clearSize() {}
+ void setSize(size_t) {}
+};
+
+
+/** Used as a lookup table for small keys such as UInt8, UInt16. It differs
+ * from a HashTable in that keys are not stored in the Cell buf, but inferred
+ * inside each iterator. There are a bunch of optimizations that make it faster
+ * than using HashTable: a) It doesn't have a collision chain; b) There is no key
+ * comparison; c) The number of cycles for checking whether a cell is empty is halved; d)
+ * Memory layout is tighter, especially in the Clearable variants.
+ *
+ * NOTE: For Set variants this should always be better. For Map variants,
+ * however, as we need to assemble the real cell inside each iterator, there
+ * might be some cases where we fall short.
+ *
+ * TODO: Deprecate the cell API so that end users don't rely on the structure
+ * of the cell. Instead, the iterator should be used for operations such as cell
+ * transfer, key updates (e.g. StringRef) and serde. This will allow
+ * TwoLevelHashSet(Map) to contain different types of sets (maps).
+ */
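+
+/** Illustrative sketch (not part of the original ClickHouse sources): using the table as a set of
+ * single-byte keys. FixedHashTableCell and FixedHashTableStoredSize are defined above in this header;
+ * HashTableAllocator comes from HashTableAllocator.h (pulled in via HashTable.h).
+ *
+ *     using ByteSet = FixedHashTable<UInt8, FixedHashTableCell<UInt8>,
+ *                                    FixedHashTableStoredSize<FixedHashTableCell<UInt8>>, HashTableAllocator>;
+ *     ByteSet set;
+ *     ByteSet::LookupResult it;
+ *     bool inserted;
+ *     set.emplace(42, it, inserted);   /// the key itself is the bucket index, so no hashing is involved
+ *     bool found = set.has(42);        /// true
+ */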
+template <typename Key, typename Cell, typename Size, typename Allocator>
+class FixedHashTable : /*private boost::noncopyable,*/ protected Allocator, protected Cell::State, protected Size
+{
+ static constexpr size_t NUM_CELLS = 1ULL << (sizeof(Key) * 8);
+
+protected:
+ friend class const_iterator;
+ friend class iterator;
+ friend class Reader;
+
+ using Self = FixedHashTable;
+
+ Cell * buf; /// A piece of memory for all elements.
+
+ void alloc() { buf = reinterpret_cast<Cell *>(Allocator::alloc(NUM_CELLS * sizeof(Cell))); }
+
+ void free()
+ {
+ if (buf)
+ {
+ Allocator::free(buf, getBufferSizeInBytes());
+ buf = nullptr;
+ }
+ }
+
+ void destroyElements()
+ {
+ if (!std::is_trivially_destructible_v<Cell>)
+ for (iterator it = begin(), it_end = end(); it != it_end; ++it)
+ it.ptr->~Cell();
+ }
+
+
+ template <typename Derived, bool is_const>
+ class iterator_base
+ {
+ using Container = std::conditional_t<is_const, const Self, Self>;
+ using cell_type = std::conditional_t<is_const, const Cell, Cell>;
+
+ Container * container;
+ cell_type * ptr;
+
+ friend class FixedHashTable;
+
+ public:
+ iterator_base() {}
+ iterator_base(Container * container_, cell_type * ptr_) : container(container_), ptr(ptr_)
+ {
+ cell.update(ptr - container->buf, ptr);
+ }
+
+ bool operator==(const iterator_base & rhs) const { return ptr == rhs.ptr; }
+ bool operator!=(const iterator_base & rhs) const { return ptr != rhs.ptr; }
+
+ Derived & operator++()
+ {
+ ++ptr;
+
+ /// Skip empty cells in the main buffer.
+ auto buf_end = container->buf + container->NUM_CELLS;
+ while (ptr < buf_end && ptr->isZero(*container))
+ ++ptr;
+
+ return static_cast<Derived &>(*this);
+ }
+
+ auto & operator*()
+ {
+ if (cell.key != ptr - container->buf)
+ cell.update(ptr - container->buf, ptr);
+ return cell;
+ }
+ auto * operator-> ()
+ {
+ if (cell.key != ptr - container->buf)
+ cell.update(ptr - container->buf, ptr);
+ return &cell;
+ }
+
+ auto getPtr() const { return ptr; }
+ size_t getHash() const { return ptr - container->buf; }
+ size_t getCollisionChainLength() const { return 0; }
+ typename cell_type::CellExt cell;
+ };
+
+
+public:
+ using key_type = Key;
+ using mapped_type = typename Cell::mapped_type;
+ using value_type = typename Cell::value_type;
+ using cell_type = Cell;
+
+ using LookupResult = Cell *;
+ using ConstLookupResult = const Cell *;
+
+
+ size_t hash(const Key & x) const { return x; }
+
+ FixedHashTable() { alloc(); }
+
+ FixedHashTable(FixedHashTable && rhs) : buf(nullptr) { *this = std::move(rhs); }
+
+ ~FixedHashTable()
+ {
+ destroyElements();
+ free();
+ }
+
+ FixedHashTable & operator=(FixedHashTable && rhs)
+ {
+ destroyElements();
+ free();
+
+ std::swap(buf, rhs.buf);
+ this->setSize(rhs.size());
+
+ Allocator::operator=(std::move(rhs));
+ Cell::State::operator=(std::move(rhs));
+
+ return *this;
+ }
+#if 0
+ class Reader final : private Cell::State
+ {
+ public:
+ Reader(DB::ReadBuffer & in_) : in(in_) {}
+
+ Reader(const Reader &) = delete;
+ Reader & operator=(const Reader &) = delete;
+
+ bool next()
+ {
+ if (!is_initialized)
+ {
+ Cell::State::read(in);
+ DB::readVarUInt(size, in);
+ is_initialized = true;
+ }
+
+ if (read_count == size)
+ {
+ is_eof = true;
+ return false;
+ }
+
+ cell.read(in);
+ ++read_count;
+
+ return true;
+ }
+
+ inline const value_type & get() const
+ {
+ if (!is_initialized || is_eof)
+ throw CH::Exception("No available data");
+
+ return cell.getValue();
+ }
+
+ private:
+ DB::ReadBuffer & in;
+ Cell cell;
+ size_t read_count = 0;
+ size_t size = 0;
+ bool is_eof = false;
+ bool is_initialized = false;
+ };
+#endif
+
+ class iterator : public iterator_base<iterator, false>
+ {
+ public:
+ using iterator_base<iterator, false>::iterator_base;
+ };
+
+ class const_iterator : public iterator_base<const_iterator, true>
+ {
+ public:
+ using iterator_base<const_iterator, true>::iterator_base;
+ };
+
+
+ const_iterator begin() const
+ {
+ if (!buf)
+ return end();
+
+ const Cell * ptr = buf;
+ auto buf_end = buf + NUM_CELLS;
+ while (ptr < buf_end && ptr->isZero(*this))
+ ++ptr;
+
+ return const_iterator(this, ptr);
+ }
+
+ const_iterator cbegin() const { return begin(); }
+
+ iterator begin()
+ {
+ if (!buf)
+ return end();
+
+ Cell * ptr = buf;
+ auto buf_end = buf + NUM_CELLS;
+ while (ptr < buf_end && ptr->isZero(*this))
+ ++ptr;
+
+ return iterator(this, ptr);
+ }
+
+ const_iterator end() const
+ {
+ /// Avoid UBSan warning about adding zero to nullptr. It is valid in C++20 (and earlier) but not valid in C.
+ return const_iterator(this, buf ? buf + NUM_CELLS : buf);
+ }
+
+ const_iterator cend() const
+ {
+ return end();
+ }
+
+ iterator end()
+ {
+ return iterator(this, buf ? buf + NUM_CELLS : buf);
+ }
+
+
+public:
+ /// The last parameter is unused but exists for compatibility with HashTable interface.
+ void ALWAYS_INLINE emplace(const Key & x, LookupResult & it, bool & inserted, size_t /* hash */ = 0)
+ {
+ it = &buf[x];
+
+ if (!buf[x].isZero(*this))
+ {
+ inserted = false;
+ return;
+ }
+
+ new (&buf[x]) Cell(x, *this);
+ inserted = true;
+ this->increaseSize();
+ }
+
+ std::pair<LookupResult, bool> ALWAYS_INLINE insert(const value_type & x)
+ {
+ std::pair<LookupResult, bool> res;
+ emplace(Cell::getKey(x), res.first, res.second);
+ if (res.second)
+ insertSetMapped(res.first->getMapped(), x);
+
+ return res;
+ }
+
+ LookupResult ALWAYS_INLINE find(const Key & x) { return !buf[x].isZero(*this) ? &buf[x] : nullptr; }
+
+ ConstLookupResult ALWAYS_INLINE find(const Key & x) const { return const_cast<std::decay_t<decltype(*this)> *>(this)->find(x); }
+
+ LookupResult ALWAYS_INLINE find(const Key &, size_t hash_value) { return !buf[hash_value].isZero(*this) ? &buf[hash_value] : nullptr; }
+
+ ConstLookupResult ALWAYS_INLINE find(const Key & key, size_t hash_value) const
+ {
+ return const_cast<std::decay_t<decltype(*this)> *>(this)->find(key, hash_value);
+ }
+
+ bool ALWAYS_INLINE has(const Key & x) const { return !buf[x].isZero(*this); }
+ bool ALWAYS_INLINE has(const Key &, size_t hash_value) const { return !buf[hash_value].isZero(*this); }
+#if 0
+ void write(DB::WriteBuffer & wb) const
+ {
+ Cell::State::write(wb);
+ DB::writeVarUInt(size(), wb);
+
+ if (!buf)
+ return;
+
+ for (auto ptr = buf, buf_end = buf + NUM_CELLS; ptr < buf_end; ++ptr)
+ {
+ if (!ptr->isZero(*this))
+ {
+                DB::writeVarUInt(ptr - buf, wb);
+ ptr->write(wb);
+ }
+ }
+ }
+
+ void writeText(DB::WriteBuffer & wb) const
+ {
+ Cell::State::writeText(wb);
+ DB::writeText(size(), wb);
+
+ if (!buf)
+ return;
+
+ for (auto ptr = buf, buf_end = buf + NUM_CELLS; ptr < buf_end; ++ptr)
+ {
+ if (!ptr->isZero(*this))
+ {
+ DB::writeChar(',', wb);
+ DB::writeText(ptr - buf, wb);
+ DB::writeChar(',', wb);
+ ptr->writeText(wb);
+ }
+ }
+ }
+
+ void read(DB::ReadBuffer & rb)
+ {
+ Cell::State::read(rb);
+ destroyElements();
+ size_t m_size;
+ DB::readVarUInt(m_size, rb);
+ this->setSize(m_size);
+ free();
+ alloc();
+
+ for (size_t i = 0; i < m_size; ++i)
+ {
+ size_t place_value = 0;
+ DB::readVarUInt(place_value, rb);
+ Cell x;
+ x.read(rb);
+ new (&buf[place_value]) Cell(x, *this);
+ }
+ }
+
+ void readText(DB::ReadBuffer & rb)
+ {
+ Cell::State::readText(rb);
+ destroyElements();
+ size_t m_size;
+ DB::readText(m_size, rb);
+ this->setSize(m_size);
+ free();
+ alloc();
+
+ for (size_t i = 0; i < m_size; ++i)
+ {
+ size_t place_value = 0;
+ DB::assertChar(',', rb);
+ DB::readText(place_value, rb);
+ Cell x;
+ DB::assertChar(',', rb);
+ x.readText(rb);
+ new (&buf[place_value]) Cell(x, *this);
+ }
+ }
+#endif
+ size_t size() const { return this->getSize(buf, *this, NUM_CELLS); }
+ bool empty() const { return this->isEmpty(buf, *this, NUM_CELLS); }
+
+ void clear()
+ {
+ destroyElements();
+ this->clearSize();
+
+ memset(static_cast<void *>(buf), 0, NUM_CELLS * sizeof(*buf));
+ }
+
+ /// After executing this function, the table can only be destroyed,
+ /// and also you can use the methods `size`, `empty`, `begin`, `end`.
+ void clearAndShrink()
+ {
+ destroyElements();
+ this->clearSize();
+ free();
+ }
+
+ size_t getBufferSizeInBytes() const { return NUM_CELLS * sizeof(Cell); }
+
+ size_t getBufferSizeInCells() const { return NUM_CELLS; }
+
+ /// Return offset for result in internal buffer.
+    /// Result can have a value up to `getBufferSizeInCells() + 1`
+    /// because the offset for the zero value is considered to be 0,
+    /// and for other values it will be `offset in buffer + 1`.
+ size_t offsetInternal(ConstLookupResult ptr) const
+ {
+ if (ptr->isZero(*this))
+ return 0;
+ return ptr - buf + 1;
+ }
+
+ const Cell * data() const { return buf; }
+ Cell * data() { return buf; }
+
+#ifdef DBMS_HASH_MAP_COUNT_COLLISIONS
+ size_t getCollisions() const { return 0; }
+#endif
+};
+
+}
diff --git a/ydb/library/arrow_clickhouse/Common/HashTable/Hash.h b/ydb/library/arrow_clickhouse/Common/HashTable/Hash.h
new file mode 100644
index 00000000000..89aaeb48a48
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/Common/HashTable/Hash.h
@@ -0,0 +1,417 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#pragma once
+
+#include <util/digest/city.h>
+#include <common/types.h>
+#include <common/unaligned.h>
+#include <common/StringRef.h>
+
+#include <type_traits>
+
+namespace CH
+{
+
+/** Hash functions that are better than the trivial function std::hash.
+ *
+ * Example: when we do aggregation by the visitor ID, the performance increase is more than 5 times.
+ * This is because of the following reasons:
+ * - in Yandex, the visitor identifier is an integer whose lower bits contain a timestamp with seconds resolution;
+ * - in typical standard library implementations, the hash function for integers is trivial and just uses the lower bits;
+ * - traffic is non-uniformly distributed across a day;
+ * - we are using open-addressing linear-probing hash tables, which are the most sensitive to hash function quality,
+ *   and a trivial hash function gives disastrous results.
+ */
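+
+/** Illustrative sketch (not part of the original sources): these hashes are plugged into the hash tables
+ * through a functor template parameter. HashMap is declared in HashTable/HashMap.h and defaults to
+ * DefaultHash<Key>; HashCRC32 can be substituted explicitly where hardware CRC32 is preferable.
+ *
+ *     HashMap<UInt64, UInt64> by_default_hash;                 /// DefaultHash<UInt64> -> intHash64 (Murmur finalizer)
+ *     HashMap<UInt64, UInt64, HashCRC32<UInt64>> by_crc32;     /// uses hardware CRC32 where available, see intHashCRC32 below
+ */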
+
+/** Taken from MurmurHash. This is Murmur finalizer.
+ * Faster than intHash32 when inserting into the hash table UInt64 -> UInt64, where the key is the visitor ID.
+ */
+inline UInt64 intHash64(UInt64 x)
+{
+ x ^= x >> 33;
+ x *= 0xff51afd7ed558ccdULL;
+ x ^= x >> 33;
+ x *= 0xc4ceb9fe1a85ec53ULL;
+ x ^= x >> 33;
+
+ return x;
+}
+
+/** CRC32C is not very high-quality as a hash function,
+  * according to avalanche and bit independence tests (see the SMHasher software), and it produces only a small number of bits,
+  * but it can behave well when used in hash tables,
+  * due to high speed (latency 3 + 1 clock cycles, throughput 1 per clock cycle).
+  * Hardware support requires SSE 4.2 (or CRC32 instructions on AArch64); otherwise the fallback below is used.
+ */
+#ifdef __SSE4_2__
+#include <nmmintrin.h>
+#endif
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+#include <arm_acle.h>
+#include <arm_neon.h>
+#endif
+
+inline UInt64 intHashCRC32(UInt64 x)
+{
+#ifdef __SSE4_2__
+ return _mm_crc32_u64(-1ULL, x);
+#elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+ return __crc32cd(-1U, x);
+#else
+ /// On other platforms we do not have CRC32. NOTE This can be confusing.
+ return intHash64(x);
+#endif
+}
+
+inline UInt64 intHashCRC32(UInt64 x, UInt64 updated_value)
+{
+#ifdef __SSE4_2__
+ return _mm_crc32_u64(updated_value, x);
+#elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+ return __crc32cd(updated_value, x);
+#else
+ /// On other platforms we do not have CRC32. NOTE This can be confusing.
+ return intHash64(x) ^ updated_value;
+#endif
+}
+
+template <typename T>
+inline typename std::enable_if<(sizeof(T) > sizeof(UInt64)), UInt64>::type
+intHashCRC32(const T & x, UInt64 updated_value)
+{
+ auto * begin = reinterpret_cast<const char *>(&x);
+ for (size_t i = 0; i < sizeof(T); i += sizeof(UInt64))
+ {
+ updated_value = intHashCRC32(unalignedLoad<UInt64>(begin), updated_value);
+ begin += sizeof(UInt64);
+ }
+
+ return updated_value;
+}
+
+
+inline UInt32 updateWeakHash32(const UInt8 * pos, size_t size, UInt32 updated_value)
+{
+ if (size < 8)
+ {
+ UInt64 value = 0;
+
+ switch (size)
+ {
+ case 0:
+ break;
+ case 1:
+ __builtin_memcpy(&value, pos, 1);
+ break;
+ case 2:
+ __builtin_memcpy(&value, pos, 2);
+ break;
+ case 3:
+ __builtin_memcpy(&value, pos, 3);
+ break;
+ case 4:
+ __builtin_memcpy(&value, pos, 4);
+ break;
+ case 5:
+ __builtin_memcpy(&value, pos, 5);
+ break;
+ case 6:
+ __builtin_memcpy(&value, pos, 6);
+ break;
+ case 7:
+ __builtin_memcpy(&value, pos, 7);
+ break;
+ default:
+ __builtin_unreachable();
+ }
+
+ reinterpret_cast<unsigned char *>(&value)[7] = size;
+ return intHashCRC32(value, updated_value);
+ }
+
+ const auto * end = pos + size;
+ while (pos + 8 <= end)
+ {
+ auto word = unalignedLoad<UInt64>(pos);
+ updated_value = intHashCRC32(word, updated_value);
+
+ pos += 8;
+ }
+
+ if (pos < end)
+ {
+        /// If the string size is not divisible by 8.
+        /// Let's assume the string was 'abcdefghXYZ', so its tail is 'XYZ'.
+        UInt8 tail_size = end - pos;
+        /// Load the trailing 8 bytes. Word is 'defghXYZ'.
+        auto word = unalignedLoad<UInt64>(end - 8);
+        /// Prepare a mask which will set the other 5 bytes to 0. It is 0xFFFFFFFFFFFFFFFF << 40 = 0xFFFFFF0000000000.
+        /// word & mask = '\0\0\0\0\0XYZ' (bytes are reversed because of little endianness)
+        word &= (~UInt64(0)) << UInt8(8 * (8 - tail_size));
+        /// Use the least significant byte to store the tail length.
+        word |= tail_size;
+        /// Now word is '\3\0\0\0\0XYZ'
+ updated_value = intHashCRC32(word, updated_value);
+ }
+
+ return updated_value;
+}
+
+template <typename T>
+inline size_t DefaultHash64(std::enable_if_t<(sizeof(T) <= sizeof(UInt64)), T> key)
+{
+ union
+ {
+ T in;
+ UInt64 out;
+ } u;
+ u.out = 0;
+ u.in = key;
+ return intHash64(u.out);
+}
+
+
+template <typename T>
+inline size_t DefaultHash64(std::enable_if_t<(sizeof(T) > sizeof(UInt64)), T> key)
+{
+ if constexpr (is_big_int_v<T> && sizeof(T) == 16)
+ {
+ /// TODO This is classical antipattern.
+ return intHash64(
+ static_cast<UInt64>(key) ^
+ static_cast<UInt64>(key >> 64));
+ }
+ else if constexpr (is_big_int_v<T> && sizeof(T) == 32)
+ {
+ return intHash64(
+ static_cast<UInt64>(key) ^
+ static_cast<UInt64>(key >> 64) ^
+ static_cast<UInt64>(key >> 128) ^
+ static_cast<UInt64>(key >> 256));
+ }
+ assert(false);
+ __builtin_unreachable();
+}
+
+
+template <typename T>
+struct DefaultHash
+{
+ size_t operator() (T key) const
+ {
+ return DefaultHash64<T>(key);
+ }
+};
+
+template <typename T> struct HashCRC32;
+
+template <typename T>
+inline size_t hashCRC32(std::enable_if_t<(sizeof(T) <= sizeof(UInt64)), T> key)
+{
+ union
+ {
+ T in;
+ UInt64 out;
+ } u;
+ u.out = 0;
+ u.in = key;
+ return intHashCRC32(u.out);
+}
+
+template <typename T>
+inline size_t hashCRC32(std::enable_if_t<(sizeof(T) > sizeof(UInt64)), T> key)
+{
+ return intHashCRC32(key, -1);
+}
+
+#define DEFINE_HASH(T) \
+template <> struct HashCRC32<T>\
+{\
+ size_t operator() (T key) const\
+ {\
+ return hashCRC32<T>(key);\
+ }\
+};
+
+DEFINE_HASH(UInt8)
+DEFINE_HASH(UInt16)
+DEFINE_HASH(UInt32)
+DEFINE_HASH(UInt64)
+DEFINE_HASH(UInt128)
+DEFINE_HASH(UInt256)
+DEFINE_HASH(Int8)
+DEFINE_HASH(Int16)
+DEFINE_HASH(Int32)
+DEFINE_HASH(Int64)
+DEFINE_HASH(Int128)
+DEFINE_HASH(Int256)
+DEFINE_HASH(float)
+DEFINE_HASH(double)
+
+#undef DEFINE_HASH
+
+
+struct UInt128Hash
+{
+ size_t operator()(UInt128 x) const
+ {
+ return ::Hash128to64({x.items[0], x.items[1]});
+ }
+};
+#if 0
+struct UUIDHash
+{
+ size_t operator()(DB::UUID x) const
+ {
+ return UInt128Hash()(x.toUnderType());
+ }
+};
+#endif
+#ifdef __SSE4_2__
+
+struct UInt128HashCRC32
+{
+ size_t operator()(UInt128 x) const
+ {
+ UInt64 crc = -1ULL;
+ crc = _mm_crc32_u64(crc, x.items[0]);
+ crc = _mm_crc32_u64(crc, x.items[1]);
+ return crc;
+ }
+};
+
+#else
+
+/// On other platforms we do not use CRC32. NOTE This can be confusing.
+struct UInt128HashCRC32 : public UInt128Hash {};
+
+#endif
+
+struct UInt128TrivialHash
+{
+ size_t operator()(UInt128 x) const { return x.items[0]; }
+};
+#if 0
+struct UUIDTrivialHash
+{
+ size_t operator()(DB::UUID x) const { return x.toUnderType().items[0]; }
+};
+#endif
+struct UInt256Hash
+{
+ size_t operator()(UInt256 x) const
+ {
+ /// NOTE suboptimal
+ return ::Hash128to64({
+ ::Hash128to64({x.items[0], x.items[1]}),
+ ::Hash128to64({x.items[2], x.items[3]})});
+ }
+};
+
+#ifdef __SSE4_2__
+
+struct UInt256HashCRC32
+{
+ size_t operator()(UInt256 x) const
+ {
+ UInt64 crc = -1ULL;
+ crc = _mm_crc32_u64(crc, x.items[0]);
+ crc = _mm_crc32_u64(crc, x.items[1]);
+ crc = _mm_crc32_u64(crc, x.items[2]);
+ crc = _mm_crc32_u64(crc, x.items[3]);
+ return crc;
+ }
+};
+
+#else
+
+/// We do not need to use CRC32 on other platforms. NOTE This can be confusing.
+struct UInt256HashCRC32 : public UInt256Hash {};
+
+#endif
+
+template <>
+struct DefaultHash<UInt128> : public UInt128Hash {};
+
+template <>
+struct DefaultHash<UInt256> : public UInt256Hash {};
+#if 0
+template <>
+struct DefaultHash<DB::UUID> : public UUIDHash {};
+#endif
+
+/// It is reasonable to use this for UInt8 and UInt16 keys when the hash table is sufficiently large.
+struct TrivialHash
+{
+ template <typename T>
+ size_t operator() (T key) const
+ {
+ return key;
+ }
+};
+
+
+/** A relatively good non-cryptographic hash function from UInt64 to UInt32.
+  * But worse (both in quality and speed) than simply truncating intHash64.
+  * Taken from here: http://www.concentric.net/~ttwang/tech/inthash.htm
+  *
+  * Slightly changed compared to the linked function: right shifts were accidentally replaced with cyclic right shifts.
+  * This change did not affect the smhasher test results.
+  *
+  * It is recommended to use a different salt for different tasks.
+  * There was a case where, in a database, values were sorted by hash (for a low-quality pseudo-random spread),
+  * while in another place, in an aggregate function, the same hash was used in a hash table;
+  * as a result, that aggregate function was monstrously slowed down due to collisions.
+  *
+  * NOTE Salting is far from perfect, because it commutes with the first steps of the calculation.
+  *
+  * NOTE As mentioned, this function is slower than intHash64.
+  * But occasionally it is faster, when used in a loop and the loop is vectorized.
+ */
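+
+/** Illustrative sketch (not part of the original sources): distinct salts decorrelate a hash used for
+  * data ordering from the hash used inside a hash table, which is exactly the failure mode described above.
+  * The salt constants and the `key` / `num_buckets` variables below are arbitrary placeholders.
+  *
+  *     UInt32 bucket = intHash32<0x8b5df6e2a01c37ffULL>(key) % num_buckets;   /// e.g. pseudo-random spreading
+  *     UInt32 probe  = intHash32<0x12b4a7c9de05f381ULL>(key);                 /// independent value for table probing
+  */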
+template <UInt64 salt>
+inline UInt32 intHash32(UInt64 key)
+{
+ key ^= salt;
+
+ key = (~key) + (key << 18);
+ key = key ^ ((key >> 31) | (key << 33));
+ key = key * 21;
+ key = key ^ ((key >> 11) | (key << 53));
+ key = key + (key << 6);
+ key = key ^ ((key >> 22) | (key << 42));
+
+ return key;
+}
+
+
+/// For containers.
+template <typename T, UInt64 salt = 0>
+struct IntHash32
+{
+ size_t operator() (const T & key) const
+ {
+ if constexpr (is_big_int_v<T> && sizeof(T) == 16)
+ {
+ return intHash32<salt>(key.items[0] ^ key.items[1]);
+ }
+ else if constexpr (is_big_int_v<T> && sizeof(T) == 32)
+ {
+ return intHash32<salt>(key.items[0] ^ key.items[1] ^ key.items[2] ^ key.items[3]);
+ }
+ else if constexpr (sizeof(T) <= sizeof(UInt64))
+ {
+ return intHash32<salt>(key);
+ }
+
+ assert(false);
+ __builtin_unreachable();
+ }
+};
+
+template <>
+struct DefaultHash<CH::StringRef> : public CH::StringRefHash {};
+
+}
diff --git a/ydb/library/arrow_clickhouse/Common/HashTable/HashMap.h b/ydb/library/arrow_clickhouse/Common/HashTable/HashMap.h
new file mode 100644
index 00000000000..a76cd5b353d
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/Common/HashTable/HashMap.h
@@ -0,0 +1,314 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#pragma once
+
+#include <Common/HashTable/Hash.h>
+#include <Common/HashTable/HashTable.h>
+#include <Common/HashTable/HashTableAllocator.h>
+
+
+/** NOTE HashMap can only be used for memmoveable (position-independent) types.
+  * Example: std::string is not position independent in libstdc++ with the C++11 ABI or in libc++.
+  * Also, the key in the hash table must be of a type for which an all-zero byte pattern compares equal to the zero key.
+ */
+
+namespace CH
+{
+
+struct NoInitTag
+{
+};
+
+/// A pair that does not initialize the elements, if not needed.
+template <typename First, typename Second>
+struct PairNoInit
+{
+ First first;
+ Second second;
+
+ PairNoInit() {} /// NOLINT
+
+ template <typename FirstValue>
+ PairNoInit(FirstValue && first_, NoInitTag)
+ : first(std::forward<FirstValue>(first_))
+ {
+ }
+
+ template <typename FirstValue, typename SecondValue>
+ PairNoInit(FirstValue && first_, SecondValue && second_)
+ : first(std::forward<FirstValue>(first_))
+ , second(std::forward<SecondValue>(second_))
+ {
+ }
+};
+
+template <typename First, typename Second>
+PairNoInit<std::decay_t<First>, std::decay_t<Second>> makePairNoInit(First && first, Second && second)
+{
+ return PairNoInit<std::decay_t<First>, std::decay_t<Second>>(std::forward<First>(first), std::forward<Second>(second));
+}
+
+
+template <typename Key, typename TMapped, typename Hash, typename TState = HashTableNoState>
+struct HashMapCell
+{
+ using Mapped = TMapped;
+ using State = TState;
+
+ using value_type = PairNoInit<Key, Mapped>;
+ using mapped_type = Mapped;
+ using key_type = Key;
+
+ value_type value;
+
+ HashMapCell() = default;
+ HashMapCell(const Key & key_, const State &) : value(key_, NoInitTag()) {}
+ HashMapCell(const value_type & value_, const State &) : value(value_) {}
+
+ /// Get the key (externally).
+ const Key & getKey() const { return value.first; }
+ Mapped & getMapped() { return value.second; }
+ const Mapped & getMapped() const { return value.second; }
+ const value_type & getValue() const { return value; }
+
+ /// Get the key (internally).
+ static const Key & getKey(const value_type & value) { return value.first; }
+
+ bool keyEquals(const Key & key_) const { return bitEquals(value.first, key_); }
+ bool keyEquals(const Key & key_, size_t /*hash_*/) const { return bitEquals(value.first, key_); }
+ bool keyEquals(const Key & key_, size_t /*hash_*/, const State & /*state*/) const { return bitEquals(value.first, key_); }
+
+ void setHash(size_t /*hash_value*/) {}
+ size_t getHash(const Hash & hash) const { return hash(value.first); }
+
+ bool isZero(const State & state) const { return isZero(value.first, state); }
+ static bool isZero(const Key & key, const State & /*state*/) { return ZeroTraits::check(key); }
+
+ /// Set the key value to zero.
+ void setZero() { ZeroTraits::set(value.first); }
+
+ /// Do I need to store the zero key separately (that is, can a zero key be inserted into the hash table).
+ static constexpr bool need_zero_value_storage = true;
+
+ void setMapped(const value_type & value_) { value.second = value_.second; }
+#if 0
+ /// Serialization, in binary and text form.
+ void write(DB::WriteBuffer & wb) const
+ {
+ DB::writeBinary(value.first, wb);
+ DB::writeBinary(value.second, wb);
+ }
+
+ void writeText(DB::WriteBuffer & wb) const
+ {
+ DB::writeDoubleQuoted(value.first, wb);
+ DB::writeChar(',', wb);
+ DB::writeDoubleQuoted(value.second, wb);
+ }
+
+ /// Deserialization, in binary and text form.
+ void read(DB::ReadBuffer & rb)
+ {
+ DB::readBinary(value.first, rb);
+ DB::readBinary(value.second, rb);
+ }
+
+ void readText(DB::ReadBuffer & rb)
+ {
+ DB::readDoubleQuoted(value.first, rb);
+ DB::assertChar(',', rb);
+ DB::readDoubleQuoted(value.second, rb);
+ }
+#endif
+ static bool constexpr need_to_notify_cell_during_move = false;
+
+ static void move(HashMapCell * /* old_location */, HashMapCell * /* new_location */) {}
+
+ template <size_t I>
+ auto & get() & {
+ if constexpr (I == 0) return value.first;
+ else if constexpr (I == 1) return value.second;
+ }
+
+ template <size_t I>
+ auto const & get() const & {
+ if constexpr (I == 0) return value.first;
+ else if constexpr (I == 1) return value.second;
+ }
+
+ template <size_t I>
+ auto && get() && {
+ if constexpr (I == 0) return std::move(value.first);
+ else if constexpr (I == 1) return std::move(value.second);
+ }
+
+};
+
+template <typename Key, typename TMapped, typename Hash, typename TState = HashTableNoState>
+struct HashMapCellWithSavedHash : public HashMapCell<Key, TMapped, Hash, TState>
+{
+ using Base = HashMapCell<Key, TMapped, Hash, TState>;
+
+ size_t saved_hash;
+
+ using Base::Base;
+
+ bool keyEquals(const Key & key_) const { return bitEquals(this->value.first, key_); }
+ bool keyEquals(const Key & key_, size_t hash_) const { return saved_hash == hash_ && bitEquals(this->value.first, key_); }
+ bool keyEquals(const Key & key_, size_t hash_, const typename Base::State &) const { return keyEquals(key_, hash_); }
+
+ void setHash(size_t hash_value) { saved_hash = hash_value; }
+ size_t getHash(const Hash & /*hash_function*/) const { return saved_hash; }
+};
+
+template <
+ typename Key,
+ typename Cell,
+ typename Hash = DefaultHash<Key>,
+ typename Grower = HashTableGrower<>,
+ typename Allocator = HashTableAllocator>
+class HashMapTable : public HashTable<Key, Cell, Hash, Grower, Allocator>
+{
+public:
+ using Self = HashMapTable;
+ using Base = HashTable<Key, Cell, Hash, Grower, Allocator>;
+ using LookupResult = typename Base::LookupResult;
+
+ using Base::Base;
+
+    /// Merge every cell's value of the current map into the destination map via emplace.
+    /// Func should have the signature void(Mapped & dst, Mapped & src, bool emplaced).
+    /// Each filled cell in the current map will invoke func once. If the destination map doesn't
+    /// have a key equal to the given cell's, a new cell gets emplaced into that map,
+    /// and func is invoked with the third argument emplaced set to true. Otherwise
+    /// emplaced is set to false.
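+    ///
+    /// Illustrative sketch (not part of the original sources), merging per-key sums from a map `other`
+    /// into a map `dst` of the same type (both are hypothetical HashMap<Key, UInt64> instances):
+    ///
+    ///     other.mergeToViaEmplace(dst, [](UInt64 & dst_sum, UInt64 & src_sum, bool emplaced)
+    ///     {
+    ///         if (emplaced)
+    ///             dst_sum = src_sum;   /// the freshly emplaced cell's mapped value is not initialized
+    ///         else
+    ///             dst_sum += src_sum;
+    ///     });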
+ template <typename Func>
+ void ALWAYS_INLINE mergeToViaEmplace(Self & that, Func && func)
+ {
+ for (auto it = this->begin(), end = this->end(); it != end; ++it)
+ {
+ typename Self::LookupResult res_it;
+ bool inserted;
+ that.emplace(Cell::getKey(it->getValue()), res_it, inserted, it.getHash());
+ func(res_it->getMapped(), it->getMapped(), inserted);
+ }
+ }
+
+    /// Merge every cell's value of the current map into the destination map via find.
+    /// Func should have the signature void(Mapped & dst, Mapped & src, bool exist).
+    /// Each filled cell in the current map will invoke func once. If the destination map doesn't
+    /// have a key equal to the given cell's, func is invoked with the third argument
+    /// exist set to false. Otherwise exist is set to true.
+ template <typename Func>
+ void ALWAYS_INLINE mergeToViaFind(Self & that, Func && func)
+ {
+ for (auto it = this->begin(), end = this->end(); it != end; ++it)
+ {
+ auto res_it = that.find(Cell::getKey(it->getValue()), it.getHash());
+ if (!res_it)
+ func(it->getMapped(), it->getMapped(), false);
+ else
+ func(res_it->getMapped(), it->getMapped(), true);
+ }
+ }
+
+ /// Call func(const Key &, Mapped &) for each hash map element.
+ template <typename Func>
+ void forEachValue(Func && func)
+ {
+ for (auto & v : *this)
+ func(v.getKey(), v.getMapped());
+ }
+
+ /// Call func(Mapped &) for each hash map element.
+ template <typename Func>
+ void forEachMapped(Func && func)
+ {
+ for (auto & v : *this)
+ func(v.getMapped());
+ }
+
+ typename Cell::Mapped & ALWAYS_INLINE operator[](const Key & x)
+ {
+ LookupResult it;
+ bool inserted;
+ this->emplace(x, it, inserted);
+
+        /** It may seem that initialization is not necessary for POD types (or __has_trivial_constructor),
+          * since the hash table memory is initially zero-initialized.
+          * But, in fact, an empty cell may not be zero-initialized in the following cases:
+          * - ZeroValueStorage (it only zeroes the key);
+          * - after resizing and moving a part of the cells to the new half of the hash table, the old cells also have only the key zeroed.
+          *
+          * Performance-wise, there is almost always no difference, because it->second is usually assigned immediately
+          *  after calling `operator[]`, and since `operator[]` is inlined, the compiler removes the unnecessary initialization.
+          *
+          * Sometimes, thanks to the initialization, performance even improves. This occurs in code like `++map[key]`.
+          * When we do the initialization, for new cells it's enough to emit `store 1` right away.
+          * And if we did not initialize, then even though the cell held zero,
+          * the compiler cannot know this, and generates the `load`, `increment`, `store` sequence.
+          */
+ if (inserted)
+ new (&it->getMapped()) typename Cell::Mapped();
+
+ return it->getMapped();
+ }
+};
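+
+/** Illustrative sketch (not part of the original sources): the typical aggregation pattern with HashMap
+  * (declared just below). `keys` is a placeholder container of UInt64 values.
+  *
+  *     HashMap<UInt64, UInt64> counts;
+  *     for (UInt64 key : keys)
+  *         ++counts[key];              /// operator[] value-initializes the mapped value on first insertion
+  *
+  *     UInt64 total = 0;
+  *     counts.forEachMapped([&](UInt64 & n) { total += n; });
+  */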
+
+template <
+ typename Key,
+ typename Mapped,
+ typename Hash = DefaultHash<Key>,
+ typename Grower = HashTableGrower<>,
+ typename Allocator = HashTableAllocator>
+using HashMap = HashMapTable<Key, HashMapCell<Key, Mapped, Hash>, Hash, Grower, Allocator>;
+
+
+template <
+ typename Key,
+ typename Mapped,
+ typename Hash = DefaultHash<Key>,
+ typename Grower = HashTableGrower<>,
+ typename Allocator = HashTableAllocator>
+using HashMapWithSavedHash = HashMapTable<Key, HashMapCellWithSavedHash<Key, Mapped, Hash>, Hash, Grower, Allocator>;
+
+template <typename Key, typename Mapped, typename Hash,
+ size_t initial_size_degree>
+using HashMapWithStackMemory = HashMapTable<
+ Key,
+ HashMapCellWithSavedHash<Key, Mapped, Hash>,
+ Hash,
+ HashTableGrower<initial_size_degree>,
+ HashTableAllocatorWithStackMemory<
+ (1ULL << initial_size_degree)
+ * sizeof(HashMapCellWithSavedHash<Key, Mapped, Hash>)>>;
+
+}
+
+namespace std
+{
+
+ template <typename Key, typename TMapped, typename Hash, typename TState>
+ struct tuple_size<CH::HashMapCell<Key, TMapped, Hash, TState>> : std::integral_constant<size_t, 2> { };
+
+ template <typename Key, typename TMapped, typename Hash, typename TState>
+ struct tuple_element<0, CH::HashMapCell<Key, TMapped, Hash, TState>> { using type = Key; };
+
+ template <typename Key, typename TMapped, typename Hash, typename TState>
+ struct tuple_element<1, CH::HashMapCell<Key, TMapped, Hash, TState>> { using type = TMapped; };
+}
+
+namespace std
+{
+
+ template <typename Key, typename TMapped, typename Hash, typename TState>
+ struct tuple_size<CH::HashMapCellWithSavedHash<Key, TMapped, Hash, TState>> : std::integral_constant<size_t, 2> { };
+
+ template <typename Key, typename TMapped, typename Hash, typename TState>
+ struct tuple_element<0, CH::HashMapCellWithSavedHash<Key, TMapped, Hash, TState>> { using type = Key; };
+
+ template <typename Key, typename TMapped, typename Hash, typename TState>
+ struct tuple_element<1, CH::HashMapCellWithSavedHash<Key, TMapped, Hash, TState>> { using type = TMapped; };
+}
diff --git a/ydb/library/arrow_clickhouse/Common/HashTable/HashSet.h b/ydb/library/arrow_clickhouse/Common/HashTable/HashSet.h
new file mode 100644
index 00000000000..0f349e5d9c3
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/Common/HashTable/HashSet.h
@@ -0,0 +1,124 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#pragma once
+
+#include <Common/HashTable/Hash.h>
+#include <Common/HashTable/HashTable.h>
+#include <Common/HashTable/HashTableAllocator.h>
+
+/** NOTE HashSet can only be used for memmoveable (position-independent) types.
+ * Example: std::string is not position independent in libstdc++ with the C++11 ABI or in libc++.
+ * Also, the key must be of a type for which an all-zero byte pattern compares equal to the zero key.
+ */
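+
+/** Illustrative sketch (not part of the original sources):
+ *
+ *     HashSet<UInt64> seen;
+ *     bool first_time = seen.insert(42).second;    /// true: the key was not present
+ *     bool second_time = seen.insert(42).second;   /// false: the key is already present
+ */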
+
+namespace CH
+{
+
+template
+<
+ typename Key,
+ typename TCell,
+ typename Hash = DefaultHash<Key>,
+ typename Grower = HashTableGrower<>,
+ typename Allocator = HashTableAllocator
+>
+class HashSetTable : public HashTable<Key, TCell, Hash, Grower, Allocator>
+{
+public:
+ using Self = HashSetTable;
+ using Cell = TCell;
+
+ using Base = HashTable<Key, TCell, Hash, Grower, Allocator>;
+ using typename Base::LookupResult;
+
+ void merge(const Self & rhs)
+ {
+ if (!this->hasZero() && rhs.hasZero())
+ {
+ this->setHasZero();
+ ++this->m_size;
+ }
+
+ for (size_t i = 0; i < rhs.grower.bufSize(); ++i)
+ if (!rhs.buf[i].isZero(*this))
+ this->insert(rhs.buf[i].getValue());
+ }
+
+#if 0
+ void readAndMerge(DB::ReadBuffer & rb)
+ {
+ Cell::State::read(rb);
+
+ size_t new_size = 0;
+ DB::readVarUInt(new_size, rb);
+
+ this->resize(new_size);
+
+ for (size_t i = 0; i < new_size; ++i)
+ {
+ Cell x;
+ x.read(rb);
+ this->insert(x.getValue());
+ }
+ }
+#endif
+};
+
+
+template <typename Key, typename Hash, typename TState = HashTableNoState>
+struct HashSetCellWithSavedHash : public HashTableCell<Key, Hash, TState>
+{
+ using Base = HashTableCell<Key, Hash, TState>;
+
+ size_t saved_hash;
+
+ HashSetCellWithSavedHash() : Base() {} //-V730
+ HashSetCellWithSavedHash(const Key & key_, const typename Base::State & state) : Base(key_, state) {} //-V730
+
+ bool keyEquals(const Key & key_) const { return bitEquals(this->key, key_); }
+ bool keyEquals(const Key & key_, size_t hash_) const { return saved_hash == hash_ && bitEquals(this->key, key_); }
+ bool keyEquals(const Key & key_, size_t hash_, const typename Base::State &) const { return keyEquals(key_, hash_); }
+
+ void setHash(size_t hash_value) { saved_hash = hash_value; }
+ size_t getHash(const Hash & /*hash_function*/) const { return saved_hash; }
+};
+
+template
+<
+ typename Key,
+ typename Hash = DefaultHash<Key>,
+ typename Grower = HashTableGrower<>,
+ typename Allocator = HashTableAllocator
+>
+using HashSet = HashSetTable<Key, HashTableCell<Key, Hash>, Hash, Grower, Allocator>;
+
+template <typename Key, typename Hash, size_t initial_size_degree>
+using HashSetWithStackMemory = HashSet<
+ Key,
+ Hash,
+ HashTableGrower<initial_size_degree>,
+ HashTableAllocatorWithStackMemory<
+ (1ULL << initial_size_degree)
+ * sizeof(HashTableCell<Key, Hash>)>>;
+
+template
+<
+ typename Key,
+ typename Hash = DefaultHash<Key>,
+ typename Grower = HashTableGrower<>,
+ typename Allocator = HashTableAllocator
+>
+using HashSetWithSavedHash = HashSetTable<Key, HashSetCellWithSavedHash<Key, Hash>, Hash, Grower, Allocator>;
+
+template <typename Key, typename Hash, size_t initial_size_degree>
+using HashSetWithSavedHashWithStackMemory = HashSetWithSavedHash<
+ Key,
+ Hash,
+ HashTableGrower<initial_size_degree>,
+ HashTableAllocatorWithStackMemory<
+ (1ULL << initial_size_degree)
+ * sizeof(HashSetCellWithSavedHash<Key, Hash>)>>;
+
+}
diff --git a/ydb/library/arrow_clickhouse/Common/HashTable/HashTable.h b/ydb/library/arrow_clickhouse/Common/HashTable/HashTable.h
new file mode 100644
index 00000000000..98fe66a0df3
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/Common/HashTable/HashTable.h
@@ -0,0 +1,1311 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#pragma once
+
+#include <string.h>
+#include <math.h>
+
+#include <new>
+#include <utility>
+
+//#include <boost/noncopyable.hpp>
+
+#include <common/types.h>
+
+#include <Common/HashTable/HashTableAllocator.h>
+#include <Common/HashTable/HashTableKeyHolder.h>
+
+#ifdef DBMS_HASH_MAP_DEBUG_RESIZES
+ #include <iostream>
+ #include <iomanip>
+#endif
+
+/** NOTE HashTable can only be used for memmoveable (position-independent) types.
+  * Example: std::string is not position independent in libstdc++ with the C++11 ABI or in libc++.
+  * Also, the key in the hash table must be of a type for which an all-zero byte pattern compares equal to the zero key.
+ */
+
+namespace CH
+{
+
+/** The state of the hash table that affects the properties of its cells.
+ * Used as a template parameter.
+ * For example, there is an implementation of an instantly clearable hash table - ClearableHashMap.
+  * For it, each cell holds a version number, and the hash table itself holds the current version.
+  * When clearing, the current version simply increases; all cells with a mismatching version are considered empty.
+  * Another example: for an approximate calculation of the number of unique visitors, there is a hash table for UniquesHashSet.
+  * It has the concept of "degree". At each overflow, cells whose keys are not divisible by the corresponding power of two are deleted.
+ */
+struct HashTableNoState
+{
+#if 0
+ /// Serialization, in binary and text form.
+ void write(DB::WriteBuffer &) const {}
+ void writeText(DB::WriteBuffer &) const {}
+
+ /// Deserialization, in binary and text form.
+ void read(DB::ReadBuffer &) {}
+ void readText(DB::ReadBuffer &) {}
+#endif
+};
+
+
+/// These functions can be overloaded for custom types.
+namespace ZeroTraits
+{
+
+template <typename T>
+bool check(const T x) { return x == T{}; }
+
+template <typename T>
+void set(T & x) { x = {}; }
+
+}
+
+
+/** Numbers are compared bitwise.
+ * Complex types are compared by operator== as usual (this is important if there are gaps).
+ *
+ * This is needed if you use floats as keys. They are compared by bit equality.
+ * Otherwise the invariants of hash table probing are not met when NaNs are present.
+ */
+template <typename T>
+inline bool bitEquals(T && a, T && b)
+{
+ using RealT = std::decay_t<T>;
+
+ if constexpr (std::is_floating_point_v<RealT>)
+ return 0 == memcmp(&a, &b, sizeof(RealT)); /// Note that memcmp with constant size is compiler builtin.
+ else
+ return a == b;
+}
+
+
+/**
+ * getKey/Mapped -- methods to get key/"mapped" values from the LookupResult returned by find() and
+ * emplace() methods of HashTable. Must not be called for a null LookupResult.
+ *
+ * We don't use iterators for the lookup result. Instead, LookupResult is a pointer of some kind. There
+ * are getKey/getMapped methods that return references to (or values of) the key / "mapped" value.
+ *
+ * Different hash table implementations support this interface to a varying degree:
+ *
+ * 1) Hash tables that store neither the key in its original form, nor a "mapped" value:
+ * FixedHashTable or StringHashTable. Neither GetKey nor GetMapped are supported, the only valid
+ * operation is checking LookupResult for null.
+ *
+ * 2) Hash maps that do not store the key, e.g. FixedHashMap or StringHashMap. Only GetMapped is
+ * supported.
+ *
+ * 3) Hash tables that store the key and do not have a "mapped" value, e.g. the normal HashTable.
+ * GetKey returns the key, and GetMapped returns a zero void pointer. This simplifies generic
+ * code that works with mapped values: it can overload on the return type of GetMapped(), and
+ * doesn't need other parameters. One example is insertSetMapped() function.
+ *
+ * 4) Hash tables that store both the key and the "mapped" value, e.g. HashMap. Both GetKey and
+ * GetMapped are supported.
+ *
+ * The implementation side goes as follows:
+ *
+ * for (1), LookupResult->getKey = const VoidKey, LookupResult->getMapped = VoidMapped;
+ *
+ * for (2), LookupResult->getKey = const VoidKey, LookupResult->getMapped = Mapped &;
+ *
+ * for (3) and (4), LookupResult->getKey = const Key [&], LookupResult->getMapped = Mapped &;
+ * VoidKey and VoidMapped may have specialized function overloads for generic code.
+ */
+
+struct VoidKey {};
+struct VoidMapped
+{
+ template <typename T>
+ auto & operator=(const T &)
+ {
+ return *this;
+ }
+};
+
+/** Compile-time interface for cell of the hash table.
+ * Different cell types are used to implement different hash tables.
+ * The cell must contain a key.
+ * It can also contain a value and arbitrary additional data
+ * (example: the stored hash value; version number for ClearableHashMap).
+ */
+template <typename Key, typename Hash, typename TState = HashTableNoState>
+struct HashTableCell
+{
+ using State = TState;
+
+ using key_type = Key;
+ using value_type = Key;
+ using mapped_type = VoidMapped;
+
+ Key key;
+
+ HashTableCell() {}
+
+ /// Create a cell with the given key / key and value.
+ HashTableCell(const Key & key_, const State &) : key(key_) {}
+
+ /// Get the key (externally).
+ const Key & getKey() const { return key; }
+ VoidMapped getMapped() const { return {}; }
+ const value_type & getValue() const { return key; }
+
+ /// Get the key (internally).
+ static const Key & getKey(const value_type & value) { return value; }
+
+ /// Are the keys at the cells equal?
+ bool keyEquals(const Key & key_) const { return bitEquals(key, key_); }
+ bool keyEquals(const Key & key_, size_t /*hash_*/) const { return bitEquals(key, key_); }
+ bool keyEquals(const Key & key_, size_t /*hash_*/, const State & /*state*/) const { return bitEquals(key, key_); }
+
+ /// If the cell can remember the value of the hash function, then remember it.
+ void setHash(size_t /*hash_value*/) {}
+
+ /// If the cell can store the hash value in itself, then return the stored value.
+ /// It must be at least once calculated before.
+ /// If storing the hash value is not provided, then just compute the hash.
+ size_t getHash(const Hash & hash) const { return hash(key); }
+
+ /// Whether the key is zero. In the main buffer, cells with a zero key are considered empty.
+ /// If zero keys can be inserted into the table, then the cell for the zero key is stored separately, not in the main buffer.
+    /// Zero keys must be such that a zeroed-out piece of memory is a zero key.
+ bool isZero(const State & state) const { return isZero(key, state); }
+ static bool isZero(const Key & key, const State & /*state*/) { return ZeroTraits::check(key); }
+
+ /// Set the key value to zero.
+ void setZero() { ZeroTraits::set(key); }
+
+    /// Does the hash table need to store the zero key separately (that is, can a zero key be inserted into the hash table)?
+ static constexpr bool need_zero_value_storage = true;
+
+ /// Set the mapped value, if any (for HashMap), to the corresponding `value`.
+ void setMapped(const value_type & /*value*/) {}
+#if 0
+ /// Serialization, in binary and text form.
+ void write(DB::WriteBuffer & wb) const { DB::writeBinary(key, wb); }
+ void writeText(DB::WriteBuffer & wb) const { DB::writeDoubleQuoted(key, wb); }
+
+ /// Deserialization, in binary and text form.
+ void read(DB::ReadBuffer & rb) { DB::readBinary(key, rb); }
+ void readText(DB::ReadBuffer & rb) { DB::readDoubleQuoted(key, rb); }
+#endif
+ /// When cell pointer is moved during erase, reinsert or resize operations
+
+ static constexpr bool need_to_notify_cell_during_move = false;
+
+ static void move(HashTableCell * /* old_location */, HashTableCell * /* new_location */) {}
+
+};
+
+/**
+ * A helper function for HashTable::insert() to set the "mapped" value.
+ * Overloaded on the mapped type, does nothing if it's VoidMapped.
+ */
+template <typename ValueType>
+void insertSetMapped(VoidMapped /* dest */, const ValueType & /* src */) {}
+
+template <typename MappedType, typename ValueType>
+void insertSetMapped(MappedType & dest, const ValueType & src) { dest = src.second; }
+
+
+/** Determines the size of the hash table, and when and how much it should be resized.
+ */
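+
+/** Worked example (illustrative, not from the original sources): with the default initial_size_degree = 8
+ * the table starts with bufSize() = 2^8 = 256 cells and maxFill() = 2^7 = 128. Inserting the 129th element
+ * makes overflow() return true, increaseSize() bumps size_degree by 2 (to 10), and the buffer grows to 1024 cells;
+ * once size_degree reaches 23, each resize grows the buffer by a factor of 2 instead of 4.
+ */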
+template <size_t initial_size_degree = 8>
+struct HashTableGrower
+{
+ /// The state of this structure is enough to get the buffer size of the hash table.
+
+ UInt8 size_degree = initial_size_degree;
+ static constexpr auto initial_count = 1ULL << initial_size_degree;
+
+ /// If collision resolution chains are contiguous, we can implement erase operation by moving the elements.
+ static constexpr auto performs_linear_probing_with_single_step = true;
+
+ /// The size of the hash table in the cells.
+ size_t bufSize() const { return 1ULL << size_degree; }
+
+ size_t maxFill() const { return 1ULL << (size_degree - 1); }
+ size_t mask() const { return bufSize() - 1; }
+
+ /// From the hash value, get the cell number in the hash table.
+ size_t place(size_t x) const { return x & mask(); }
+
+ /// The next cell in the collision resolution chain.
+ size_t next(size_t pos) const { ++pos; return pos & mask(); }
+
+ /// Whether the hash table is sufficiently full. You need to increase the size of the hash table, or remove something unnecessary from it.
+ bool overflow(size_t elems) const { return elems > maxFill(); }
+
+ /// Increase the size of the hash table.
+ void increaseSize()
+ {
+ size_degree += size_degree >= 23 ? 1 : 2;
+ }
+
+ /// Set the buffer size by the number of elements in the hash table. Used when deserializing a hash table.
+ void set(size_t num_elems)
+ {
+ size_degree = num_elems <= 1
+ ? initial_size_degree
+ : ((initial_size_degree > static_cast<size_t>(log2(num_elems - 1)) + 2)
+ ? initial_size_degree
+ : (static_cast<size_t>(log2(num_elems - 1)) + 2));
+ }
+
+ void setBufSize(size_t buf_size_)
+ {
+ size_degree = static_cast<size_t>(log2(buf_size_ - 1) + 1);
+ }
+};
+
+
+/** When used as a Grower, it turns a hash table into something like a lookup table.
+ * It remains non-optimal - the cells store the keys.
+ * Also, the compiler cannot completely remove the code that walks the collision resolution chain, even though it is not needed.
+ * NOTE: Better to use FixedHashTable instead.
+ */
+template <size_t key_bits>
+struct HashTableFixedGrower
+{
+ static constexpr auto initial_count = 1ULL << key_bits;
+
+ static constexpr auto performs_linear_probing_with_single_step = true;
+
+ size_t bufSize() const { return 1ULL << key_bits; }
+ size_t place(size_t x) const { return x; }
+    /// You could write __builtin_unreachable(), but the compiler does not optimize everything, and it turns out to be less efficient.
+ size_t next(size_t pos) const { return pos + 1; }
+ bool overflow(size_t /*elems*/) const { return false; }
+
+ void increaseSize() { __builtin_unreachable(); }
+ void set(size_t /*num_elems*/) {}
+ void setBufSize(size_t /*buf_size_*/) {}
+};
+
+
+/** If you want to store the zero key separately - a place to store it. */
+template <bool need_zero_value_storage, typename Cell>
+struct ZeroValueStorage;
+
+template <typename Cell>
+struct ZeroValueStorage<true, Cell> //-V730
+{
+private:
+ bool has_zero = false;
+ std::aligned_storage_t<sizeof(Cell), alignof(Cell)> zero_value_storage; /// Storage of element with zero key.
+
+public:
+ bool hasZero() const { return has_zero; }
+
+ void setHasZero()
+ {
+ has_zero = true;
+ new (zeroValue()) Cell();
+ }
+
+ void clearHasZero()
+ {
+ has_zero = false;
+ zeroValue()->~Cell();
+ }
+
+ Cell * zeroValue() { return std::launder(reinterpret_cast<Cell*>(&zero_value_storage)); }
+ const Cell * zeroValue() const { return std::launder(reinterpret_cast<const Cell*>(&zero_value_storage)); }
+};
+
+template <typename Cell>
+struct ZeroValueStorage<false, Cell>
+{
+ bool hasZero() const { return false; }
+ void setHasZero() { throw std::runtime_error("HashTable: logical error"); }
+ void clearHasZero() {}
+
+ Cell * zeroValue() { return nullptr; }
+ const Cell * zeroValue() const { return nullptr; }
+};
+
+
+template <bool enable, typename Allocator, typename Cell>
+struct AllocatorBufferDeleter;
+
+template <typename Allocator, typename Cell>
+struct AllocatorBufferDeleter<false, Allocator, Cell>
+{
+ AllocatorBufferDeleter(Allocator &, size_t) {}
+
+ void operator()(Cell *) const {}
+
+};
+
+template <typename Allocator, typename Cell>
+struct AllocatorBufferDeleter<true, Allocator, Cell>
+{
+ AllocatorBufferDeleter(Allocator & allocator_, size_t size_)
+ : allocator(allocator_)
+ , size(size_) {}
+
+ void operator()(Cell * buffer) const { allocator.free(buffer, size); }
+
+ Allocator & allocator;
+ size_t size;
+};
+
+
+// The HashTable
+template
+<
+ typename Key,
+ typename Cell,
+ typename Hash,
+ typename Grower,
+ typename Allocator
+>
+class HashTable :
+ //private boost::noncopyable,
+ protected Hash,
+ protected Allocator,
+ protected Cell::State,
+ protected ZeroValueStorage<Cell::need_zero_value_storage, Cell> /// empty base optimization
+{
+public:
+ // If we use an allocator with inline memory, check that the initial
+ // size of the hash table is in sync with the amount of this memory.
+ static constexpr size_t initial_buffer_bytes
+ = Grower::initial_count * sizeof(Cell);
+ static_assert(allocatorInitialBytes<Allocator> == 0
+ || allocatorInitialBytes<Allocator> == initial_buffer_bytes);
+
+protected:
+ friend class const_iterator;
+ friend class iterator;
+ friend class Reader;
+
+ template <typename, typename, typename, typename, typename, typename, size_t>
+ friend class TwoLevelHashTable;
+
+ template <typename, typename, size_t>
+ friend class TwoLevelStringHashTable;
+
+ template <typename SubMaps>
+ friend class StringHashTable;
+
+ using HashValue = size_t;
+ using Self = HashTable;
+
+ size_t m_size = 0; /// Amount of elements
+ Cell * buf; /// A piece of memory for all elements except the element with zero key.
+ Grower grower;
+
+#ifdef DBMS_HASH_MAP_COUNT_COLLISIONS
+ mutable size_t collisions = 0;
+#endif
+
+ /// Find a cell with the same key or an empty cell, starting from the specified position and further along the collision resolution chain.
+ size_t ALWAYS_INLINE findCell(const Key & x, size_t hash_value, size_t place_value) const
+ {
+ while (!buf[place_value].isZero(*this) && !buf[place_value].keyEquals(x, hash_value, *this))
+ {
+ place_value = grower.next(place_value);
+#ifdef DBMS_HASH_MAP_COUNT_COLLISIONS
+ ++collisions;
+#endif
+ }
+
+ return place_value;
+ }
+
+
+ /// Find an empty cell, starting with the specified position and further along the collision resolution chain.
+ size_t ALWAYS_INLINE findEmptyCell(size_t place_value) const
+ {
+ while (!buf[place_value].isZero(*this))
+ {
+ place_value = grower.next(place_value);
+#ifdef DBMS_HASH_MAP_COUNT_COLLISIONS
+ ++collisions;
+#endif
+ }
+
+ return place_value;
+ }
+
+ void alloc(const Grower & new_grower)
+ {
+ buf = reinterpret_cast<Cell *>(Allocator::alloc(new_grower.bufSize() * sizeof(Cell)));
+ grower = new_grower;
+ }
+
+ void free()
+ {
+ if (buf)
+ {
+ Allocator::free(buf, getBufferSizeInBytes());
+ buf = nullptr;
+ }
+ }
+
+ /// Increase the size of the buffer.
+ void resize(size_t for_num_elems = 0, size_t for_buf_size = 0)
+ {
+#ifdef DBMS_HASH_MAP_DEBUG_RESIZES
+ Stopwatch watch;
+#endif
+
+ size_t old_size = grower.bufSize();
+
+        /** So that the object remains in a correct state in case of an exception,
+          * changing the variable `grower` (which determines the buffer size of the hash table)
+          * is postponed until after the actual buffer change.
+ * The temporary variable `new_grower` is used to determine the new size.
+ */
+ Grower new_grower = grower;
+
+ if (for_num_elems)
+ {
+ new_grower.set(for_num_elems);
+ if (new_grower.bufSize() <= old_size)
+ return;
+ }
+ else if (for_buf_size)
+ {
+ new_grower.setBufSize(for_buf_size);
+ if (new_grower.bufSize() <= old_size)
+ return;
+ }
+ else
+ new_grower.increaseSize();
+
+ /// Expand the space.
+
+ size_t old_buffer_size = getBufferSizeInBytes();
+
+ /** If the cell must be notified during a move, we need to temporarily keep the old buffer,
+ * because realloc does not guarantee that the reallocated buffer has the same base address.
+ */
+ using Deleter = AllocatorBufferDeleter<Cell::need_to_notify_cell_during_move, Allocator, Cell>;
+ Deleter buffer_deleter(*this, old_buffer_size);
+ std::unique_ptr<Cell, Deleter> old_buffer(buf, buffer_deleter);
+
+ if constexpr (Cell::need_to_notify_cell_during_move)
+ {
+ buf = reinterpret_cast<Cell *>(Allocator::alloc(new_grower.bufSize() * sizeof(Cell)));
+ memcpy(reinterpret_cast<void *>(buf), reinterpret_cast<const void *>(old_buffer.get()), old_buffer_size);
+ }
+ else
+ buf = reinterpret_cast<Cell *>(Allocator::realloc(buf, old_buffer_size, new_grower.bufSize() * sizeof(Cell)));
+
+ grower = new_grower;
+
+ /** Now some items may need to be moved to a new location.
+ * The element can stay in place, or move to a new location "on the right",
+ * or move to the left of the collision resolution chain, because the elements to the left of it have been moved to the new "right" location.
+ */
+ size_t i = 0;
+ for (; i < old_size; ++i)
+ if (!buf[i].isZero(*this))
+ {
+ size_t updated_place_value = reinsert(buf[i], buf[i].getHash(*this));
+
+ if constexpr (Cell::need_to_notify_cell_during_move)
+ Cell::move(&(old_buffer.get())[i], &buf[updated_place_value]);
+ }
+
+ /** There is also a special case:
+ * if the element was supposed to be at the end of the old buffer, [ x]
+ * but is at the beginning because of the collision resolution chain, [o x]
+ * then after resizing, it will first be out of place again, [ xo ]
+ * and in order to transfer it where necessary,
+ * after transferring all the elements from the old part of the buffer you need to [ o x ]
+ * process the tail of the collision resolution chain immediately after it. [ o x ]
+ */
+ size_t new_size = grower.bufSize();
+ for (; i < new_size && !buf[i].isZero(*this); ++i)
+ {
+ size_t updated_place_value = reinsert(buf[i], buf[i].getHash(*this));
+
+ if constexpr (Cell::need_to_notify_cell_during_move)
+ if (&buf[i] != &buf[updated_place_value])
+ Cell::move(&buf[i], &buf[updated_place_value]);
+ }
+
+#ifdef DBMS_HASH_MAP_DEBUG_RESIZES
+ watch.stop();
+ std::cerr << std::fixed << std::setprecision(3)
+ << "Resize from " << old_size << " to " << grower.bufSize() << " took " << watch.elapsedSeconds() << " sec."
+ << std::endl;
+#endif
+ }
+
+
+ /** Insert into the new buffer a value that was in the old buffer.
+ * Used when increasing the buffer size.
+ */
+ size_t reinsert(Cell & x, size_t hash_value)
+ {
+ size_t place_value = grower.place(hash_value);
+
+ /// If the element is in its place.
+ if (&x == &buf[place_value])
+ return place_value;
+
+ /// Compute a new location, taking into account the collision resolution chain.
+ place_value = findCell(Cell::getKey(x.getValue()), hash_value, place_value);
+
+ /// If the item remains in its place in the old collision resolution chain.
+ if (!buf[place_value].isZero(*this))
+ return place_value;
+
+ /// Copy to a new location and zero the old one.
+ x.setHash(hash_value);
+ memcpy(static_cast<void*>(&buf[place_value]), &x, sizeof(x));
+ x.setZero();
+
+ /// Then the elements that previously were in collision with this can move to the old place.
+ return place_value;
+ }
+
+
+ void destroyElements()
+ {
+ if (!std::is_trivially_destructible_v<Cell>)
+ {
+ for (iterator it = begin(), it_end = end(); it != it_end; ++it)
+ {
+ it.ptr->~Cell();
+ /// In case of poison_in_dtor=1 it will be poisoned,
+ /// but it may be used later, during iteration.
+ ///
+ /// NOTE that technically this is UB [1], but OK for now.
+ ///
+ /// [1]: https://github.com/google/sanitizers/issues/854#issuecomment-329661378
+ //__msan_unpoison(it.ptr, sizeof(*it.ptr));
+ }
+ }
+ }
+
+
+ template <typename Derived, bool is_const>
+ class iterator_base
+ {
+ using Container = std::conditional_t<is_const, const Self, Self>;
+ using cell_type = std::conditional_t<is_const, const Cell, Cell>;
+
+ Container * container;
+ cell_type * ptr;
+
+ friend class HashTable;
+
+ public:
+ iterator_base() {}
+ iterator_base(Container * container_, cell_type * ptr_) : container(container_), ptr(ptr_) {}
+
+ bool operator== (const iterator_base & rhs) const { return ptr == rhs.ptr; }
+ bool operator!= (const iterator_base & rhs) const { return ptr != rhs.ptr; }
+
+ Derived & operator++()
+ {
+ /// If the iterator was pointing to ZeroValueStorage, move it to the beginning of the main buffer.
+ if (unlikely(ptr->isZero(*container)))
+ ptr = container->buf;
+ else
+ ++ptr;
+
+ /// Skip empty cells in the main buffer.
+ auto buf_end = container->buf + container->grower.bufSize();
+ while (ptr < buf_end && ptr->isZero(*container))
+ ++ptr;
+
+ return static_cast<Derived &>(*this);
+ }
+
+ auto & operator* () const { return *ptr; }
+ auto * operator->() const { return ptr; }
+
+ auto getPtr() const { return ptr; }
+ size_t getHash() const { return ptr->getHash(*container); }
+
+ size_t getCollisionChainLength() const
+ {
+ return container->grower.place((ptr - container->buf) - container->grower.place(getHash()));
+ }
+
+ /**
+ * A hack for HashedDictionary.
+ *
+ * The problem: std-like find() returns an iterator, which has to be
+ * compared to end(). On the other hand, HashMap::find() returns
+ * LookupResult, which is compared to nullptr. HashedDictionary has to
+ * support both hash maps with the same code, hence the need for this
+ * hack.
+ *
+ * The proper way would be to remove iterator interface from our
+ * HashMap completely, change all its users to the existing internal
+ * iteration interface, and redefine end() to return LookupResult for
+ * compatibility with std find(). Unfortunately, now is not the time to
+ * do this.
+ */
+ operator Cell * () const { return nullptr; }
+ };
+
+
+public:
+ using key_type = Key;
+ using mapped_type = typename Cell::mapped_type;
+ using value_type = typename Cell::value_type;
+ using cell_type = Cell;
+
+ using LookupResult = Cell *;
+ using ConstLookupResult = const Cell *;
+
+ size_t hash(const Key & x) const { return Hash::operator()(x); }
+
+
+ HashTable()
+ {
+ if (Cell::need_zero_value_storage)
+ this->zeroValue()->setZero();
+ alloc(grower);
+ }
+
+ HashTable(size_t reserve_for_num_elements)
+ {
+ if (Cell::need_zero_value_storage)
+ this->zeroValue()->setZero();
+ grower.set(reserve_for_num_elements);
+ alloc(grower);
+ }
+
+ HashTable(HashTable && rhs)
+ : buf(nullptr)
+ {
+ *this = std::move(rhs);
+ }
+
+ ~HashTable()
+ {
+ destroyElements();
+ free();
+ }
+
+ HashTable & operator= (HashTable && rhs)
+ {
+ destroyElements();
+ free();
+
+ std::swap(buf, rhs.buf);
+ std::swap(m_size, rhs.m_size);
+ std::swap(grower, rhs.grower);
+
+ Hash::operator=(std::move(rhs));
+ Allocator::operator=(std::move(rhs));
+ Cell::State::operator=(std::move(rhs));
+ ZeroValueStorage<Cell::need_zero_value_storage, Cell>::operator=(std::move(rhs));
+
+ return *this;
+ }
+#if 0
+ class Reader final : private Cell::State
+ {
+ public:
+ Reader(DB::ReadBuffer & in_)
+ : in(in_)
+ {
+ }
+
+ Reader(const Reader &) = delete;
+ Reader & operator=(const Reader &) = delete;
+
+ bool next()
+ {
+ if (!is_initialized)
+ {
+ Cell::State::read(in);
+ DB::readVarUInt(size, in);
+ is_initialized = true;
+ }
+
+ if (read_count == size)
+ {
+ is_eof = true;
+ return false;
+ }
+
+ cell.read(in);
+ ++read_count;
+
+ return true;
+ }
+
+ inline const value_type & get() const
+ {
+ if (!is_initialized || is_eof)
+ throw DB::Exception("No available data");
+
+ return cell.getValue();
+ }
+
+ private:
+ DB::ReadBuffer & in;
+ Cell cell;
+ size_t read_count = 0;
+ size_t size = 0;
+ bool is_eof = false;
+ bool is_initialized = false;
+ };
+#endif
+
+ class iterator : public iterator_base<iterator, false>
+ {
+ public:
+ using iterator_base<iterator, false>::iterator_base;
+ };
+
+ class const_iterator : public iterator_base<const_iterator, true>
+ {
+ public:
+ using iterator_base<const_iterator, true>::iterator_base;
+ };
+
+
+ const_iterator begin() const
+ {
+ if (!buf)
+ return end();
+
+ if (this->hasZero())
+ return iteratorToZero();
+
+ const Cell * ptr = buf;
+ auto buf_end = buf + grower.bufSize();
+ while (ptr < buf_end && ptr->isZero(*this))
+ ++ptr;
+
+ return const_iterator(this, ptr);
+ }
+
+ const_iterator cbegin() const { return begin(); }
+
+ iterator begin()
+ {
+ if (!buf)
+ return end();
+
+ if (this->hasZero())
+ return iteratorToZero();
+
+ Cell * ptr = buf;
+ auto buf_end = buf + grower.bufSize();
+ while (ptr < buf_end && ptr->isZero(*this))
+ ++ptr;
+
+ return iterator(this, ptr);
+ }
+
+ const_iterator end() const
+ {
+ /// Avoid UBSan warning about adding zero to nullptr. It is valid in C++20 (and earlier) but not valid in C.
+ return const_iterator(this, buf ? buf + grower.bufSize() : buf);
+ }
+
+ const_iterator cend() const
+ {
+ return end();
+ }
+
+ iterator end()
+ {
+ return iterator(this, buf ? buf + grower.bufSize() : buf);
+ }
+
+
+protected:
+ const_iterator iteratorTo(const Cell * ptr) const { return const_iterator(this, ptr); }
+ iterator iteratorTo(Cell * ptr) { return iterator(this, ptr); }
+ const_iterator iteratorToZero() const { return iteratorTo(this->zeroValue()); }
+ iterator iteratorToZero() { return iteratorTo(this->zeroValue()); }
+
+
+ /// If the key is zero, insert it into a special place and return true.
+ /// We don't have to persist a zero key, because it's not actually inserted.
+ /// That's why we just take a Key by value, and not a key holder.
+ bool ALWAYS_INLINE emplaceIfZero(const Key & x, LookupResult & it, bool & inserted, size_t hash_value)
+ {
+ /// If it is claimed that the zero key can not be inserted into the table.
+ if (!Cell::need_zero_value_storage)
+ return false;
+
+ if (Cell::isZero(x, *this))
+ {
+ it = this->zeroValue();
+
+ if (!this->hasZero())
+ {
+ ++m_size;
+ this->setHasZero();
+ this->zeroValue()->setHash(hash_value);
+ inserted = true;
+ }
+ else
+ inserted = false;
+
+ return true;
+ }
+
+ return false;
+ }
+
+ template <typename KeyHolder>
+ void ALWAYS_INLINE emplaceNonZeroImpl(size_t place_value, KeyHolder && key_holder,
+ LookupResult & it, bool & inserted, size_t hash_value)
+ {
+ it = &buf[place_value];
+
+ if (!buf[place_value].isZero(*this))
+ {
+ keyHolderDiscardKey(key_holder);
+ inserted = false;
+ return;
+ }
+
+ keyHolderPersistKey(key_holder);
+ const auto & key = keyHolderGetKey(key_holder);
+
+ new (&buf[place_value]) Cell(key, *this);
+ buf[place_value].setHash(hash_value);
+ inserted = true;
+ ++m_size;
+
+ if (unlikely(grower.overflow(m_size)))
+ {
+ try
+ {
+ resize();
+ }
+ catch (...)
+ {
+ /** If we have not resized successfully, then there will be problems:
+ * the key remains, but the mapped value is uninitialized,
+ * and its destructor, perhaps, cannot even be called.
+ */
+ --m_size;
+ buf[place_value].setZero();
+ inserted = false;
+ throw;
+ }
+
+ // The hash table was rehashed, so we have to re-find the key.
+ size_t new_place = findCell(key, hash_value, grower.place(hash_value));
+ assert(!buf[new_place].isZero(*this));
+ it = &buf[new_place];
+ }
+ }
+
+ /// Only for non-zero keys. Find the right place, insert the key there, if it does not already exist. Set iterator to the cell in output parameter.
+ template <typename KeyHolder>
+ void ALWAYS_INLINE emplaceNonZero(KeyHolder && key_holder, LookupResult & it,
+ bool & inserted, size_t hash_value)
+ {
+ const auto & key = keyHolderGetKey(key_holder);
+ size_t place_value = findCell(key, hash_value, grower.place(hash_value));
+ emplaceNonZeroImpl(place_value, key_holder, it, inserted, hash_value);
+ }
+
+
+public:
+ void reserve(size_t num_elements)
+ {
+ resize(num_elements);
+ }
+
+ /// Insert a value. In the case of any more complex values, it is better to use the `emplace` function.
+ std::pair<LookupResult, bool> ALWAYS_INLINE insert(const value_type & x)
+ {
+ std::pair<LookupResult, bool> res;
+
+ size_t hash_value = hash(Cell::getKey(x));
+ if (!emplaceIfZero(Cell::getKey(x), res.first, res.second, hash_value))
+ {
+ emplaceNonZero(Cell::getKey(x), res.first, res.second, hash_value);
+ }
+
+ if (res.second)
+ insertSetMapped(res.first->getMapped(), x);
+
+ return res;
+ }
+
+
+ /// Reinsert node pointed to by iterator
+ void ALWAYS_INLINE reinsert(iterator & it, size_t hash_value)
+ {
+ size_t place_value = reinsert(*it.getPtr(), hash_value);
+
+ if constexpr (Cell::need_to_notify_cell_during_move)
+ if (it.getPtr() != &buf[place_value])
+ Cell::move(it.getPtr(), &buf[place_value]);
+ }
+
+
+ /** Insert the key.
+ * Return values:
+ * 'it' -- a LookupResult pointing to the corresponding key/mapped pair.
+ * 'inserted' -- whether a new key was inserted.
+ *
+ * You have to make `placement new` of value if you inserted a new key,
+ * since when destroying a hash table, it will call the destructor!
+ *
+ * Example usage:
+ *
+ * Map::LookupResult it;
+ * bool inserted;
+ * map.emplace(key, it, inserted);
+ * if (inserted)
+ * new (&it->getMapped()) Mapped(value);
+ */
+ template <typename KeyHolder>
+ void ALWAYS_INLINE emplace(KeyHolder && key_holder, LookupResult & it, bool & inserted)
+ {
+ const auto & key = keyHolderGetKey(key_holder);
+ emplace(key_holder, it, inserted, hash(key));
+ }
+
+ template <typename KeyHolder>
+ void ALWAYS_INLINE emplace(KeyHolder && key_holder, LookupResult & it,
+ bool & inserted, size_t hash_value)
+ {
+ const auto & key = keyHolderGetKey(key_holder);
+ if (!emplaceIfZero(key, it, inserted, hash_value))
+ emplaceNonZero(key_holder, it, inserted, hash_value);
+ }
+
+ /// Copy the cell from another hash table. It is assumed that the cell is not zero, and also that there was no such key in the table yet.
+ void ALWAYS_INLINE insertUniqueNonZero(const Cell * cell, size_t hash_value)
+ {
+ size_t place_value = findEmptyCell(grower.place(hash_value));
+
+ memcpy(static_cast<void*>(&buf[place_value]), cell, sizeof(*cell));
+ ++m_size;
+
+ if (unlikely(grower.overflow(m_size)))
+ resize();
+ }
+
+ LookupResult ALWAYS_INLINE find(const Key & x)
+ {
+ if (Cell::isZero(x, *this))
+ return this->hasZero() ? this->zeroValue() : nullptr;
+
+ size_t hash_value = hash(x);
+ size_t place_value = findCell(x, hash_value, grower.place(hash_value));
+ return !buf[place_value].isZero(*this) ? &buf[place_value] : nullptr;
+ }
+
+ ConstLookupResult ALWAYS_INLINE find(const Key & x) const
+ {
+ return const_cast<std::decay_t<decltype(*this)> *>(this)->find(x);
+ }
+
+ LookupResult ALWAYS_INLINE find(const Key & x, size_t hash_value)
+ {
+ if (Cell::isZero(x, *this))
+ return this->hasZero() ? this->zeroValue() : nullptr;
+
+ size_t place_value = findCell(x, hash_value, grower.place(hash_value));
+ return !buf[place_value].isZero(*this) ? &buf[place_value] : nullptr;
+ }
+
+ ConstLookupResult ALWAYS_INLINE find(const Key & x, size_t hash_value) const
+ {
+ return const_cast<std::decay_t<decltype(*this)> *>(this)->find(x, hash_value);
+ }
+
+ std::enable_if_t<Grower::performs_linear_probing_with_single_step, bool>
+ ALWAYS_INLINE erase(const Key & x)
+ {
+ return erase(x, hash(x));
+ }
+
+ std::enable_if_t<Grower::performs_linear_probing_with_single_step, bool>
+ ALWAYS_INLINE erase(const Key & x, size_t hash_value)
+ {
+ /** Deletion from open addressing hash table without tombstones
+ *
+ * https://en.wikipedia.org/wiki/Linear_probing
+ * https://en.wikipedia.org/wiki/Open_addressing
+ * An algorithm that avoids recomputing the hash keeps the probe distance (the difference between the natural cell position and the actual one)
+ * in the cell: https://arxiv.org/ftp/arxiv/papers/0909/0909.2547.pdf
+ *
+ * Currently we use the variant that recomputes the hash on each step, as described at https://en.wikipedia.org/wiki/Open_addressing
+ */
+
+ if (Cell::isZero(x, *this))
+ {
+ if (this->hasZero())
+ {
+ --m_size;
+ this->clearHasZero();
+ return true;
+ }
+ else
+ {
+ return false;
+ }
+ }
+
+ size_t erased_key_position = findCell(x, hash_value, grower.place(hash_value));
+
+ /// Key is not found
+ if (buf[erased_key_position].isZero(*this))
+ return false;
+
+ /// We need to guarantee loop termination, because there is at least one empty position
+ assert(m_size < grower.bufSize());
+
+ size_t next_position = erased_key_position;
+
+ /**
+ * Deleting an element can break the search for one of the following elements,
+ * because the slot at erased_key_position becomes empty. So we check next_element.
+ * Consider the range (erased_key_position, next_element]: if the optimal_position
+ * of next_element falls into it, then removing erased_key_position
+ * does not break the search for next_element.
+ * If the optimal_position of the element does not fall into the range (erased_key_position, next_element],
+ * then deleting erased_key_position breaks the search for it, so we need to move next_element
+ * to erased_key_position. Now the slot at next_element is empty, and we apply the same
+ * procedure to it.
+ * If an empty element is encountered, there are no more elements whose search we could
+ * break, so we can exit.
+ */
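+ /** Illustrative sketch, assuming a grower with single-step linear probing:
+ * keys A and B both hash to cell 2, so the buffer looks like [ . . A B . . ]
+ * with A at its optimal position 2 and B shifted to 3.
+ * Erasing A frees cell 2. B's optimal position 2 does not lie in (2, 3],
+ * so leaving B at 3 would break find(B); therefore B is moved into cell 2,
+ * the freed slot becomes 3, and the same check is repeated until an empty cell
+ * terminates the chain.
+ */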
+
+ /// Walk to the right through collision resolution chain and move elements to better positions
+ while (true)
+ {
+ next_position = grower.next(next_position);
+
+ /// If there's no more elements in the chain
+ if (buf[next_position].isZero(*this))
+ break;
+
+ /// The optimal position of the element in the cell at next_position
+ size_t optimal_position = grower.place(buf[next_position].getHash(*this));
+
+ /// If position of this element is already optimal - proceed to the next element.
+ if (optimal_position == next_position)
+ continue;
+
+ /// Cannot move this element because optimal position is after the freed place
+ /// The second condition is tricky - if the chain was overlapped before erased_key_position,
+ /// and the optimal position is actually before in collision resolution chain:
+ ///
+ /// [*xn***----------------***]
+ /// ^^-next elem ^
+ /// | |
+ /// erased elem the optimal position of the next elem
+ ///
+ /// so, the next elem should be moved to position of erased elem
+
+ /// The case of non overlapping part of chain
+ if (next_position > erased_key_position
+ && (optimal_position > erased_key_position) && (optimal_position < next_position))
+ {
+ continue;
+ }
+
+ /// The case of overlapping chain
+ if (next_position < erased_key_position
+ /// Cannot move this element because optimal position is after the freed place
+ && ((optimal_position > erased_key_position) || (optimal_position < next_position)))
+ {
+ continue;
+ }
+
+ /// Move the element to the freed place
+ memcpy(static_cast<void *>(&buf[erased_key_position]), static_cast<void *>(&buf[next_position]), sizeof(Cell));
+
+ if constexpr (Cell::need_to_notify_cell_during_move)
+ Cell::move(&buf[next_position], &buf[erased_key_position]);
+
+ /// Now we have another freed place
+ erased_key_position = next_position;
+ }
+
+ buf[erased_key_position].setZero();
+ --m_size;
+
+ return true;
+ }
+
+ bool ALWAYS_INLINE has(const Key & x) const
+ {
+ if (Cell::isZero(x, *this))
+ return this->hasZero();
+
+ size_t hash_value = hash(x);
+ size_t place_value = findCell(x, hash_value, grower.place(hash_value));
+ return !buf[place_value].isZero(*this);
+ }
+
+
+ bool ALWAYS_INLINE has(const Key & x, size_t hash_value) const
+ {
+ if (Cell::isZero(x, *this))
+ return this->hasZero();
+
+ size_t place_value = findCell(x, hash_value, grower.place(hash_value));
+ return !buf[place_value].isZero(*this);
+ }
+
+#if 0
+ void write(DB::WriteBuffer & wb) const
+ {
+ Cell::State::write(wb);
+ DB::writeVarUInt(m_size, wb);
+
+ if (this->hasZero())
+ this->zeroValue()->write(wb);
+
+ if (!buf)
+ return;
+
+ for (auto ptr = buf, buf_end = buf + grower.bufSize(); ptr < buf_end; ++ptr)
+ if (!ptr->isZero(*this))
+ ptr->write(wb);
+ }
+
+ void writeText(DB::WriteBuffer & wb) const
+ {
+ Cell::State::writeText(wb);
+ DB::writeText(m_size, wb);
+
+ if (this->hasZero())
+ {
+ DB::writeChar(',', wb);
+ this->zeroValue()->writeText(wb);
+ }
+
+ if (!buf)
+ return;
+
+ for (auto ptr = buf, buf_end = buf + grower.bufSize(); ptr < buf_end; ++ptr)
+ {
+ if (!ptr->isZero(*this))
+ {
+ DB::writeChar(',', wb);
+ ptr->writeText(wb);
+ }
+ }
+ }
+
+ void read(DB::ReadBuffer & rb)
+ {
+ Cell::State::read(rb);
+
+ destroyElements();
+ this->clearHasZero();
+ m_size = 0;
+
+ size_t new_size = 0;
+ DB::readVarUInt(new_size, rb);
+
+ free();
+ Grower new_grower = grower;
+ new_grower.set(new_size);
+ alloc(new_grower);
+
+ for (size_t i = 0; i < new_size; ++i)
+ {
+ Cell x;
+ x.read(rb);
+ insert(x.getValue());
+ }
+ }
+
+ void readText(DB::ReadBuffer & rb)
+ {
+ Cell::State::readText(rb);
+
+ destroyElements();
+ this->clearHasZero();
+ m_size = 0;
+
+ size_t new_size = 0;
+ DB::readText(new_size, rb);
+
+ free();
+ Grower new_grower = grower;
+ new_grower.set(new_size);
+ alloc(new_grower);
+
+ for (size_t i = 0; i < new_size; ++i)
+ {
+ Cell x;
+ DB::assertChar(',', rb);
+ x.readText(rb);
+ insert(x.getValue());
+ }
+ }
+#endif
+
+ size_t size() const
+ {
+ return m_size;
+ }
+
+ bool empty() const
+ {
+ return 0 == m_size;
+ }
+
+ void clear()
+ {
+ destroyElements();
+ this->clearHasZero();
+ m_size = 0;
+
+ memset(static_cast<void*>(buf), 0, grower.bufSize() * sizeof(*buf));
+ }
+
+ /// After executing this function, the table can only be destroyed,
+ /// and also you can use the methods `size`, `empty`, `begin`, `end`.
+ void clearAndShrink()
+ {
+ destroyElements();
+ this->clearHasZero();
+ m_size = 0;
+ free();
+ }
+
+ size_t getBufferSizeInBytes() const
+ {
+ return grower.bufSize() * sizeof(Cell);
+ }
+
+ size_t getBufferSizeInCells() const
+ {
+ return grower.bufSize();
+ }
+
+ /// Return the offset of the result in the internal buffer.
+ /// The result can have a value up to `getBufferSizeInCells() + 1`,
+ /// because the offset for the zero value is considered to be 0,
+ /// and for other values it is `offset in buffer + 1`.
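+ /// For example, the zero value maps to offset 0 and a result pointing at buf[5]
+ /// maps to offset 6, so there are `getBufferSizeInCells() + 1` possible offsets in total.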
+ size_t offsetInternal(ConstLookupResult ptr) const
+ {
+ if (ptr->isZero(*this))
+ return 0;
+ return ptr - buf + 1;
+ }
+
+#ifdef DBMS_HASH_MAP_COUNT_COLLISIONS
+ size_t getCollisions() const
+ {
+ return collisions;
+ }
+#endif
+};
+
+}
diff --git a/ydb/library/arrow_clickhouse/Common/HashTable/HashTableAllocator.h b/ydb/library/arrow_clickhouse/Common/HashTable/HashTableAllocator.h
new file mode 100644
index 00000000000..6522a5e27aa
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/Common/HashTable/HashTableAllocator.h
@@ -0,0 +1,22 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#pragma once
+
+#include <Common/Allocator.h>
+
+namespace CH
+{
+
+/**
+ * We are going to use the entire memory we allocated when resizing a hash
+ * table, so it makes sense to pre-fault the pages so that page faults don't
+ * interrupt the resize loop. Set the allocator parameter accordingly.
+ */
+using HashTableAllocator = Allocator<true /* clear_memory */, true /* mmap_populate */>;
+
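+/// Note: HashTableAllocatorWithStackMemory<N> serves small allocations (up to N bytes)
+/// from memory embedded in the object itself and falls back to HashTableAllocator beyond
+/// that; HashTable has a static_assert that this inline size matches its initial buffer size.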
+template <size_t initial_bytes = 64>
+using HashTableAllocatorWithStackMemory = AllocatorWithStackMemory<HashTableAllocator, initial_bytes>;
+
+}
diff --git a/ydb/library/arrow_clickhouse/Common/HashTable/HashTableKeyHolder.h b/ydb/library/arrow_clickhouse/Common/HashTable/HashTableKeyHolder.h
new file mode 100644
index 00000000000..e55400103da
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/Common/HashTable/HashTableKeyHolder.h
@@ -0,0 +1,127 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#pragma once
+
+#include <Common/Arena.h>
+
+/**
+ * In some aggregation scenarios, when adding a key to the hash table, we
+ * start with a temporary key object, and if it turns out to be a new key,
+ * we must make it persistent (e.g. copy to an Arena) and use the resulting
+ * persistent object as hash table key. This happens only for StringRef keys,
+ * because other key types are stored by value, but StringRef is a pointer-like
+ * type: the actual data are stored elsewhere. Even for StringRef, we don't
+ * make a persistent copy of the key in each of the following cases:
+ * 1) the aggregation method doesn't use temporary keys, so they're persistent
+ * from the start;
+ * 2) the key is already present in the hash table;
+ * 3) that particular key is stored by value, e.g. a short StringRef key in
+ * StringHashMap.
+ *
+ * In the past, the caller was responsible for making the key persistent after
+ * it was inserted. emplace() returned whether the key is new or not, so the
+ * caller only stored new keys (this is case (2) from the above list). However,
+ * now we are adding a compound hash table for StringRef keys, so case (3)
+ * appears. The decision about persistence now depends on some properties of
+ * the key, and the logic of this decision is tied to the particular hash table
+ * implementation. This means that the hash table user now doesn't have enough
+ * data and logic to make this decision by itself.
+ *
+ * To support these new requirements, we now manage key persistence by passing
+ * a special key holder to emplace(), which has the functions to make the key
+ * persistent or to discard it. emplace() then calls these functions at the
+ * appropriate moments.
+ *
+ * This approach has the following benefits:
+ * - no extra runtime branches in the caller to make the key persistent.
+ * - no additional data is stored in the hash table itself, which is important
+ * when it's used in aggregate function states.
+ * - no overhead when the key memory management isn't needed: we just pass the
+ * bare key without any wrapper to emplace(), and the default callbacks do
+ * nothing.
+ *
+ * This file defines the default key persistence functions, as well as two
+ * different key holders and corresponding functions for storing StringRef
+ * keys to Arena.
+ */
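+/**
+ * A minimal usage sketch, assuming a map with StringRef keys, an Arena `pool`
+ * and a temporary key `ref` (the names Map, Mapped, pool, ref and value are placeholders):
+ *
+ * Map::LookupResult it;
+ * bool inserted;
+ * map.emplace(ArenaKeyHolder{ref, pool}, it, inserted);
+ * if (inserted)
+ * new (&it->getMapped()) Mapped(value); /// the key was copied into the arena by keyHolderPersistKey()
+ * /// otherwise keyHolderDiscardKey() was called and the temporary key can be dropped
+ */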
+
+namespace CH
+{
+
+/**
+ * Returns the key. Can return the temporary key initially.
+ * After the call to keyHolderPersistKey(), must return the persistent key.
+ */
+template <typename Key>
+inline Key & ALWAYS_INLINE keyHolderGetKey(Key && key) { return key; }
+
+/**
+ * Make the key persistent. keyHolderGetKey() must return the persistent key
+ * after this call.
+ */
+template <typename Key>
+inline void ALWAYS_INLINE keyHolderPersistKey(Key &&) {}
+
+/**
+ * Discard the key. Calling keyHolderGetKey() is ill-defined after this.
+ */
+template <typename Key>
+inline void ALWAYS_INLINE keyHolderDiscardKey(Key &&) {}
+
+/**
+ * ArenaKeyHolder is a key holder for hash tables that serializes a StringRef
+ * key to an Arena.
+ */
+struct ArenaKeyHolder
+{
+ StringRef key;
+ Arena & pool;
+
+};
+
+inline StringRef & ALWAYS_INLINE keyHolderGetKey(CH::ArenaKeyHolder & holder)
+{
+ return holder.key;
+}
+
+inline void ALWAYS_INLINE keyHolderPersistKey(CH::ArenaKeyHolder & holder)
+{
+ // Hash table shouldn't ask us to persist a zero key
+ assert(holder.key.size > 0);
+ holder.key.data = holder.pool.insert(holder.key.data, holder.key.size);
+}
+
+inline void ALWAYS_INLINE keyHolderDiscardKey(CH::ArenaKeyHolder &)
+{
+}
+
+/** SerializedKeyHolder is a key holder for a StringRef key that is already
+ * serialized to an Arena. The key must be the last allocation in this Arena,
+ * and is discarded by rolling back the allocation.
+ */
+struct SerializedKeyHolder
+{
+ StringRef key;
+ Arena & pool;
+};
+
+inline StringRef & ALWAYS_INLINE keyHolderGetKey(CH::SerializedKeyHolder & holder)
+{
+ return holder.key;
+}
+
+inline void ALWAYS_INLINE keyHolderPersistKey(CH::SerializedKeyHolder &)
+{
+}
+
+inline void ALWAYS_INLINE keyHolderDiscardKey(CH::SerializedKeyHolder & holder)
+{
+ [[maybe_unused]] void * new_head = holder.pool.rollback(holder.key.size);
+ assert(new_head == holder.key.data);
+ holder.key.data = nullptr;
+ holder.key.size = 0;
+}
+
+}
diff --git a/ydb/library/arrow_clickhouse/Common/HashTable/StringHashMap.h b/ydb/library/arrow_clickhouse/Common/HashTable/StringHashMap.h
new file mode 100644
index 00000000000..bb6e0994c3b
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/Common/HashTable/StringHashMap.h
@@ -0,0 +1,198 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#pragma once
+
+#include <Common/HashTable/HashMap.h>
+#include <Common/HashTable/HashTableAllocator.h>
+#include <Common/HashTable/StringHashTable.h>
+
+namespace CH
+{
+
+template <typename Key, typename TMapped>
+struct StringHashMapCell : public HashMapCell<Key, TMapped, StringHashTableHash, HashTableNoState>
+{
+ using Base = HashMapCell<Key, TMapped, StringHashTableHash, HashTableNoState>;
+ using value_type = typename Base::value_type;
+ using Base::Base;
+ static constexpr bool need_zero_value_storage = false;
+ // external
+ const StringRef getKey() const { return toStringRef(this->value.first); }
+ // internal
+ static const Key & getKey(const value_type & value_) { return value_.first; }
+};
+
+template <typename TMapped>
+struct StringHashMapCell<StringKey16, TMapped> : public HashMapCell<StringKey16, TMapped, StringHashTableHash, HashTableNoState>
+{
+ using Base = HashMapCell<StringKey16, TMapped, StringHashTableHash, HashTableNoState>;
+ using value_type = typename Base::value_type;
+ using Base::Base;
+ static constexpr bool need_zero_value_storage = false;
+ bool isZero(const HashTableNoState & state) const { return isZero(this->value.first, state); }
+
+ // Zero means an unoccupied cell in the hash table. Use a key whose last word is 0
+ // as the zero key, because such keys are unrepresentable (there is no way to encode the length).
+ static bool isZero(const StringKey16 & key, const HashTableNoState &) { return key.items[1] == 0; }
+ void setZero() { this->value.first.items[1] = 0; }
+
+ // external
+ const StringRef getKey() const { return toStringRef(this->value.first); }
+ // internal
+ static const StringKey16 & getKey(const value_type & value_) { return value_.first; }
+};
+
+template <typename TMapped>
+struct StringHashMapCell<StringKey24, TMapped> : public HashMapCell<StringKey24, TMapped, StringHashTableHash, HashTableNoState>
+{
+ using Base = HashMapCell<StringKey24, TMapped, StringHashTableHash, HashTableNoState>;
+ using value_type = typename Base::value_type;
+ using Base::Base;
+ static constexpr bool need_zero_value_storage = false;
+ bool isZero(const HashTableNoState & state) const { return isZero(this->value.first, state); }
+
+ // Zero means an unoccupied cell in the hash table. Use a key whose last word is 0
+ // as the zero key, because such keys are unrepresentable (there is no way to encode the length).
+ static bool isZero(const StringKey24 & key, const HashTableNoState &)
+ { return key.c == 0; }
+ void setZero() { this->value.first.c = 0; }
+
+ // external
+ const StringRef getKey() const { return toStringRef(this->value.first); }
+ // internal
+ static const StringKey24 & getKey(const value_type & value_) { return value_.first; }
+};
+
+template <typename TMapped>
+struct StringHashMapCell<StringRef, TMapped> : public HashMapCellWithSavedHash<StringRef, TMapped, StringHashTableHash, HashTableNoState>
+{
+ using Base = HashMapCellWithSavedHash<StringRef, TMapped, StringHashTableHash, HashTableNoState>;
+ using value_type = typename Base::value_type;
+ using Base::Base;
+ static constexpr bool need_zero_value_storage = false;
+ // external
+ using Base::getKey;
+ // internal
+ static const StringRef & getKey(const value_type & value_) { return value_.first; }
+};
+
+template <typename TMapped, typename Allocator>
+struct StringHashMapSubMaps
+{
+ using T0 = StringHashTableEmpty<StringHashMapCell<StringRef, TMapped>>;
+ using T1 = HashMapTable<StringKey8, StringHashMapCell<StringKey8, TMapped>, StringHashTableHash, StringHashTableGrower<>, Allocator>;
+ using T2 = HashMapTable<StringKey16, StringHashMapCell<StringKey16, TMapped>, StringHashTableHash, StringHashTableGrower<>, Allocator>;
+ using T3 = HashMapTable<StringKey24, StringHashMapCell<StringKey24, TMapped>, StringHashTableHash, StringHashTableGrower<>, Allocator>;
+ using Ts = HashMapTable<StringRef, StringHashMapCell<StringRef, TMapped>, StringHashTableHash, StringHashTableGrower<>, Allocator>;
+};
+
+template <typename TMapped, typename Allocator = HashTableAllocator>
+class StringHashMap : public StringHashTable<StringHashMapSubMaps<TMapped, Allocator>>
+{
+public:
+ using Key = StringRef;
+ using Base = StringHashTable<StringHashMapSubMaps<TMapped, Allocator>>;
+ using Self = StringHashMap;
+ using LookupResult = typename Base::LookupResult;
+
+ using Base::Base;
+
+ /// Merge every cell's value of the current map into the destination map.
+ /// Func should have the signature void(Mapped & dst, Mapped & src, bool emplaced).
+ /// Each filled cell of the current map invokes func once. If the destination map doesn't
+ /// have a key equal to the given cell, a new cell gets emplaced into that map,
+ /// and func is invoked with the third argument emplaced set to true. Otherwise
+ /// emplaced is set to false.
+ template <typename Func>
+ void ALWAYS_INLINE mergeToViaEmplace(Self & that, Func && func)
+ {
+ if (this->m0.hasZero() && that.m0.hasZero())
+ func(that.m0.zeroValue()->getMapped(), this->m0.zeroValue()->getMapped(), false);
+ else if (this->m0.hasZero())
+ {
+ that.m0.setHasZero();
+ func(that.m0.zeroValue()->getMapped(), this->m0.zeroValue()->getMapped(), true);
+ }
+ this->m1.mergeToViaEmplace(that.m1, func);
+ this->m2.mergeToViaEmplace(that.m2, func);
+ this->m3.mergeToViaEmplace(that.m3, func);
+ this->ms.mergeToViaEmplace(that.ms, func);
+ }
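+ /// For example, to merge per-key sums from this map into `that`
+ /// (assuming TMapped is an arithmetic type, so plain assignment initializes it):
+ ///
+ /// this->mergeToViaEmplace(that, [](auto & dst, auto & src, bool emplaced)
+ /// {
+ /// if (emplaced)
+ /// dst = src; /// dst was just emplaced and is not initialized yet
+ /// else
+ /// dst += src;
+ /// });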
+
+ /// Merge every cell's value of the current map into the destination map via find.
+ /// Func should have the signature void(Mapped & dst, Mapped & src, bool exist).
+ /// Each filled cell of the current map invokes func once. If the destination map doesn't
+ /// have a key equal to the given cell, func is invoked with the third argument
+ /// exist set to false. Otherwise exist is set to true.
+ template <typename Func>
+ void ALWAYS_INLINE mergeToViaFind(Self & that, Func && func)
+ {
+ if (this->m0.size() && that.m0.size())
+ func(that.m0.zeroValue()->getMapped(), this->m0.zeroValue()->getMapped(), true);
+ else if (this->m0.size())
+ func(this->m0.zeroValue()->getMapped(), this->m0.zeroValue()->getMapped(), false);
+ this->m1.mergeToViaFind(that.m1, func);
+ this->m2.mergeToViaFind(that.m2, func);
+ this->m3.mergeToViaFind(that.m3, func);
+ this->ms.mergeToViaFind(that.ms, func);
+ }
+
+ TMapped & ALWAYS_INLINE operator[](const Key & x)
+ {
+ LookupResult it;
+ bool inserted;
+ this->emplace(x, it, inserted);
+ if (inserted)
+ new (&it->getMapped()) TMapped();
+
+ return it->getMapped();
+ }
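+ /// A short usage sketch (the key name is a placeholder). Per the note on dispatch(),
+ /// the key data must be readable in 8-byte chunks, e.g. taken from ColumnString::getDataAt(),
+ /// not from a bare std::string:
+ ///
+ /// StringHashMap<UInt64> counts;
+ /// counts[padded_key] += 1; /// inserts a zero-initialized counter on first access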
+
+ template <typename Func>
+ void ALWAYS_INLINE forEachValue(Func && func)
+ {
+ if (this->m0.size())
+ {
+ func(StringRef{}, this->m0.zeroValue()->getMapped());
+ }
+
+ for (auto & v : this->m1)
+ {
+ func(v.getKey(), v.getMapped());
+ }
+
+ for (auto & v : this->m2)
+ {
+ func(v.getKey(), v.getMapped());
+ }
+
+ for (auto & v : this->m3)
+ {
+ func(v.getKey(), v.getMapped());
+ }
+
+ for (auto & v : this->ms)
+ {
+ func(v.getKey(), v.getMapped());
+ }
+ }
+
+ template <typename Func>
+ void ALWAYS_INLINE forEachMapped(Func && func)
+ {
+ if (this->m0.size())
+ func(this->m0.zeroValue()->getMapped());
+ for (auto & v : this->m1)
+ func(v.getMapped());
+ for (auto & v : this->m2)
+ func(v.getMapped());
+ for (auto & v : this->m3)
+ func(v.getMapped());
+ for (auto & v : this->ms)
+ func(v.getMapped());
+ }
+};
+
+}
diff --git a/ydb/library/arrow_clickhouse/Common/HashTable/StringHashTable.h b/ydb/library/arrow_clickhouse/Common/HashTable/StringHashTable.h
new file mode 100644
index 00000000000..da817544822
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/Common/HashTable/StringHashTable.h
@@ -0,0 +1,444 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#pragma once
+
+#include <Common/HashTable/HashMap.h>
+#include <Common/HashTable/HashTable.h>
+
+#include <new>
+#include <variant>
+
+namespace CH
+{
+
+using StringKey8 = UInt64;
+using StringKey16 = UInt128;
+struct StringKey24
+{
+ UInt64 a;
+ UInt64 b;
+ UInt64 c;
+
+ bool operator==(const StringKey24 rhs) const { return a == rhs.a && b == rhs.b && c == rhs.c; }
+};
+
+inline StringRef ALWAYS_INLINE toStringRef(const StringKey8 & n)
+{
+ assert(n != 0);
+ return {reinterpret_cast<const char *>(&n), 8ul - (__builtin_clzll(n) >> 3)};
+}
+inline StringRef ALWAYS_INLINE toStringRef(const StringKey16 & n)
+{
+ assert(n.items[1] != 0);
+ return {reinterpret_cast<const char *>(&n), 16ul - (__builtin_clzll(n.items[1]) >> 3)};
+}
+inline StringRef ALWAYS_INLINE toStringRef(const StringKey24 & n)
+{
+ assert(n.c != 0);
+ return {reinterpret_cast<const char *>(&n), 24ul - (__builtin_clzll(n.c) >> 3)};
+}
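+/// Short keys are packed into these fixed-size integers with the unused high bytes zeroed,
+/// so the length is recovered from the count of leading zero bytes of the last word.
+/// For example, a 3-byte key stored in a StringKey8 on a little-endian machine has
+/// 5 zero high bytes, and toStringRef() returns 8 - (__builtin_clzll(n) >> 3) == 3 as the size.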
+
+struct StringHashTableHash
+{
+#if defined(__SSE4_2__)
+ size_t ALWAYS_INLINE operator()(StringKey8 key) const
+ {
+ size_t res = -1ULL;
+ res = _mm_crc32_u64(res, key);
+ return res;
+ }
+ size_t ALWAYS_INLINE operator()(StringKey16 key) const
+ {
+ size_t res = -1ULL;
+ res = _mm_crc32_u64(res, key.items[0]);
+ res = _mm_crc32_u64(res, key.items[1]);
+ return res;
+ }
+ size_t ALWAYS_INLINE operator()(StringKey24 key) const
+ {
+ size_t res = -1ULL;
+ res = _mm_crc32_u64(res, key.a);
+ res = _mm_crc32_u64(res, key.b);
+ res = _mm_crc32_u64(res, key.c);
+ return res;
+ }
+#else
+ size_t ALWAYS_INLINE operator()(StringKey8 key) const
+ {
+ return CityHash_v1_0_2::CityHash64(reinterpret_cast<const char *>(&key), 8);
+ }
+ size_t ALWAYS_INLINE operator()(StringKey16 key) const
+ {
+ return CityHash_v1_0_2::CityHash64(reinterpret_cast<const char *>(&key), 16);
+ }
+ size_t ALWAYS_INLINE operator()(StringKey24 key) const
+ {
+ return CityHash_v1_0_2::CityHash64(reinterpret_cast<const char *>(&key), 24);
+ }
+#endif
+ size_t ALWAYS_INLINE operator()(StringRef key) const
+ {
+ return StringRefHash()(key);
+ }
+};
+
+template <typename Cell>
+struct StringHashTableEmpty //-V730
+{
+ using Self = StringHashTableEmpty;
+
+ bool has_zero = false;
+ std::aligned_storage_t<sizeof(Cell), alignof(Cell)> zero_value_storage; /// Storage of element with zero key.
+
+public:
+ bool hasZero() const { return has_zero; }
+
+ void setHasZero()
+ {
+ has_zero = true;
+ new (zeroValue()) Cell();
+ }
+
+ void setHasZero(const Cell & other)
+ {
+ has_zero = true;
+ new (zeroValue()) Cell(other);
+ }
+
+ void clearHasZero()
+ {
+ has_zero = false;
+ if (!std::is_trivially_destructible_v<Cell>)
+ zeroValue()->~Cell();
+ }
+
+ Cell * zeroValue() { return std::launder(reinterpret_cast<Cell *>(&zero_value_storage)); }
+ const Cell * zeroValue() const { return std::launder(reinterpret_cast<const Cell *>(&zero_value_storage)); }
+
+ using LookupResult = Cell *;
+ using ConstLookupResult = const Cell *;
+
+ template <typename KeyHolder>
+ void ALWAYS_INLINE emplace(KeyHolder &&, LookupResult & it, bool & inserted, size_t = 0)
+ {
+ if (!hasZero())
+ {
+ setHasZero();
+ inserted = true;
+ }
+ else
+ inserted = false;
+ it = zeroValue();
+ }
+
+ template <typename Key>
+ LookupResult ALWAYS_INLINE find(const Key &, size_t = 0)
+ {
+ return hasZero() ? zeroValue() : nullptr;
+ }
+
+ template <typename Key>
+ ConstLookupResult ALWAYS_INLINE find(const Key &, size_t = 0) const
+ {
+ return hasZero() ? zeroValue() : nullptr;
+ }
+#if 0
+ void write(DB::WriteBuffer & wb) const { zeroValue()->write(wb); }
+ void writeText(DB::WriteBuffer & wb) const { zeroValue()->writeText(wb); }
+ void read(DB::ReadBuffer & rb) { zeroValue()->read(rb); }
+ void readText(DB::ReadBuffer & rb) { zeroValue()->readText(rb); }
+#endif
+ size_t size() const { return hasZero() ? 1 : 0; }
+ bool empty() const { return !hasZero(); }
+ size_t getBufferSizeInBytes() const { return sizeof(Cell); }
+ size_t getCollisions() const { return 0; }
+};
+
+template <size_t initial_size_degree = 8>
+struct StringHashTableGrower : public HashTableGrower<initial_size_degree>
+{
+ // Smooth growing for string maps
+ void increaseSize() { this->size_degree += 1; }
+};
+
+template <typename Mapped>
+struct StringHashTableLookupResult
+{
+ Mapped * mapped_ptr;
+ StringHashTableLookupResult() {}
+ StringHashTableLookupResult(Mapped * mapped_ptr_) : mapped_ptr(mapped_ptr_) {}
+ StringHashTableLookupResult(std::nullptr_t) {}
+ const VoidKey getKey() const { return {}; }
+ auto & getMapped() { return *mapped_ptr; }
+ auto & operator*() { return *this; }
+ auto & operator*() const { return *this; }
+ auto * operator->() { return this; }
+ auto * operator->() const { return this; }
+ operator bool() const { return mapped_ptr; }
+ friend bool operator==(const StringHashTableLookupResult & a, const std::nullptr_t &) { return !a.mapped_ptr; }
+ friend bool operator==(const std::nullptr_t &, const StringHashTableLookupResult & b) { return !b.mapped_ptr; }
+ friend bool operator!=(const StringHashTableLookupResult & a, const std::nullptr_t &) { return a.mapped_ptr; }
+ friend bool operator!=(const std::nullptr_t &, const StringHashTableLookupResult & b) { return b.mapped_ptr; }
+};
+
+template <typename SubMaps>
+class StringHashTable //: private boost::noncopyable
+{
+protected:
+ static constexpr size_t NUM_MAPS = 5;
+ // Map for storing empty string
+ using T0 = typename SubMaps::T0;
+
+ // Short strings are stored as numbers
+ using T1 = typename SubMaps::T1;
+ using T2 = typename SubMaps::T2;
+ using T3 = typename SubMaps::T3;
+
+ // Long strings are stored as StringRef along with saved hash
+ using Ts = typename SubMaps::Ts;
+ using Self = StringHashTable;
+
+ template <typename, typename, size_t>
+ friend class TwoLevelStringHashTable;
+
+ T0 m0;
+ T1 m1;
+ T2 m2;
+ T3 m3;
+ Ts ms;
+
+public:
+ using Key = StringRef;
+ using key_type = Key;
+ using mapped_type = typename Ts::mapped_type;
+ using value_type = typename Ts::value_type;
+ using cell_type = typename Ts::cell_type;
+
+ using LookupResult = StringHashTableLookupResult<typename cell_type::mapped_type>;
+ using ConstLookupResult = StringHashTableLookupResult<const typename cell_type::mapped_type>;
+
+ StringHashTable() = default;
+
+ StringHashTable(size_t reserve_for_num_elements)
+ : m1{reserve_for_num_elements / 4}
+ , m2{reserve_for_num_elements / 4}
+ , m3{reserve_for_num_elements / 4}
+ , ms{reserve_for_num_elements / 4}
+ {
+ }
+
+ StringHashTable(StringHashTable && rhs)
+ : m1(std::move(rhs.m1))
+ , m2(std::move(rhs.m2))
+ , m3(std::move(rhs.m3))
+ , ms(std::move(rhs.ms))
+ {
+ }
+
+ ~StringHashTable() = default;
+
+public:
+ // Dispatch is written in a way that maximizes performance:
+ // 1. Always memcpy in 8-byte chunks
+ // 2. Use the switch-case extension to generate a fast dispatching table
+ // 3. Funcs are named callables that can be force-inlined
+ //
+ // NOTE: It relies on little-endianness.
+ //
+ // NOTE: It requires keys padded to 8 bytes (IOW you cannot pass
+ // std::string here, but you can pass e.g. ColumnString::getDataAt()),
+ // since it copies 8 bytes at a time.
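+ // Worked example: for a 3-byte key, sz == 3 and s == (-3 & 7) * 8 == 40 pending bits.
+ // The first-half-page branch reads 8 bytes starting at p and clears the upper
+ // 5 bytes with `n[0] &= -1ul >> 40`; the other branch reads the 8 bytes ending
+ // at p + sz and shifts them right by 40 bits. Both produce the same StringKey8.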
+ template <typename Self, typename KeyHolder, typename Func>
+ static auto ALWAYS_INLINE dispatch(Self & self, KeyHolder && key_holder, Func && func)
+ {
+ StringHashTableHash hash;
+ const StringRef & x = keyHolderGetKey(key_holder);
+ const size_t sz = x.size;
+ if (sz == 0)
+ {
+ keyHolderDiscardKey(key_holder);
+ return func(self.m0, VoidKey{}, 0);
+ }
+
+ if (x.data[sz - 1] == 0)
+ {
+ // Strings with trailing zeros are not representable as fixed-size
+ // string keys. Put them into the generic table.
+ return func(self.ms, std::forward<KeyHolder>(key_holder), hash(x));
+ }
+
+ const char * p = x.data;
+ // pending bits that need to be shifted out
+ const char s = (-sz & 7) * 8;
+ union
+ {
+ StringKey8 k8;
+ StringKey16 k16;
+ StringKey24 k24;
+ UInt64 n[3];
+ };
+ switch ((sz - 1) >> 3)
+ {
+ case 0: // 1..8 bytes
+ {
+ // first half page
+ if ((reinterpret_cast<uintptr_t>(p) & 2048) == 0)
+ {
+ memcpy(&n[0], p, 8);
+ n[0] &= -1ul >> s;
+ }
+ else
+ {
+ const char * lp = x.data + x.size - 8;
+ memcpy(&n[0], lp, 8);
+ n[0] >>= s;
+ }
+ keyHolderDiscardKey(key_holder);
+ return func(self.m1, k8, hash(k8));
+ }
+ case 1: // 9..16 bytes
+ {
+ memcpy(&n[0], p, 8);
+ const char * lp = x.data + x.size - 8;
+ memcpy(&n[1], lp, 8);
+ n[1] >>= s;
+ keyHolderDiscardKey(key_holder);
+ return func(self.m2, k16, hash(k16));
+ }
+ case 2: // 17..24 bytes
+ {
+ memcpy(&n[0], p, 16);
+ const char * lp = x.data + x.size - 8;
+ memcpy(&n[2], lp, 8);
+ n[2] >>= s;
+ keyHolderDiscardKey(key_holder);
+ return func(self.m3, k24, hash(k24));
+ }
+ default: // >= 25 bytes
+ {
+ return func(self.ms, std::forward<KeyHolder>(key_holder), hash(x));
+ }
+ }
+ }
+
+ struct EmplaceCallable
+ {
+ LookupResult & mapped;
+ bool & inserted;
+
+ EmplaceCallable(LookupResult & mapped_, bool & inserted_)
+ : mapped(mapped_), inserted(inserted_) {}
+
+ template <typename Map, typename KeyHolder>
+ void ALWAYS_INLINE operator()(Map & map, KeyHolder && key_holder, size_t hash)
+ {
+ typename Map::LookupResult result;
+ map.emplace(key_holder, result, inserted, hash);
+ mapped = &result->getMapped();
+ }
+ };
+
+ template <typename KeyHolder>
+ void ALWAYS_INLINE emplace(KeyHolder && key_holder, LookupResult & it, bool & inserted)
+ {
+ this->dispatch(*this, key_holder, EmplaceCallable(it, inserted));
+ }
+
+ struct FindCallable
+ {
+ // find() doesn't need any key memory management, so we don't work with
+ // any key holders here, only with normal keys. The key type is still
+ // different for every subtable, this is why it is a template parameter.
+ template <typename Submap, typename SubmapKey>
+ auto ALWAYS_INLINE operator()(Submap & map, const SubmapKey & key, size_t hash)
+ {
+ auto it = map.find(key, hash);
+ if (!it)
+ return decltype(&it->getMapped()){};
+ else
+ return &it->getMapped();
+ }
+ };
+
+ LookupResult ALWAYS_INLINE find(const Key & x)
+ {
+ return dispatch(*this, x, FindCallable{});
+ }
+
+ ConstLookupResult ALWAYS_INLINE find(const Key & x) const
+ {
+ return dispatch(*this, x, FindCallable{});
+ }
+
+ bool ALWAYS_INLINE has(const Key & x, size_t = 0) const
+ {
+ return dispatch(*this, x, FindCallable{}) != nullptr;
+ }
+#if 0
+ void write(DB::WriteBuffer & wb) const
+ {
+ m0.write(wb);
+ m1.write(wb);
+ m2.write(wb);
+ m3.write(wb);
+ ms.write(wb);
+ }
+
+ void writeText(DB::WriteBuffer & wb) const
+ {
+ m0.writeText(wb);
+ DB::writeChar(',', wb);
+ m1.writeText(wb);
+ DB::writeChar(',', wb);
+ m2.writeText(wb);
+ DB::writeChar(',', wb);
+ m3.writeText(wb);
+ DB::writeChar(',', wb);
+ ms.writeText(wb);
+ }
+
+ void read(DB::ReadBuffer & rb)
+ {
+ m0.read(rb);
+ m1.read(rb);
+ m2.read(rb);
+ m3.read(rb);
+ ms.read(rb);
+ }
+
+ void readText(DB::ReadBuffer & rb)
+ {
+ m0.readText(rb);
+ DB::assertChar(',', rb);
+ m1.readText(rb);
+ DB::assertChar(',', rb);
+ m2.readText(rb);
+ DB::assertChar(',', rb);
+ m3.readText(rb);
+ DB::assertChar(',', rb);
+ ms.readText(rb);
+ }
+#endif
+ size_t size() const { return m0.size() + m1.size() + m2.size() + m3.size() + ms.size(); }
+
+ bool empty() const { return m0.empty() && m1.empty() && m2.empty() && m3.empty() && ms.empty(); }
+
+ size_t getBufferSizeInBytes() const
+ {
+ return m0.getBufferSizeInBytes() + m1.getBufferSizeInBytes() + m2.getBufferSizeInBytes() + m3.getBufferSizeInBytes()
+ + ms.getBufferSizeInBytes();
+ }
+
+ void clearAndShrink()
+ {
+ m0.clearHasZero();
+ m1.clearAndShrink();
+ m2.clearAndShrink();
+ m3.clearAndShrink();
+ ms.clearAndShrink();
+ }
+};
+
+}
diff --git a/ydb/library/arrow_clickhouse/Common/PODArray.cpp b/ydb/library/arrow_clickhouse/Common/PODArray.cpp
new file mode 100644
index 00000000000..0840d8b7a01
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/Common/PODArray.cpp
@@ -0,0 +1,23 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#include <Common/PODArray.h>
+
+namespace CH
+{
+
+/// Used for left padding of PODArray when empty
+const char empty_pod_array[empty_pod_array_size]{};
+
+template class PODArray<UInt8, 4096, Allocator<false>, 15, 16>;
+template class PODArray<UInt16, 4096, Allocator<false>, 15, 16>;
+template class PODArray<UInt32, 4096, Allocator<false>, 15, 16>;
+template class PODArray<UInt64, 4096, Allocator<false>, 15, 16>;
+
+template class PODArray<Int8, 4096, Allocator<false>, 15, 16>;
+template class PODArray<Int16, 4096, Allocator<false>, 15, 16>;
+template class PODArray<Int32, 4096, Allocator<false>, 15, 16>;
+template class PODArray<Int64, 4096, Allocator<false>, 15, 16>;
+
+}
diff --git a/ydb/library/arrow_clickhouse/Common/PODArray.h b/ydb/library/arrow_clickhouse/Common/PODArray.h
new file mode 100644
index 00000000000..45924b02225
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/Common/PODArray.h
@@ -0,0 +1,803 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#pragma once
+#include "arrow_clickhouse_types.h"
+
+#include <string.h>
+#include <cstddef>
+#include <cassert>
+#include <algorithm>
+#include <memory>
+
+//#include <boost/noncopyable.hpp>
+
+#include <common/strong_typedef.h>
+
+#include <Common/Allocator.h>
+#include <Common/memcpySmall.h>
+
+#ifndef NDEBUG
+ #include <sys/mman.h>
+#endif
+
+#include <Common/PODArray_fwd.h>
+
+namespace CH
+{
+
+/** Whether we can use memcpy instead of a loop with element-wise assignment to T from U.
+ * It is OK if the types are the same, or if the types are integral and of the same size
+ * (for example: char, signed char, unsigned char).
+ * It is not OK for int and float.
+ * Don't forget to apply std::decay when using this constexpr.
+ */
+template <typename T, typename U>
+constexpr bool memcpy_can_be_used_for_assignment = std::is_same_v<T, U>
+ || (std::is_integral_v<T> && std::is_integral_v<U> && sizeof(T) == sizeof(U));
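+/// For example, memcpy_can_be_used_for_assignment<char, signed char> is true,
+/// while memcpy_can_be_used_for_assignment<int, float> is false.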
+
+
+/** A dynamic array for POD types.
+ * Designed for a small number of large arrays (rather than a lot of small ones).
+ * To be more precise - for use in ColumnVector.
+ * It differs from std::vector in that it does not initialize the elements.
+ *
+ * Made noncopyable so that there are no accidental copies. You can copy the data using the `assign` method.
+ *
+ * Only part of the std::vector interface is supported.
+ *
+ * The default constructor creates an empty object that does not allocate memory.
+ * When memory is first allocated, at least initial_bytes bytes are allocated.
+ *
+ * If you insert elements with push_back, without making a `reserve`, then PODArray is about 2.5 times faster than std::vector.
+ *
+ * The template parameter `pad_right`: always allocate this many unused bytes at the end of the array.
+ * Can be used for optimistic reading, writing and copying with unaligned SIMD instructions.
+ *
+ * The template parameter `pad_left`: always allocate memory before the 0th element of the array (rounded up to a whole number of elements)
+ * and zero-initialize the -1th element. This allows using the -1th element, which will have the value 0.
+ * This gives performance benefits when converting an array of offsets to an array of sizes.
+ *
+ * Some methods using allocator have TAllocatorParams variadic arguments.
+ * These arguments will be passed to corresponding methods of TAllocator.
+ * Example: pointer to Arena, that is used for allocations.
+ *
+ * Why isn't the Allocator passed through the constructor, as it is done in the C++ standard library?
+ * Because we sometimes have many small objects that share the same allocator with the same parameters,
+ * and we must avoid making each object larger by storing the same parameters in every one of them.
+ * This is required for states of aggregate functions.
+ *
+ * TODO Pass alignment to Allocator.
+ * TODO Allow greater alignment than alignof(T). Example: array of char aligned to page size.
+ */
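+/** A minimal usage sketch, assuming the PaddedPODArray alias from PODArray_fwd.h
+ * (`expected_rows` and `row_end` are placeholders):
+ *
+ * PaddedPODArray<UInt64> offsets;
+ * offsets.reserve(expected_rows);
+ * offsets.push_back(row_end);
+ *
+ * Converting offsets to sizes then benefits from the zero-initialized -1th element:
+ * sizes[i] = offsets[i] - offsets[i - 1] needs no branch for i == 0.
+ */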
+static constexpr size_t empty_pod_array_size = 1024;
+extern const char empty_pod_array[empty_pod_array_size];
+
+/** Base class that depends only on the size of the element, not on the element itself.
+ * You can static_cast to this class if you want to insert some data regardless of the actual type T.
+ */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wnull-dereference"
+
+inline size_t roundUpToPowerOfTwoOrZero(size_t n)
+{
+ // if the MSB is set, return n, to avoid returning zero
+ if (unlikely(n >= 0x8000000000000000ULL))
+ return n;
+
+ --n;
+ n |= n >> 1;
+ n |= n >> 2;
+ n |= n >> 4;
+ n |= n >> 8;
+ n |= n >> 16;
+ n |= n >> 32;
+ ++n;
+
+ return n;
+}
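+/// For example: roundUpToPowerOfTwoOrZero(0) == 0, roundUpToPowerOfTwoOrZero(5) == 8,
+/// roundUpToPowerOfTwoOrZero(1024) == 1024; values with the most significant bit set are returned as is.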
+
+template <size_t ELEMENT_SIZE, size_t initial_bytes, typename TAllocator, size_t pad_right_, size_t pad_left_>
+class PODArrayBase : /*private boost::noncopyable,*/ private TAllocator /// empty base optimization
+{
+protected:
+ /// Round padding up to a whole number of elements to simplify arithmetic.
+ static constexpr size_t pad_right = integerRoundUp(pad_right_, ELEMENT_SIZE);
+ /// pad_left is also rounded up to 16 bytes to maintain alignment of allocated memory.
+ static constexpr size_t pad_left = integerRoundUp(integerRoundUp(pad_left_, ELEMENT_SIZE), 16);
+ /// Empty array will point to this static memory as padding and begin/end.
+ static constexpr char * null = const_cast<char *>(empty_pod_array) + pad_left;
+
+ static_assert(pad_left <= empty_pod_array_size && "Left Padding exceeds empty_pod_array_size. Is the element size too large?");
+
+ // If we are using allocator with inline memory, the minimal size of
+ // array must be in sync with the size of this memory.
+ static_assert(allocatorInitialBytes<TAllocator> == 0
+ || allocatorInitialBytes<TAllocator> == initial_bytes);
+
+ char * c_start = null; /// Does not include pad_left.
+ char * c_end = null;
+ char * c_end_of_storage = null; /// Does not include pad_right.
+
+ /// The amount of memory occupied by num_elements elements.
+ static size_t byte_size(size_t num_elements)
+ {
+ size_t amount;
+ if (__builtin_mul_overflow(num_elements, ELEMENT_SIZE, &amount))
+ throw Exception("Amount of memory requested to allocate is more than allowed");
+ return amount;
+ }
+
+ /// Minimum amount of memory to allocate for num_elements, including padding.
+ static size_t minimum_memory_for_elements(size_t num_elements) { return byte_size(num_elements) + pad_right + pad_left; }
+
+ void alloc_for_num_elements(size_t num_elements)
+ {
+ alloc(minimum_memory_for_elements(num_elements));
+ }
+
+ template <typename ... TAllocatorParams>
+ void alloc(size_t bytes, TAllocatorParams &&... allocator_params)
+ {
+ char * allocated = reinterpret_cast<char *>(TAllocator::alloc(bytes, std::forward<TAllocatorParams>(allocator_params)...));
+
+ c_start = allocated + pad_left;
+ c_end = c_start;
+ c_end_of_storage = allocated + bytes - pad_right;
+
+ if (pad_left)
+ memset(c_start - ELEMENT_SIZE, 0, ELEMENT_SIZE);
+ }
+
+ void dealloc()
+ {
+ if (c_start == null)
+ return;
+
+ unprotect();
+
+ TAllocator::free(c_start - pad_left, allocated_bytes());
+ }
+
+ template <typename ... TAllocatorParams>
+ void realloc(size_t bytes, TAllocatorParams &&... allocator_params)
+ {
+ if (c_start == null)
+ {
+ alloc(bytes, std::forward<TAllocatorParams>(allocator_params)...);
+ return;
+ }
+
+ unprotect();
+
+ ptrdiff_t end_diff = c_end - c_start;
+
+ char * allocated = reinterpret_cast<char *>(
+ TAllocator::realloc(c_start - pad_left, allocated_bytes(), bytes, std::forward<TAllocatorParams>(allocator_params)...));
+
+ c_start = allocated + pad_left;
+ c_end = c_start + end_diff;
+ c_end_of_storage = allocated + bytes - pad_right;
+ }
+
+ bool isInitialized() const
+ {
+ return (c_start != null) && (c_end != null) && (c_end_of_storage != null);
+ }
+
+ bool isAllocatedFromStack() const
+ {
+ static constexpr size_t stack_threshold = TAllocator::getStackThreshold();
+ return (stack_threshold > 0) && (allocated_bytes() <= stack_threshold);
+ }
+
+ template <typename ... TAllocatorParams>
+ void reserveForNextSize(TAllocatorParams &&... allocator_params)
+ {
+ if (empty())
+ {
+ // The allocated memory should be a multiple of ELEMENT_SIZE to hold the element; otherwise,
+ // memory issues such as corruption could appear in edge cases.
+ realloc(std::max(integerRoundUp(initial_bytes, ELEMENT_SIZE),
+ minimum_memory_for_elements(1)),
+ std::forward<TAllocatorParams>(allocator_params)...);
+ }
+ else
+ realloc(allocated_bytes() * 2, std::forward<TAllocatorParams>(allocator_params)...);
+ }
+
+#ifndef NDEBUG
+ /// Make memory region readonly with mprotect if it is large enough.
+ /// The operation is slow and performed only for debug builds.
+ void protectImpl(int prot)
+ {
+ static constexpr size_t PROTECT_PAGE_SIZE = 4096;
+
+ char * left_rounded_up = reinterpret_cast<char *>((reinterpret_cast<intptr_t>(c_start) - pad_left + PROTECT_PAGE_SIZE - 1) / PROTECT_PAGE_SIZE * PROTECT_PAGE_SIZE);
+ char * right_rounded_down = reinterpret_cast<char *>((reinterpret_cast<intptr_t>(c_end_of_storage) + pad_right) / PROTECT_PAGE_SIZE * PROTECT_PAGE_SIZE);
+
+ if (right_rounded_down > left_rounded_up)
+ {
+ size_t length = right_rounded_down - left_rounded_up;
+ if (0 != mprotect(left_rounded_up, length, prot))
+ throw Exception("Cannot mprotect memory region");
+ }
+ }
+
+ /// Restore memory protection in destructor or realloc for further reuse by allocator.
+ bool mprotected = false;
+#endif
+
+public:
+ bool empty() const { return c_end == c_start; }
+ size_t size() const { return (c_end - c_start) / ELEMENT_SIZE; }
+ size_t capacity() const { return (c_end_of_storage - c_start) / ELEMENT_SIZE; }
+
+ /// This method is safe to use only for information about memory usage.
+ size_t allocated_bytes() const { return c_end_of_storage - c_start + pad_right + pad_left; }
+
+ void clear() { c_end = c_start; }
+
+ template <typename ... TAllocatorParams>
+#if defined(__clang__)
+ ALWAYS_INLINE /// Better performance in clang build, worse performance in gcc build.
+#endif
+ void reserve(size_t n, TAllocatorParams &&... allocator_params)
+ {
+ if (n > capacity())
+ realloc(roundUpToPowerOfTwoOrZero(minimum_memory_for_elements(n)), std::forward<TAllocatorParams>(allocator_params)...);
+ }
+
+ template <typename ... TAllocatorParams>
+ void reserve_exact(size_t n, TAllocatorParams &&... allocator_params)
+ {
+ if (n > capacity())
+ realloc(minimum_memory_for_elements(n), std::forward<TAllocatorParams>(allocator_params)...);
+ }
+
+ template <typename ... TAllocatorParams>
+ void resize(size_t n, TAllocatorParams &&... allocator_params)
+ {
+ reserve(n, std::forward<TAllocatorParams>(allocator_params)...);
+ resize_assume_reserved(n);
+ }
+
+ template <typename ... TAllocatorParams>
+ void resize_exact(size_t n, TAllocatorParams &&... allocator_params)
+ {
+ reserve_exact(n, std::forward<TAllocatorParams>(allocator_params)...);
+ resize_assume_reserved(n);
+ }
+
+ void resize_assume_reserved(const size_t n)
+ {
+ c_end = c_start + byte_size(n);
+ }
+
+ const char * raw_data() const
+ {
+ return c_start;
+ }
+
+ template <typename ... TAllocatorParams>
+ void push_back_raw(const void * ptr, TAllocatorParams &&... allocator_params)
+ {
+ size_t required_capacity = size() + ELEMENT_SIZE;
+ if (unlikely(required_capacity > capacity()))
+ reserve(required_capacity, std::forward<TAllocatorParams>(allocator_params)...);
+
+ memcpy(c_end, ptr, ELEMENT_SIZE);
+ c_end += ELEMENT_SIZE;
+ }
+
+ void protect()
+ {
+#ifndef NDEBUG
+ protectImpl(PROT_READ);
+ mprotected = true;
+#endif
+ }
+
+ void unprotect()
+ {
+#ifndef NDEBUG
+ if (mprotected)
+ protectImpl(PROT_WRITE);
+ mprotected = false;
+#endif
+ }
+
+ template <typename It1, typename It2>
+ inline void assertNotIntersects(It1 from_begin [[maybe_unused]], It2 from_end [[maybe_unused]])
+ {
+#if !defined(NDEBUG)
+ const char * ptr_begin = reinterpret_cast<const char *>(&*from_begin);
+ const char * ptr_end = reinterpret_cast<const char *>(&*from_end);
+
+ /// Also it's safe if the range is empty.
+ assert(!((ptr_begin >= c_start && ptr_begin < c_end) || (ptr_end > c_start && ptr_end <= c_end)) || (ptr_begin == ptr_end));
+#endif
+ }
+
+ ~PODArrayBase()
+ {
+ dealloc();
+ }
+};
+
+template <typename T, size_t initial_bytes, typename TAllocator, size_t pad_right_, size_t pad_left_>
+class PODArray : public PODArrayBase<sizeof(T), initial_bytes, TAllocator, pad_right_, pad_left_>
+{
+protected:
+ using Base = PODArrayBase<sizeof(T), initial_bytes, TAllocator, pad_right_, pad_left_>;
+
+ T * t_start() { return reinterpret_cast<T *>(this->c_start); }
+ T * t_end() { return reinterpret_cast<T *>(this->c_end); }
+
+ const T * t_start() const { return reinterpret_cast<const T *>(this->c_start); }
+ const T * t_end() const { return reinterpret_cast<const T *>(this->c_end); }
+
+public:
+ using value_type = T;
+
+ /// We cannot use boost::iterator_adaptor, because it defeats loop vectorization,
+ /// see https://github.com/ClickHouse/ClickHouse/pull/9442
+
+ using iterator = T *;
+ using const_iterator = const T *;
+
+
+ PODArray() = default;
+
+ PODArray(size_t n)
+ {
+ this->alloc_for_num_elements(n);
+ this->c_end += this->byte_size(n);
+ }
+
+ PODArray(size_t n, const T & x)
+ {
+ this->alloc_for_num_elements(n);
+ assign(n, x);
+ }
+
+ PODArray(const_iterator from_begin, const_iterator from_end)
+ {
+ this->alloc_for_num_elements(from_end - from_begin);
+ insert(from_begin, from_end);
+ }
+
+ PODArray(std::initializer_list<T> il)
+ {
+ this->reserve(std::size(il));
+
+ for (const auto & x : il)
+ {
+ this->push_back(x);
+ }
+ }
+
+ PODArray(PODArray && other)
+ {
+ this->swap(other);
+ }
+
+ PODArray & operator=(PODArray && other)
+ {
+ this->swap(other);
+ return *this;
+ }
+
+ T * data() { return t_start(); }
+ const T * data() const { return t_start(); }
+
+ /// The index is signed to access -1th element without pointer overflow.
+ T & operator[] (ssize_t n)
+ {
+ /// <= size, because taking address of one element past memory range is Ok in C++ (expression like &arr[arr.size()] is perfectly valid).
+ assert((n >= (static_cast<ssize_t>(pad_left_) ? -1 : 0)) && (n <= static_cast<ssize_t>(this->size())));
+ return t_start()[n];
+ }
+
+ const T & operator[] (ssize_t n) const
+ {
+ assert((n >= (static_cast<ssize_t>(pad_left_) ? -1 : 0)) && (n <= static_cast<ssize_t>(this->size())));
+ return t_start()[n];
+ }
+
+ T & front() { return t_start()[0]; }
+ T & back() { return t_end()[-1]; }
+ const T & front() const { return t_start()[0]; }
+ const T & back() const { return t_end()[-1]; }
+
+ iterator begin() { return t_start(); }
+ iterator end() { return t_end(); }
+ const_iterator begin() const { return t_start(); }
+ const_iterator end() const { return t_end(); }
+ const_iterator cbegin() const { return t_start(); }
+ const_iterator cend() const { return t_end(); }
+
+ /// Same as resize, but zeroes new elements.
+ void resize_fill(size_t n)
+ {
+ size_t old_size = this->size();
+ if (n > old_size)
+ {
+ this->reserve(n);
+ memset(this->c_end, 0, this->byte_size(n - old_size));
+ }
+ this->c_end = this->c_start + this->byte_size(n);
+ }
+
+ void resize_fill(size_t n, const T & value)
+ {
+ size_t old_size = this->size();
+ if (n > old_size)
+ {
+ this->reserve(n);
+ std::fill(t_end(), t_end() + n - old_size, value);
+ }
+ this->c_end = this->c_start + this->byte_size(n);
+ }
+
+ template <typename U, typename ... TAllocatorParams>
+ void push_back(U && x, TAllocatorParams &&... allocator_params)
+ {
+ if (unlikely(this->c_end + sizeof(T) > this->c_end_of_storage))
+ this->reserveForNextSize(std::forward<TAllocatorParams>(allocator_params)...);
+
+ new (t_end()) T(std::forward<U>(x));
+ this->c_end += this->byte_size(1);
+ }
+
+    /** This method doesn't allow passing parameters for the Allocator,
+      * so it can't be used if the Allocator requires custom parameters.
+      */
+ template <typename... Args>
+ void emplace_back(Args &&... args)
+ {
+ if (unlikely(this->c_end + sizeof(T) > this->c_end_of_storage))
+ this->reserveForNextSize();
+
+ new (t_end()) T(std::forward<Args>(args)...);
+ this->c_end += this->byte_size(1);
+ }
+
+ void pop_back()
+ {
+ this->c_end -= this->byte_size(1);
+ }
+
+    /// Do not insert a piece of the array into itself, because a resize can invalidate iterators into it.
+ template <typename It1, typename It2, typename ... TAllocatorParams>
+ void insertPrepare(It1 from_begin, It2 from_end, TAllocatorParams &&... allocator_params)
+ {
+ this->assertNotIntersects(from_begin, from_end);
+ size_t required_capacity = this->size() + (from_end - from_begin);
+ if (required_capacity > this->capacity())
+ this->reserve(roundUpToPowerOfTwoOrZero(required_capacity), std::forward<TAllocatorParams>(allocator_params)...);
+ }
+
+    /// Do not insert a piece of the array into itself, because a resize can invalidate iterators into it.
+ template <typename It1, typename It2, typename ... TAllocatorParams>
+ void insert(It1 from_begin, It2 from_end, TAllocatorParams &&... allocator_params)
+ {
+ insertPrepare(from_begin, from_end, std::forward<TAllocatorParams>(allocator_params)...);
+ insert_assume_reserved(from_begin, from_end);
+ }
+
+    /// In contrast to 'insert', this method is OK even when inserting from itself,
+    /// because we obtain the iterators after reserving memory.
+ template <typename Container, typename ... TAllocatorParams>
+ void insertByOffsets(Container && rhs, size_t from_begin, size_t from_end, TAllocatorParams &&... allocator_params)
+ {
+ static_assert(memcpy_can_be_used_for_assignment<std::decay_t<T>, std::decay_t<decltype(rhs.front())>>);
+
+ assert(from_end >= from_begin);
+ assert(from_end <= rhs.size());
+
+ size_t required_capacity = this->size() + (from_end - from_begin);
+ if (required_capacity > this->capacity())
+ this->reserve(roundUpToPowerOfTwoOrZero(required_capacity), std::forward<TAllocatorParams>(allocator_params)...);
+
+ size_t bytes_to_copy = this->byte_size(from_end - from_begin);
+ if (bytes_to_copy)
+ {
+ memcpy(this->c_end, reinterpret_cast<const void *>(rhs.begin() + from_begin), bytes_to_copy);
+ this->c_end += bytes_to_copy;
+ }
+ }
+
+    /// Works under the assumption that it is possible to read up to 15 excess bytes after `from_end` and that this PODArray is padded.
+ template <typename It1, typename It2, typename ... TAllocatorParams>
+ void insertSmallAllowReadWriteOverflow15(It1 from_begin, It2 from_end, TAllocatorParams &&... allocator_params)
+ {
+ static_assert(pad_right_ >= 15);
+ static_assert(sizeof(T) == sizeof(*from_begin));
+ insertPrepare(from_begin, from_end, std::forward<TAllocatorParams>(allocator_params)...);
+ size_t bytes_to_copy = this->byte_size(from_end - from_begin);
+ memcpySmallAllowReadWriteOverflow15(this->c_end, reinterpret_cast<const void *>(&*from_begin), bytes_to_copy);
+ this->c_end += bytes_to_copy;
+ }
+
+    /// Do not insert a piece of the array into itself, because a resize can invalidate iterators into it.
+ template <typename It1, typename It2>
+ void insert(iterator it, It1 from_begin, It2 from_end)
+ {
+ static_assert(memcpy_can_be_used_for_assignment<std::decay_t<T>, std::decay_t<decltype(*from_begin)>>);
+
+ size_t bytes_to_copy = this->byte_size(from_end - from_begin);
+ if (!bytes_to_copy)
+ return;
+
+ size_t bytes_to_move = this->byte_size(end() - it);
+
+ insertPrepare(from_begin, from_end);
+
+ if (unlikely(bytes_to_move))
+ memmove(this->c_end + bytes_to_copy - bytes_to_move, this->c_end - bytes_to_move, bytes_to_move);
+
+ memcpy(this->c_end - bytes_to_move, reinterpret_cast<const void *>(&*from_begin), bytes_to_copy);
+
+ this->c_end += bytes_to_copy;
+ }
+
+ template <typename ... TAllocatorParams>
+ void insertFromItself(iterator from_begin, iterator from_end, TAllocatorParams && ... allocator_params)
+ {
+ static_assert(memcpy_can_be_used_for_assignment<std::decay_t<T>, std::decay_t<decltype(*from_begin)>>);
+
+ /// Convert iterators to indexes because reserve can invalidate iterators
+ size_t start_index = from_begin - begin();
+ size_t end_index = from_end - begin();
+ size_t copy_size = end_index - start_index;
+
+ assert(start_index <= end_index);
+
+ size_t required_capacity = this->size() + copy_size;
+ if (required_capacity > this->capacity())
+ this->reserve(roundUpToPowerOfTwoOrZero(required_capacity), std::forward<TAllocatorParams>(allocator_params)...);
+
+ size_t bytes_to_copy = this->byte_size(copy_size);
+ if (bytes_to_copy)
+ {
+ auto begin = this->c_start + this->byte_size(start_index);
+ memcpy(this->c_end, reinterpret_cast<const void *>(&*begin), bytes_to_copy);
+ this->c_end += bytes_to_copy;
+ }
+ }
+
+ template <typename It1, typename It2>
+ void insert_assume_reserved(It1 from_begin, It2 from_end)
+ {
+ static_assert(memcpy_can_be_used_for_assignment<std::decay_t<T>, std::decay_t<decltype(*from_begin)>>);
+ this->assertNotIntersects(from_begin, from_end);
+
+ size_t bytes_to_copy = this->byte_size(from_end - from_begin);
+ if (bytes_to_copy)
+ {
+ memcpy(this->c_end, reinterpret_cast<const void *>(&*from_begin), bytes_to_copy);
+ this->c_end += bytes_to_copy;
+ }
+ }
+
+ template <typename... TAllocatorParams>
+ void swap(PODArray & rhs, TAllocatorParams &&... allocator_params)
+ {
+#ifndef NDEBUG
+ this->unprotect();
+ rhs.unprotect();
+#endif
+
+ /// Swap two PODArray objects, arr1 and arr2, that satisfy the following conditions:
+ /// - The elements of arr1 are stored on stack.
+ /// - The elements of arr2 are stored on heap.
+ auto swap_stack_heap = [&](PODArray & arr1, PODArray & arr2)
+ {
+ size_t stack_size = arr1.size();
+ size_t stack_allocated = arr1.allocated_bytes();
+
+ size_t heap_size = arr2.size();
+ size_t heap_allocated = arr2.allocated_bytes();
+
+ /// Keep track of the stack content we have to copy.
+ char * stack_c_start = arr1.c_start;
+
+ /// arr1 takes ownership of the heap memory of arr2.
+ arr1.c_start = arr2.c_start;
+ arr1.c_end_of_storage = arr1.c_start + heap_allocated - arr2.pad_right - arr2.pad_left;
+ arr1.c_end = arr1.c_start + this->byte_size(heap_size);
+
+ /// Allocate stack space for arr2.
+ arr2.alloc(stack_allocated, std::forward<TAllocatorParams>(allocator_params)...);
+ /// Copy the stack content.
+ memcpy(arr2.c_start, stack_c_start, this->byte_size(stack_size));
+ arr2.c_end = arr2.c_start + this->byte_size(stack_size);
+ };
+
+ auto do_move = [&](PODArray & src, PODArray & dest)
+ {
+ if (src.isAllocatedFromStack())
+ {
+ dest.dealloc();
+ dest.alloc(src.allocated_bytes(), std::forward<TAllocatorParams>(allocator_params)...);
+ memcpy(dest.c_start, src.c_start, this->byte_size(src.size()));
+ dest.c_end = dest.c_start + this->byte_size(src.size());
+
+ src.c_start = Base::null;
+ src.c_end = Base::null;
+ src.c_end_of_storage = Base::null;
+ }
+ else
+ {
+ std::swap(dest.c_start, src.c_start);
+ std::swap(dest.c_end, src.c_end);
+ std::swap(dest.c_end_of_storage, src.c_end_of_storage);
+ }
+ };
+
+ if (!this->isInitialized() && !rhs.isInitialized())
+ {
+ return;
+ }
+ else if (!this->isInitialized() && rhs.isInitialized())
+ {
+ do_move(rhs, *this);
+ return;
+ }
+ else if (this->isInitialized() && !rhs.isInitialized())
+ {
+ do_move(*this, rhs);
+ return;
+ }
+
+ if (this->isAllocatedFromStack() && rhs.isAllocatedFromStack())
+ {
+ size_t min_size = std::min(this->size(), rhs.size());
+ size_t max_size = std::max(this->size(), rhs.size());
+
+ for (size_t i = 0; i < min_size; ++i)
+ std::swap(this->operator[](i), rhs[i]);
+
+ if (this->size() == max_size)
+ {
+ for (size_t i = min_size; i < max_size; ++i)
+ rhs[i] = this->operator[](i);
+ }
+ else
+ {
+ for (size_t i = min_size; i < max_size; ++i)
+ this->operator[](i) = rhs[i];
+ }
+
+ size_t lhs_size = this->size();
+ size_t lhs_allocated = this->allocated_bytes();
+
+ size_t rhs_size = rhs.size();
+ size_t rhs_allocated = rhs.allocated_bytes();
+
+ this->c_end_of_storage = this->c_start + rhs_allocated - Base::pad_right - Base::pad_left;
+ rhs.c_end_of_storage = rhs.c_start + lhs_allocated - Base::pad_right - Base::pad_left;
+
+ this->c_end = this->c_start + this->byte_size(rhs_size);
+ rhs.c_end = rhs.c_start + this->byte_size(lhs_size);
+ }
+ else if (this->isAllocatedFromStack() && !rhs.isAllocatedFromStack())
+ {
+ swap_stack_heap(*this, rhs);
+ }
+ else if (!this->isAllocatedFromStack() && rhs.isAllocatedFromStack())
+ {
+ swap_stack_heap(rhs, *this);
+ }
+ else
+ {
+ std::swap(this->c_start, rhs.c_start);
+ std::swap(this->c_end, rhs.c_end);
+ std::swap(this->c_end_of_storage, rhs.c_end_of_storage);
+ }
+ }
+
+ template <typename... TAllocatorParams>
+ void assign(size_t n, const T & x, TAllocatorParams &&... allocator_params)
+ {
+ this->resize_exact(n, std::forward<TAllocatorParams>(allocator_params)...);
+ std::fill(begin(), end(), x);
+ }
+
+ template <typename It1, typename It2, typename... TAllocatorParams>
+ void assign(It1 from_begin, It2 from_end, TAllocatorParams &&... allocator_params)
+ {
+ static_assert(memcpy_can_be_used_for_assignment<std::decay_t<T>, std::decay_t<decltype(*from_begin)>>);
+ this->assertNotIntersects(from_begin, from_end);
+
+ size_t required_capacity = from_end - from_begin;
+ if (required_capacity > this->capacity())
+ this->reserve_exact(required_capacity, std::forward<TAllocatorParams>(allocator_params)...);
+
+ size_t bytes_to_copy = this->byte_size(required_capacity);
+ if (bytes_to_copy)
+ {
+ memcpy(this->c_start, reinterpret_cast<const void *>(&*from_begin), bytes_to_copy);
+ this->c_end = this->c_start + bytes_to_copy;
+ }
+ }
+
+ // ISO C++ has strict ambiguity rules, thus we cannot apply TAllocatorParams here.
+ void assign(const PODArray & from)
+ {
+ assign(from.begin(), from.end());
+ }
+
+ void erase(const_iterator first, const_iterator last)
+ {
+ iterator first_no_const = const_cast<iterator>(first);
+ iterator last_no_const = const_cast<iterator>(last);
+
+ size_t items_to_move = end() - last;
+
+ while (items_to_move != 0)
+ {
+ *first_no_const = *last_no_const;
+
+ ++first_no_const;
+ ++last_no_const;
+
+ --items_to_move;
+ }
+
+ this->c_end = reinterpret_cast<char *>(first_no_const);
+ }
+
+ void erase(const_iterator pos)
+ {
+ this->erase(pos, pos + 1);
+ }
+
+ bool operator== (const PODArray & rhs) const
+ {
+ if (this->size() != rhs.size())
+ return false;
+
+ const_iterator lhs_it = begin();
+ const_iterator rhs_it = rhs.begin();
+
+ while (lhs_it != end())
+ {
+ if (*lhs_it != *rhs_it)
+ return false;
+
+ ++lhs_it;
+ ++rhs_it;
+ }
+
+ return true;
+ }
+
+ bool operator!= (const PODArray & rhs) const
+ {
+ return !operator==(rhs);
+ }
+};
+
+template <typename T, size_t initial_bytes, typename TAllocator, size_t pad_right_, size_t pad_left_>
+void swap(PODArray<T, initial_bytes, TAllocator, pad_right_, pad_left_> & lhs, PODArray<T, initial_bytes, TAllocator, pad_right_, pad_left_> & rhs)
+{
+ lhs.swap(rhs);
+}
+#pragma GCC diagnostic pop
+
+/// Prevent implicit template instantiation of PODArray for common numeric types
+
+extern template class PODArray<UInt8, 4096, Allocator<false>, 15, 16>;
+extern template class PODArray<UInt16, 4096, Allocator<false>, 15, 16>;
+extern template class PODArray<UInt32, 4096, Allocator<false>, 15, 16>;
+extern template class PODArray<UInt64, 4096, Allocator<false>, 15, 16>;
+
+extern template class PODArray<Int8, 4096, Allocator<false>, 15, 16>;
+extern template class PODArray<Int16, 4096, Allocator<false>, 15, 16>;
+extern template class PODArray<Int32, 4096, Allocator<false>, 15, 16>;
+extern template class PODArray<Int64, 4096, Allocator<false>, 15, 16>;
+
+}
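
For readers new to this container, a minimal usage sketch follows (illustrative only, not from the original sources); it relies only on what the class above defines: uninitialized growth, power-of-two reservation in reserve(), and zero-filling resize_fill().

    #include <Common/PODArray.h>
    #include <cstdint>

    void podarray_sketch()
    {
        CH::PODArray<uint32_t> values;   // POD-only container: no per-element constructors/destructors
        values.reserve(8);               // reserve() rounds the allocation up to a power of two
        for (uint32_t i = 0; i < 8; ++i)
            values.push_back(i);

        values.resize_fill(16);          // resize() leaves new elements uninitialized; resize_fill() zeroes them

        uint32_t sum = 0;
        for (uint32_t x : values)        // iterators are plain pointers, so such loops vectorize well
            sum += x;                    // 0 + 1 + ... + 7 == 28; the zero-filled tail adds nothing
        (void)sum;
    }
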
diff --git a/ydb/library/arrow_clickhouse/Common/PODArray_fwd.h b/ydb/library/arrow_clickhouse/Common/PODArray_fwd.h
new file mode 100644
index 00000000000..8be2d0590a0
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/Common/PODArray_fwd.h
@@ -0,0 +1,40 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#pragma once
+/**
+ * This file contains some using-declarations that define various kinds of
+ * PODArray.
+ */
+
+#include <common/types.h>
+#include <Common/Allocator_fwd.h>
+
+namespace CH
+{
+
+inline constexpr size_t integerRoundUp(size_t value, size_t dividend)
+{
+ return ((value + dividend - 1) / dividend) * dividend;
+}
+
+template <typename T, size_t initial_bytes = 4096,
+ typename TAllocator = Allocator<false>, size_t pad_right_ = 0,
+ size_t pad_left_ = 0>
+class PODArray;
+
+/** For columns. Padding is enough to read and write xmm-register at the address of the last element. */
+template <typename T, size_t initial_bytes = 4096, typename TAllocator = Allocator<false>>
+using PaddedPODArray = PODArray<T, initial_bytes, TAllocator, 15, 16>;
+
+/** A helper for declaring PODArray that uses inline memory.
+ * The initial size is set to use all the inline bytes, since using less would
+ * only add some extra allocation calls.
+ */
+template <typename T, size_t inline_bytes,
+ size_t rounded_bytes = integerRoundUp(inline_bytes, sizeof(T))>
+using PODArrayWithStackMemory = PODArray<T, rounded_bytes,
+ AllocatorWithStackMemory<Allocator<false>, rounded_bytes, alignof(T)>>;
+
+}
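
A short sketch of the two aliases defined here (illustrative only; the element types and sizes are arbitrary examples):

    #include <Common/PODArray.h>
    #include <cstdint>

    void podarray_alias_sketch()
    {
        // Column-style array: 16 bytes of left padding and 15 bytes of right padding
        // allow whole xmm-register reads/writes at both edges of the data.
        CH::PaddedPODArray<uint8_t> filter;
        filter.resize_fill(1024, 1);

        // Array with inline storage: stays in the 64 bytes embedded in the object
        // until it outgrows them, avoiding heap allocations for small states.
        CH::PODArrayWithStackMemory<uint64_t, 64> offsets;
        offsets.push_back(42);
    }
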
diff --git a/ydb/library/arrow_clickhouse/Common/SipHash.h b/ydb/library/arrow_clickhouse/Common/SipHash.h
new file mode 100644
index 00000000000..ac9c21c7d90
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/Common/SipHash.h
@@ -0,0 +1,219 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#pragma once
+#include "arrow_clickhouse_types.h"
+
+/** SipHash is a fast cryptographic hash function for short strings.
+ * Taken from here: https://www.131002.net/siphash/
+ *
+ * This is the SipHash 2-4 variant.
+ *
+ * Two changes were made:
+ * - it can also return 128 bits, not only 64;
+ * - it works in streaming mode (the hash can be calculated in parts).
+ *
+ * On short strings (URL, search phrases) more than 3 times faster than MD5 from OpenSSL.
+ * (~ 700 MB/sec, 15 million strings per second)
+ */
+
+#include <common/types.h>
+#include <common/unaligned.h>
+#include <string>
+#include <type_traits>
+#include <cstddef>
+
+namespace CH
+{
+
+#define ROTL(x, b) static_cast<UInt64>(((x) << (b)) | ((x) >> (64 - (b))))
+
+#define SIPROUND \
+ do \
+ { \
+ v0 += v1; v1 = ROTL(v1, 13); v1 ^= v0; v0 = ROTL(v0, 32); \
+ v2 += v3; v3 = ROTL(v3, 16); v3 ^= v2; \
+ v0 += v3; v3 = ROTL(v3, 21); v3 ^= v0; \
+ v2 += v1; v1 = ROTL(v1, 17); v1 ^= v2; v2 = ROTL(v2, 32); \
+ } while(0)
+
+
+class SipHash
+{
+private:
+ /// State.
+ UInt64 v0;
+ UInt64 v1;
+ UInt64 v2;
+ UInt64 v3;
+
+ /// How many bytes have been processed.
+ UInt64 cnt;
+
+ /// The current 8 bytes of input data.
+ union
+ {
+ UInt64 current_word;
+ UInt8 current_bytes[8];
+ };
+
+ ALWAYS_INLINE void finalize()
+ {
+ /// In the last free byte, we write the remainder of the division by 256.
+ current_bytes[7] = cnt;
+
+ v3 ^= current_word;
+ SIPROUND;
+ SIPROUND;
+ v0 ^= current_word;
+
+ v2 ^= 0xff;
+ SIPROUND;
+ SIPROUND;
+ SIPROUND;
+ SIPROUND;
+ }
+
+public:
+ /// Arguments - seed.
+ SipHash(UInt64 k0 = 0, UInt64 k1 = 0)
+ {
+ /// Initialize the state with some random bytes and seed.
+ v0 = 0x736f6d6570736575ULL ^ k0;
+ v1 = 0x646f72616e646f6dULL ^ k1;
+ v2 = 0x6c7967656e657261ULL ^ k0;
+ v3 = 0x7465646279746573ULL ^ k1;
+
+ cnt = 0;
+ current_word = 0;
+ }
+
+ void update(const char * data, UInt64 size)
+ {
+ const char * end = data + size;
+
+        /// We'll finish processing the remainder of the previous update, if any.
+ if (cnt & 7)
+ {
+ while (cnt & 7 && data < end)
+ {
+ current_bytes[cnt & 7] = *data;
+ ++data;
+ ++cnt;
+ }
+
+            /// If we still do not have enough bytes for an 8-byte word.
+ if (cnt & 7)
+ return;
+
+ v3 ^= current_word;
+ SIPROUND;
+ SIPROUND;
+ v0 ^= current_word;
+ }
+
+ cnt += end - data;
+
+ while (data + 8 <= end)
+ {
+ current_word = unalignedLoad<UInt64>(data);
+
+ v3 ^= current_word;
+ SIPROUND;
+ SIPROUND;
+ v0 ^= current_word;
+
+ data += 8;
+ }
+
+ /// Pad the remainder, which is missing up to an 8-byte word.
+ current_word = 0;
+ switch (end - data)
+ {
+ case 7: current_bytes[6] = data[6]; [[fallthrough]];
+ case 6: current_bytes[5] = data[5]; [[fallthrough]];
+ case 5: current_bytes[4] = data[4]; [[fallthrough]];
+ case 4: current_bytes[3] = data[3]; [[fallthrough]];
+ case 3: current_bytes[2] = data[2]; [[fallthrough]];
+ case 2: current_bytes[1] = data[1]; [[fallthrough]];
+ case 1: current_bytes[0] = data[0]; [[fallthrough]];
+ case 0: break;
+ }
+ }
+
+ template <typename T>
+ void update(const T & x)
+ {
+ update(reinterpret_cast<const char *>(&x), sizeof(x));
+ }
+
+ void update(const std::string & x)
+ {
+ update(x.data(), x.length());
+ }
+
+ /// Get the result in some form. This can only be done once!
+
+ void get128(char * out)
+ {
+ finalize();
+ unalignedStore<UInt64>(out, v0 ^ v1);
+ unalignedStore<UInt64>(out + 8, v2 ^ v3);
+ }
+
+ template <typename T>
+ ALWAYS_INLINE void get128(T & lo, T & hi)
+ {
+ static_assert(sizeof(T) == 8);
+ finalize();
+ lo = v0 ^ v1;
+ hi = v2 ^ v3;
+ }
+
+ template <typename T>
+ ALWAYS_INLINE void get128(T & dst)
+ {
+ static_assert(sizeof(T) == 16);
+ get128(reinterpret_cast<char *>(&dst));
+ }
+
+ UInt64 get64()
+ {
+ finalize();
+ return v0 ^ v1 ^ v2 ^ v3;
+ }
+};
+
+
+#undef ROTL
+#undef SIPROUND
+
+inline void sipHash128(const char * data, const size_t size, char * out)
+{
+ SipHash hash;
+ hash.update(data, size);
+ hash.get128(out);
+}
+
+inline UInt64 sipHash64(const char * data, const size_t size)
+{
+ SipHash hash;
+ hash.update(data, size);
+ return hash.get64();
+}
+
+template <typename T>
+UInt64 sipHash64(const T & x)
+{
+ SipHash hash;
+ hash.update(x);
+ return hash.get64();
+}
+
+inline UInt64 sipHash64(const std::string & s)
+{
+ return sipHash64(s.data(), s.size());
+}
+
+}
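
A small usage sketch for the class above (illustrative only): the one-shot helpers and the streaming interface produce the same digest for the same bytes, and a finalizing call such as get64() may be made only once per SipHash instance.

    #include <Common/SipHash.h>
    #include <cassert>
    #include <string>

    void siphash_sketch()
    {
        const std::string key = "group_by_key";

        // One-shot helper.
        auto h1 = CH::sipHash64(key);

        // Streaming: feed the same bytes in two parts.
        CH::SipHash hash;
        hash.update(key.data(), 5);
        hash.update(key.data() + 5, key.size() - 5);
        auto h2 = hash.get64();          // finalizes the state; do not call get64()/get128() again

        assert(h1 == h2);
    }
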
diff --git a/ydb/library/arrow_clickhouse/Common/memcpySmall.h b/ydb/library/arrow_clickhouse/Common/memcpySmall.h
new file mode 100644
index 00000000000..113342a4d7d
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/Common/memcpySmall.h
@@ -0,0 +1,78 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#pragma once
+
+#include <string.h>
+
+#ifdef __SSE2__
+#include <emmintrin.h>
+
+namespace CH
+{
+
+/** The memcpy function can be suboptimal if all of the following conditions are met:
+ * 1. The size of the memory region is relatively small (approximately under 50 bytes).
+ * 2. The size of the memory region is not known at compile time.
+ *
+ * In that case, memcpy is suboptimal for the following reasons:
+ * 1. The function is not inlined.
+ * 2. Much time and many instructions are spent processing the "tails" of the data.
+ *
+ * There are cases when the function can be implemented more optimally with the help of some assumptions.
+ * One such assumption is the ability to read and write some number of bytes past the end of the passed memory regions.
+ * Under that assumption, there is no need for complicated tail handling, and the copy can always proceed in big chunks.
+ *
+ * This case is typical, for example, when many small pieces of data are gathered into a single contiguous piece of memory in a loop,
+ * because each subsequent copy overwrites the excess data left after the previous one.
+ *
+ * The assumption that the memory region is small enough lets us avoid unrolling the loop,
+ * which would be slower when the memory region is actually big.
+ *
+ * Use with caution.
+ */
+
+namespace detail
+{
+ inline void memcpySmallAllowReadWriteOverflow15Impl(char * __restrict dst, const char * __restrict src, ssize_t n)
+ {
+ while (n > 0)
+ {
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(dst),
+ _mm_loadu_si128(reinterpret_cast<const __m128i *>(src)));
+
+ dst += 16;
+ src += 16;
+ n -= 16;
+ }
+ }
+}
+
+/** Works under the assumption that it is possible to read up to 15 excess bytes after the end of the 'src' region
+  * and to write arbitrary garbage into up to 15 bytes after the end of the 'dst' region.
+  */
+inline void memcpySmallAllowReadWriteOverflow15(void * __restrict dst, const void * __restrict src, size_t n)
+{
+ detail::memcpySmallAllowReadWriteOverflow15Impl(reinterpret_cast<char *>(dst), reinterpret_cast<const char *>(src), n);
+}
+
+/** NOTE: There was also a function that assumed you could read any bytes within the same memory page as 'src'.
+  * It was unused, and it also required special handling for Valgrind and ASan.
+  */
+
+}
+
+#else /// Implementation for other platforms.
+
+namespace CH
+{
+
+inline void memcpySmallAllowReadWriteOverflow15(void * __restrict dst, const void * __restrict src, size_t n)
+{
+ memcpy(dst, src, n);
+}
+
+}
+
+#endif
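
A sketch of the intended pairing with padded containers (illustrative only): PaddedPODArray reserves 15 bytes of right padding, which is exactly the over-write this copy is allowed to perform; the caller must also guarantee that 15 bytes past the end of the source remain readable.

    #include <Common/PODArray.h>
    #include <Common/memcpySmall.h>
    #include <cstddef>
    #include <cstdint>

    // 'src' must be readable for n + 15 bytes (e.g. it comes from another padded buffer).
    void copy_small_sketch(const char * src, size_t n)
    {
        CH::PaddedPODArray<uint8_t> dst;
        dst.resize(n);                   // contents are overwritten below, so no zero-fill is needed
        CH::memcpySmallAllowReadWriteOverflow15(dst.data(), src, n);
    }
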
diff --git a/ydb/library/arrow_clickhouse/DataStreams/AggregatingBlockInputStream.cpp b/ydb/library/arrow_clickhouse/DataStreams/AggregatingBlockInputStream.cpp
new file mode 100644
index 00000000000..b23b70eeb8e
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/DataStreams/AggregatingBlockInputStream.cpp
@@ -0,0 +1,127 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#include <DataStreams/AggregatingBlockInputStream.h>
+#include <DataStreams/OneBlockInputStream.h>
+
+namespace CH
+{
+
+/** Combines aggregation states together, turns them into blocks, and outputs streams.
+ * If the aggregation states are two-level, then it produces blocks strictly in order of 'bucket_num'.
+ * (This is important for distributed processing.)
+ * In doing so, it can handle different buckets in parallel, using up to `threads` threads.
+ */
+class MergingAndConvertingBlockInputStream : public IBlockInputStream
+{
+public:
+ /** The input is a set of non-empty sets of partially aggregated data,
+ * which are all either single-level, or are two-level.
+ */
+ MergingAndConvertingBlockInputStream(const Aggregator & aggregator_, ManyAggregatedDataVariants & data_, bool final_)
+ : aggregator(aggregator_), data(data_), final(final_), threads(1)
+ {
+        /// We need at least one arena in the first data item per thread.
+ if (!data.empty() && threads > data[0]->aggregates_pools.size())
+ {
+ Arenas & first_pool = data[0]->aggregates_pools;
+ for (size_t j = first_pool.size(); j < threads; j++)
+ first_pool.emplace_back(std::make_shared<Arena>());
+ }
+ }
+
+ String getName() const override { return "MergingAndConverting"; }
+
+ Header getHeader() const override { return aggregator.getHeader(final); }
+
+protected:
+ Block readImpl() override
+ {
+ if (data.empty())
+ return {};
+
+ if (current_bucket_num >= NUM_BUCKETS)
+ return {};
+
+ AggregatedDataVariantsPtr & first = data[0];
+
+ if (current_bucket_num == -1)
+ {
+ ++current_bucket_num;
+
+ if (first->type == AggregatedDataVariants::Type::without_key || aggregator.params.overflow_row)
+ {
+ aggregator.mergeWithoutKeyDataImpl(data);
+ return aggregator.prepareBlockAndFillWithoutKey(
+ *first, final, first->type != AggregatedDataVariants::Type::without_key);
+ }
+ }
+
+ {
+ if (current_bucket_num > 0)
+ return {};
+
+ if (first->type == AggregatedDataVariants::Type::without_key)
+ return {};
+
+ ++current_bucket_num;
+
+ #define M(NAME) \
+ else if (first->type == AggregatedDataVariants::Type::NAME) \
+ aggregator.mergeSingleLevelDataImpl<decltype(first->NAME)::element_type>(data);
+ if (false) {} // NOLINT
+ APPLY_FOR_AGGREGATED_VARIANTS(M)
+ #undef M
+ else
+ throw Exception("Unknown aggregated data variant.");
+
+ return aggregator.prepareBlockAndFillSingleLevel(*first, final);
+ }
+ }
+
+private:
+ const Aggregator & aggregator;
+ ManyAggregatedDataVariants data;
+ bool final;
+ size_t threads;
+
+ Int32 current_bucket_num = -1;
+ static constexpr Int32 NUM_BUCKETS = 256;
+};
+
+static std::unique_ptr<IBlockInputStream> mergeAndConvertToBlocks(Aggregator & aggregator,
+ ManyAggregatedDataVariants & data_variants,
+ bool final)
+{
+ ManyAggregatedDataVariants non_empty_data = aggregator.prepareVariantsToMerge(data_variants);
+ if (non_empty_data.empty())
+ return std::make_unique<OneBlockInputStream>(blockFromHeader(aggregator.getHeader(final)));
+ return std::make_unique<MergingAndConvertingBlockInputStream>(aggregator, non_empty_data, final);
+}
+
+Header AggregatingBlockInputStream::getHeader() const
+{
+ return aggregator.getHeader(final);
+}
+
+Block AggregatingBlockInputStream::readImpl()
+{
+ if (!executed)
+ {
+ executed = true;
+ AggregatedDataVariantsPtr data_variants = std::make_shared<AggregatedDataVariants>();
+
+ aggregator.execute(children.back(), *data_variants);
+
+ ManyAggregatedDataVariants many_data { data_variants };
+ impl = mergeAndConvertToBlocks(aggregator, many_data, final);
+ }
+
+ if (isCancelledOrThrowIfKilled() || !impl)
+ return {};
+
+ return impl->read();
+}
+
+}
diff --git a/ydb/library/arrow_clickhouse/DataStreams/AggregatingBlockInputStream.h b/ydb/library/arrow_clickhouse/DataStreams/AggregatingBlockInputStream.h
new file mode 100644
index 00000000000..e127c0a7b5f
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/DataStreams/AggregatingBlockInputStream.h
@@ -0,0 +1,49 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#pragma once
+#include "arrow_clickhouse_types.h"
+
+#include "Aggregator.h"
+#include <DataStreams/IBlockStream_fwd.h>
+#include <DataStreams/IBlockInputStream.h>
+
+namespace CH
+{
+
+/** Aggregates the stream of blocks using the specified key columns and aggregate functions.
+ * Columns with aggregate functions are added to the end of the block.
+ * If final = false, the aggregate functions are not finalized, that is, they are not replaced by their values but contain an intermediate state of the calculations.
+ * This is necessary so that aggregation can continue (for example, by combining streams of partially aggregated data).
+ */
+class AggregatingBlockInputStream : public IBlockInputStream
+{
+public:
+    /** Keys are taken from the GROUP BY part of the query.
+ * Aggregate functions are searched everywhere in the expression.
+ * Columns corresponding to keys and arguments of aggregate functions must already be computed.
+ */
+ AggregatingBlockInputStream(const BlockInputStreamPtr & input, const Aggregator::Params & params_, bool final_)
+ : params(params_), aggregator(params), final(final_)
+ {
+ children.push_back(input);
+ }
+
+ String getName() const override { return "Aggregating"; }
+ Header getHeader() const override;
+
+protected:
+ Block readImpl() override;
+
+ Aggregator::Params params;
+ Aggregator aggregator;
+ bool final;
+
+ bool executed = false;
+
+ /** From here we will get the completed blocks after the aggregation. */
+ std::unique_ptr<IBlockInputStream> impl;
+};
+
+}
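
For orientation, a minimal driving sketch (illustrative only). Aggregator::Params construction is outside this file, so MakeParams() below is a hypothetical placeholder for whatever the caller builds from the key column indexes and AggregateDescriptions.

    #include <DataStreams/AggregatingBlockInputStream.h>
    #include <DataStreams/OneBlockInputStream.h>

    // Hypothetical helper, assumed to be defined elsewhere.
    CH::Aggregator::Params MakeParams();

    void aggregate_one_batch_sketch(CH::Block batch)
    {
        CH::BlockInputStreamPtr source = std::make_shared<CH::OneBlockInputStream>(batch);
        auto aggregated = std::make_shared<CH::AggregatingBlockInputStream>(source, MakeParams(), /*final_=*/true);

        aggregated->readPrefix();
        while (CH::Block block = aggregated->read())   // an empty Block (null RecordBatch) marks the end
        {
            // 'block' is an arrow::RecordBatch holding the key columns followed by the aggregate result columns.
        }
        aggregated->readSuffix();
    }
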
diff --git a/ydb/library/arrow_clickhouse/DataStreams/CMakeLists.txt b/ydb/library/arrow_clickhouse/DataStreams/CMakeLists.txt
new file mode 100644
index 00000000000..a150b18b741
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/DataStreams/CMakeLists.txt
@@ -0,0 +1,24 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(library-arrow_clickhouse-DataStreams)
+target_include_directories(library-arrow_clickhouse-DataStreams PRIVATE
+ ${CMAKE_SOURCE_DIR}/ydb/library/yql/udfs/common/clickhouse/client/base
+ ${CMAKE_SOURCE_DIR}/ydb/library/arrow_clickhouse
+)
+target_link_libraries(library-arrow_clickhouse-DataStreams PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+ libs-apache-arrow
+)
+target_sources(library-arrow_clickhouse-DataStreams PRIVATE
+ ${CMAKE_SOURCE_DIR}/ydb/library/arrow_clickhouse/DataStreams/AggregatingBlockInputStream.cpp
+ ${CMAKE_SOURCE_DIR}/ydb/library/arrow_clickhouse/DataStreams/IBlockInputStream.cpp
+ ${CMAKE_SOURCE_DIR}/ydb/library/arrow_clickhouse/DataStreams/MergingAggregatedBlockInputStream.cpp
+)
diff --git a/ydb/library/arrow_clickhouse/DataStreams/IBlockInputStream.cpp b/ydb/library/arrow_clickhouse/DataStreams/IBlockInputStream.cpp
new file mode 100644
index 00000000000..619255403dc
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/DataStreams/IBlockInputStream.cpp
@@ -0,0 +1,90 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#include <DataStreams/IBlockInputStream.h>
+
+namespace CH
+{
+
+
+/// It's safe to access children without mutex as long as these methods are called before first call to `read()` or `readPrefix()`.
+
+
+Block IBlockInputStream::read()
+{
+ Block res;
+ if (isCancelledOrThrowIfKilled())
+ return res;
+
+ res = readImpl();
+ if (!res)
+ {
+ /** If the stream is over, then we will ask all children to abort the execution.
+ * This makes sense when running a query with LIMIT
+ * - there is a situation when all the necessary data has already been read,
+         * but the child sources are still working,
+         * possibly in separate threads or even remotely.
+ */
+ cancel(false);
+ }
+
+ return res;
+}
+
+
+void IBlockInputStream::readPrefix()
+{
+ readPrefixImpl();
+
+ forEachChild([&] (IBlockInputStream & child)
+ {
+ child.readPrefix();
+ return false;
+ });
+}
+
+
+void IBlockInputStream::readSuffix()
+{
+ forEachChild([&] (IBlockInputStream & child)
+ {
+ child.readSuffix();
+ return false;
+ });
+
+ readSuffixImpl();
+}
+
+
+void IBlockInputStream::cancel(bool kill)
+{
+#if 0
+ if (kill)
+ is_killed = true;
+#endif
+ bool old_val = false;
+ if (!is_cancelled.compare_exchange_strong(old_val, true, std::memory_order_seq_cst, std::memory_order_relaxed))
+ return;
+
+ forEachChild([&] (IBlockInputStream & child)
+ {
+ child.cancel(kill);
+ return false;
+ });
+}
+
+
+bool IBlockInputStream::isCancelled() const
+{
+ return is_cancelled;
+}
+
+bool IBlockInputStream::isCancelledOrThrowIfKilled() const
+{
+ if (!is_cancelled)
+ return false;
+ return true;
+}
+
+}
diff --git a/ydb/library/arrow_clickhouse/DataStreams/IBlockInputStream.h b/ydb/library/arrow_clickhouse/DataStreams/IBlockInputStream.h
new file mode 100644
index 00000000000..7c11acb094e
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/DataStreams/IBlockInputStream.h
@@ -0,0 +1,118 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#pragma once
+#include "arrow_clickhouse_types.h"
+
+#include <DataStreams/IBlockStream_fwd.h>
+
+#include <atomic>
+#include <shared_mutex>
+
+
+namespace CH
+{
+
+/** The stream interface for reading data by blocks from the database.
+ * Relational operations are also supposed to be implemented on top of this interface.
+ * Keeps track of how the source of the blocks behaves.
+ * Lets you get information for profiling: rows per second, blocks per second, megabytes per second, etc.
+ * Allows you to stop reading data (in nested sources).
+ */
+class IBlockInputStream
+{
+public:
+ IBlockInputStream() {}
+ virtual ~IBlockInputStream() {}
+
+ IBlockInputStream(const IBlockInputStream &) = delete;
+ IBlockInputStream & operator=(const IBlockInputStream &) = delete;
+
+ /// To output the data stream transformation tree (query execution plan).
+ virtual String getName() const = 0;
+
+ /** Get data structure of the stream in a form of "header" block (it is also called "sample block").
+ * Header block contains column names, data types, columns of size 0. Constant columns must have corresponding values.
+ * It is guaranteed that method "read" returns blocks of exactly that structure.
+ */
+ virtual Header getHeader() const = 0;
+
+ /** Read next block.
+ * If there are no more blocks, return an empty block (for which operator `bool` returns false).
+ * NOTE: Only one thread can read from one instance of IBlockInputStream simultaneously.
+ * This also applies for readPrefix, readSuffix.
+ */
+ Block read();
+
+ /** Read something before starting all data or after the end of all data.
+ * In the `readSuffix` function, you can implement a finalization that can lead to an exception.
+ * readPrefix() must be called before the first call to read().
+ * readSuffix() should be called after read() returns an empty block, or after a call to cancel(), but not during read() execution.
+ */
+
+ /** The default implementation calls readPrefixImpl() on itself, and then readPrefix() recursively for all children.
+ * There are cases when you do not want `readPrefix` of children to be called synchronously, in this function,
+ * but you want them to be called, for example, in separate threads (for parallel initialization of children).
+ * Then overload `readPrefix` function.
+ */
+ virtual void readPrefix();
+
+ /** The default implementation calls recursively readSuffix() on all children, and then readSuffixImpl() on itself.
+ * If this stream calls read() in children in a separate thread, this behavior is usually incorrect:
+ * readSuffix() of the child can not be called at the moment when the same child's read() is executed in another thread.
+ * In this case, you need to override this method so that readSuffix() in children is called, for example, after connecting streams.
+ */
+ virtual void readSuffix();
+
+ /** Ask to abort the receipt of data as soon as possible.
+ * By default - just sets the flag is_cancelled and asks that all children be interrupted.
+ * This function can be called several times, including simultaneously from different threads.
+     * It has two modes:
+     * with kill = false, only is_cancelled is set - streams stop silently, returning some processed data.
+     * with kill = true, is_killed is also set - queries stop with an exception.
+ */
+ virtual void cancel(bool kill = false);
+
+ bool isCancelled() const;
+ bool isCancelledOrThrowIfKilled() const;
+
+protected:
+ BlockInputStreams children;
+ std::shared_mutex children_mutex;
+
+ std::atomic<bool> is_cancelled{false};
+
+ void addChild(const BlockInputStreamPtr & child)
+ {
+ std::unique_lock lock(children_mutex);
+ children.push_back(child);
+ }
+
+private:
+ /// Derived classes must implement this function.
+ virtual Block readImpl() = 0;
+
+ /// Here you can do a preliminary initialization.
+ virtual void readPrefixImpl() {}
+
+ /// Here you need to do a finalization, which can lead to an exception.
+ virtual void readSuffixImpl() {}
+
+ template <typename F>
+ void forEachChild(F && f)
+ {
+ /// NOTE: Acquire a read lock, therefore f() should be thread safe
+ std::shared_lock lock(children_mutex);
+
+ // Reduce lock scope and avoid recursive locking since that is undefined for shared_mutex.
+ const auto children_copy = children;
+ lock.unlock();
+
+ for (auto & child : children_copy)
+ if (f(*child))
+ return;
+ }
+};
+
+}
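
To illustrate the contract above, a minimal source stream (a sketch, not part of the library): readImpl() returns blocks until the data is exhausted and then returns an empty Block, which read() interprets as end-of-stream and propagates as cancellation to the children.

    #include <DataStreams/IBlockInputStream.h>

    namespace CH
    {

    /// Sketch of a trivial source: emits the blocks of a list once, then signals end-of-stream.
    class BlocksListInputStream : public IBlockInputStream
    {
    public:
        BlocksListInputStream(Header header_, BlocksList blocks_)
            : header(std::move(header_)), blocks(std::move(blocks_)), it(blocks.begin()) {}

        String getName() const override { return "BlocksList"; }
        Header getHeader() const override { return header; }

    protected:
        Block readImpl() override
        {
            if (it == blocks.end())
                return {};             /// empty Block: no more data
            return *it++;
        }

    private:
        Header header;
        BlocksList blocks;
        BlocksList::iterator it;
    };

    }
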
diff --git a/ydb/library/arrow_clickhouse/DataStreams/IBlockStream_fwd.h b/ydb/library/arrow_clickhouse/DataStreams/IBlockStream_fwd.h
new file mode 100644
index 00000000000..1bf79768c04
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/DataStreams/IBlockStream_fwd.h
@@ -0,0 +1,21 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#pragma once
+
+#include <memory>
+#include <vector>
+
+namespace CH
+{
+
+class IBlockInputStream;
+class IBlockOutputStream;
+
+using BlockInputStreamPtr = std::shared_ptr<IBlockInputStream>;
+using BlockInputStreams = std::vector<BlockInputStreamPtr>;
+using BlockOutputStreamPtr = std::shared_ptr<IBlockOutputStream>;
+using BlockOutputStreams = std::vector<BlockOutputStreamPtr>;
+
+}
diff --git a/ydb/library/arrow_clickhouse/DataStreams/MergingAggregatedBlockInputStream.cpp b/ydb/library/arrow_clickhouse/DataStreams/MergingAggregatedBlockInputStream.cpp
new file mode 100644
index 00000000000..2bd14e1371a
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/DataStreams/MergingAggregatedBlockInputStream.cpp
@@ -0,0 +1,43 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#include <DataStreams/MergingAggregatedBlockInputStream.h>
+
+
+namespace CH
+{
+
+Header MergingAggregatedBlockInputStream::getHeader() const
+{
+ return aggregator.getHeader(final);
+}
+
+
+Block MergingAggregatedBlockInputStream::readImpl()
+{
+ if (!executed)
+ {
+ executed = true;
+ AggregatedDataVariants data_variants;
+#if 0
+ Aggregator::CancellationHook hook = [&]() { return this->isCancelled(); };
+ aggregator.setCancellationHook(hook);
+#endif
+ aggregator.mergeStream(children.back(), data_variants);
+ blocks = aggregator.convertToBlocks(data_variants, final);
+ it = blocks.begin();
+ }
+
+ Block res;
+ if (isCancelledOrThrowIfKilled() || it == blocks.end())
+ return res;
+
+ res = std::move(*it);
+ ++it;
+
+ return res;
+}
+
+
+}
diff --git a/ydb/library/arrow_clickhouse/DataStreams/MergingAggregatedBlockInputStream.h b/ydb/library/arrow_clickhouse/DataStreams/MergingAggregatedBlockInputStream.h
new file mode 100644
index 00000000000..a4fa0274ac2
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/DataStreams/MergingAggregatedBlockInputStream.h
@@ -0,0 +1,42 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#pragma once
+#include "arrow_clickhouse_types.h"
+#include "Aggregator.h"
+#include <DataStreams/IBlockInputStream.h>
+
+
+namespace CH
+{
+
+/** A stream of pre-aggregated blocks in which each block is already aggregated.
+ * Aggregate functions in blocks should not be finalized so that their states can be merged.
+ */
+class MergingAggregatedBlockInputStream : public IBlockInputStream
+{
+public:
+ MergingAggregatedBlockInputStream(const BlockInputStreamPtr & input, const Aggregator::Params & params, bool final_)
+ : aggregator(params), final(final_)
+ {
+ children.push_back(input);
+ }
+
+ String getName() const override { return "MergingAggregated"; }
+
+ Header getHeader() const override;
+
+protected:
+ Block readImpl() override;
+
+private:
+ Aggregator aggregator;
+ bool final;
+
+ bool executed = false;
+ BlocksList blocks;
+ BlocksList::iterator it;
+};
+
+}
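
A sketch of the intended two-stage use (illustrative only; MakeParams() is again a hypothetical placeholder, and both stages must be built from the same keys and aggregate descriptions): the first stage aggregates with final = false, so its output blocks carry unfinalized aggregate states, and this stream merges those states and finalizes the result.

    #include <DataStreams/AggregatingBlockInputStream.h>
    #include <DataStreams/MergingAggregatedBlockInputStream.h>

    // Hypothetical helper, assumed to be defined elsewhere.
    CH::Aggregator::Params MakeParams();

    void two_stage_sketch(CH::BlockInputStreamPtr source)
    {
        // Stage 1: partial aggregation, keeping intermediate aggregate states.
        CH::BlockInputStreamPtr partial =
            std::make_shared<CH::AggregatingBlockInputStream>(source, MakeParams(), /*final_=*/false);

        // Stage 2: merge the partial states and finalize the values.
        auto merged =
            std::make_shared<CH::MergingAggregatedBlockInputStream>(partial, MakeParams(), /*final_=*/true);

        merged->readPrefix();
        while (CH::Block block = merged->read())
        {
            // finalized aggregation result as an arrow::RecordBatch
        }
        merged->readSuffix();
    }
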
diff --git a/ydb/library/arrow_clickhouse/DataStreams/OneBlockInputStream.h b/ydb/library/arrow_clickhouse/DataStreams/OneBlockInputStream.h
new file mode 100644
index 00000000000..6735022c46e
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/DataStreams/OneBlockInputStream.h
@@ -0,0 +1,49 @@
+// The code in this file is based on original ClickHouse source code
+// which is licensed under Apache license v2.0
+// See: https://github.com/ClickHouse/ClickHouse/
+
+#pragma once
+#include "arrow_clickhouse_types.h"
+#include <DataStreams/IBlockInputStream.h>
+
+namespace CH
+{
+
+/** A stream of blocks from which you can read one block.
+ * Also see BlocksListBlockInputStream.
+ */
+class OneBlockInputStream : public IBlockInputStream
+{
+public:
+ explicit OneBlockInputStream(Block block_)
+ : block(std::move(block_))
+ {
+ if (!block->Validate().ok())
+ throw Exception("Bad batch in OneBlockInputStream");
+ }
+
+ String getName() const override { return "One"; }
+
+ Header getHeader() const override
+ {
+ if (!block)
+ return {};
+ return block->schema();
+ }
+
+protected:
+ Block readImpl() override
+ {
+ if (has_been_read)
+ return {};
+
+ has_been_read = true;
+ return block;
+ }
+
+private:
+ Block block{};
+ bool has_been_read = false;
+};
+
+}
diff --git a/ydb/library/arrow_clickhouse/README.md b/ydb/library/arrow_clickhouse/README.md
new file mode 100644
index 00000000000..89149bf4f99
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/README.md
@@ -0,0 +1,16 @@
+ClickHouse aggregate functions over Apache Arrow primitives
+--------
+
+This library is modified ClickHouse (https://github.com/ClickHouse/ClickHouse/) code that uses Apache Arrow
+(https://arrow.apache.org/) primitives instead of the ClickHouse-native ones: arrow::RecordBatch
+instead of DB::Block, arrow::Array and arrow::Builder instead of DB::IColumn, and so on.
+The type redefinitions are in the arrow_clickhouse_types.h header.
+
+The library uses DataStreams primitives, which were replaced by processors in ClickHouse 20.3. It is not possible to
+extract the processors from the ClickHouse code base: it is too monolithic and depends on a specific multithreading model.
+
+The core purpose of the library is the ability to use ClickHouse's GROUP BY code (Aggregator.cpp) and aggregate functions
+(the AggregateFunctions directory) with minimal modifications over data presented in Apache Arrow formats.
+
+The original ClickHouse supports two-level aggregation and several optimizations (LowCardinality, sparse data, LLVM).
+It also allows adding function combinators to aggregate functions. Such optimizations are not implemented here yet.
diff --git a/ydb/library/arrow_clickhouse/arrow_clickhouse_types.h b/ydb/library/arrow_clickhouse/arrow_clickhouse_types.h
new file mode 100644
index 00000000000..698503007eb
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/arrow_clickhouse_types.h
@@ -0,0 +1,146 @@
+#pragma once
+#include <cstdint>
+#include <string>
+#include <vector>
+#include <list>
+#include <map>
+#include <stdexcept>
+
+#include <contrib/libs/apache/arrow/cpp/src/arrow/api.h>
+#include <contrib/libs/apache/arrow/cpp/src/arrow/compute/api.h>
+
+#include <common/StringRef.h>
+#include <common/extended_types.h>
+#include <common/defines.h>
+
+#include <Common/PODArray_fwd.h>
+
+namespace CH
+{
+
+using NDB::StringRef;
+using NDB::StringRefHash;
+using NDB::StringRefs;
+
+/// What to do if the limit is exceeded.
+enum class OverflowMode
+{
+ THROW = 0, /// Throw exception.
+ BREAK = 1, /// Abort query execution, return what is.
+
+ /** Only for GROUP BY: do not add new rows to the set,
+ * but continue to aggregate for keys that are already in the set.
+ */
+ ANY = 2,
+};
+
+using Exception = std::runtime_error;
+using ColumnNumbers = std::vector<uint32_t>; // it's vector<size_t> in CH
+using Names = std::vector<std::string>;
+
+using Block = std::shared_ptr<arrow::RecordBatch>;
+using BlocksList = std::list<Block>;
+using Array = arrow::ScalarVector;
+using ColumnWithTypeAndName = arrow::Field;
+using ColumnsWithTypeAndName = arrow::FieldVector;
+using Header = std::shared_ptr<arrow::Schema>;
+using Sizes = std::vector<size_t>;
+
+// TODO: replace with arrow::memory_pool
+class Arena;
+using ArenaPtr = std::shared_ptr<Arena>;
+using ConstArenaPtr = std::shared_ptr<const Arena>;
+using ConstArenas = std::vector<ConstArenaPtr>;
+
+using IColumn = arrow::Array;
+using ColumnPtr = std::shared_ptr<IColumn>;
+using Columns = std::vector<ColumnPtr>;
+using ColumnRawPtrs = std::vector<const IColumn *>;
+
+using MutableColumn = arrow::ArrayBuilder;
+using MutableColumnPtr = std::shared_ptr<arrow::ArrayBuilder>;
+using MutableColumns = std::vector<MutableColumnPtr>;
+
+struct XColumn {
+ using Offset = UInt64;
+ using Offsets = PaddedPODArray<Offset>;
+
+ using ColumnIndex = UInt64;
+ using Selector = PaddedPODArray<ColumnIndex>;
+
+ using Filter = PaddedPODArray<UInt8>;
+};
+
+using ColumnInt8 = arrow::NumericArray<arrow::Int8Type>;
+using ColumnInt16 = arrow::NumericArray<arrow::Int16Type>;
+using ColumnInt32 = arrow::NumericArray<arrow::Int32Type>;
+using ColumnInt64 = arrow::NumericArray<arrow::Int64Type>;
+
+using ColumnUInt8 = arrow::NumericArray<arrow::UInt8Type>;
+using ColumnUInt16 = arrow::NumericArray<arrow::UInt16Type>;
+using ColumnUInt32 = arrow::NumericArray<arrow::UInt32Type>;
+using ColumnUInt64 = arrow::NumericArray<arrow::UInt64Type>;
+
+using ColumnFloat32 = arrow::NumericArray<arrow::FloatType>;
+using ColumnFloat64 = arrow::NumericArray<arrow::DoubleType>;
+
+using ColumnBinary = arrow::BinaryArray;
+using ColumnString = arrow::StringArray;
+using ColumnFixedString = arrow::FixedSizeBinaryArray;
+
+using MutableColumnInt8 = arrow::Int8Builder;
+using MutableColumnInt16 = arrow::Int16Builder;
+using MutableColumnInt32 = arrow::Int32Builder;
+using MutableColumnInt64 = arrow::Int64Builder;
+
+using MutableColumnUInt8 = arrow::UInt8Builder;
+using MutableColumnUInt16 = arrow::UInt16Builder;
+using MutableColumnUInt32 = arrow::UInt32Builder;
+using MutableColumnUInt64 = arrow::UInt64Builder;
+
+using MutableColumnFloat32 = arrow::FloatBuilder;
+using MutableColumnFloat64 = arrow::DoubleBuilder;
+
+using MutableColumnBinary = arrow::BinaryBuilder;
+using MutableColumnString = arrow::StringBuilder;
+using MutableColumnFixedString = arrow::FixedSizeBinaryBuilder;
+
+using IDataType = arrow::DataType;
+using DataTypePtr = std::shared_ptr<IDataType>;
+using DataTypes = arrow::DataTypeVector;
+
+using DataTypeInt8 = arrow::Int8Type;
+using DataTypeInt16 = arrow::Int16Type;
+using DataTypeInt32 = arrow::Int32Type;
+using DataTypeInt64 = arrow::Int64Type;
+
+using DataTypeUInt8 = arrow::UInt8Type;
+using DataTypeUInt16 = arrow::UInt16Type;
+using DataTypeUInt32 = arrow::UInt32Type;
+using DataTypeUInt64 = arrow::UInt64Type;
+
+using DataTypeFixedString = arrow::FixedSizeBinaryType;
+
+inline Columns columnsFromHeader(const Header& schema, size_t num_rows = 0)
+{
+ std::vector<std::shared_ptr<arrow::Array>> columns;
+ columns.reserve(schema->num_fields());
+
+ for (auto& field : schema->fields()) {
+ columns.emplace_back(*arrow::MakeArrayOfNull(field->type(), num_rows));
+ }
+ return columns;
+}
+
+inline Block blockFromHeader(const Header& schema, size_t num_rows = 0)
+{
+ return arrow::RecordBatch::Make(schema, num_rows, columnsFromHeader(schema, num_rows));
+}
+
+template <typename To, typename From>
+inline To assert_cast(From && from)
+{
+ return static_cast<To>(from);
+}
+
+}
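
A small sketch of the helpers defined above (illustrative only): a Header is just an arrow::Schema, and blockFromHeader() materializes an empty Block with one all-null array per field.

    #include "arrow_clickhouse_types.h"

    void header_helpers_sketch()
    {
        CH::Header header = arrow::schema({
            arrow::field("key", arrow::uint32()),
            arrow::field("value", arrow::int64()),
        });

        CH::Block empty = CH::blockFromHeader(header);   // zero rows by default
        // empty->schema() round-trips to the same Header; empty->num_rows() == 0.
    }
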
diff --git a/ydb/library/arrow_clickhouse/ut/CMakeLists.darwin.txt b/ydb/library/arrow_clickhouse/ut/CMakeLists.darwin.txt
new file mode 100644
index 00000000000..1a9a9c7126e
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/ut/CMakeLists.darwin.txt
@@ -0,0 +1,41 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_executable(ydb-library-arrow_clickhouse-ut)
+target_include_directories(ydb-library-arrow_clickhouse-ut PRIVATE
+ ${CMAKE_SOURCE_DIR}/ydb/library/arrow_clickhouse
+)
+target_link_libraries(ydb-library-arrow_clickhouse-ut PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+ library-cpp-cpuid_check
+ cpp-testing-unittest_main
+ ydb-library-arrow_clickhouse
+)
+target_link_options(ydb-library-arrow_clickhouse-ut PRIVATE
+ -Wl,-no_deduplicate
+ -Wl,-sdk_version,10.15
+ -fPIC
+ -fPIC
+)
+target_sources(ydb-library-arrow_clickhouse-ut PRIVATE
+ ${CMAKE_SOURCE_DIR}/ydb/library/arrow_clickhouse/ut_aggregator.cpp
+)
+add_test(
+ NAME
+ ydb-library-arrow_clickhouse-ut
+ COMMAND
+ ydb-library-arrow_clickhouse-ut
+ --print-before-suite
+ --print-before-test
+ --fork-tests
+ --print-times
+ --show-fails
+)
+vcs_info(ydb-library-arrow_clickhouse-ut)
diff --git a/ydb/library/arrow_clickhouse/ut/CMakeLists.linux.txt b/ydb/library/arrow_clickhouse/ut/CMakeLists.linux.txt
new file mode 100644
index 00000000000..37e06a08220
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/ut/CMakeLists.linux.txt
@@ -0,0 +1,47 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_executable(ydb-library-arrow_clickhouse-ut)
+target_include_directories(ydb-library-arrow_clickhouse-ut PRIVATE
+ ${CMAKE_SOURCE_DIR}/ydb/library/arrow_clickhouse
+)
+target_link_libraries(ydb-library-arrow_clickhouse-ut PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+ cpp-malloc-tcmalloc
+ libs-tcmalloc-no_percpu_cache
+ library-cpp-cpuid_check
+ cpp-testing-unittest_main
+ ydb-library-arrow_clickhouse
+)
+target_link_options(ydb-library-arrow_clickhouse-ut PRIVATE
+ -ldl
+ -lrt
+ -Wl,--no-as-needed
+ -fPIC
+ -fPIC
+ -lpthread
+ -lrt
+ -ldl
+)
+target_sources(ydb-library-arrow_clickhouse-ut PRIVATE
+ ${CMAKE_SOURCE_DIR}/ydb/library/arrow_clickhouse/ut_aggregator.cpp
+)
+add_test(
+ NAME
+ ydb-library-arrow_clickhouse-ut
+ COMMAND
+ ydb-library-arrow_clickhouse-ut
+ --print-before-suite
+ --print-before-test
+ --fork-tests
+ --print-times
+ --show-fails
+)
+vcs_info(ydb-library-arrow_clickhouse-ut)
diff --git a/ydb/library/arrow_clickhouse/ut/CMakeLists.txt b/ydb/library/arrow_clickhouse/ut/CMakeLists.txt
new file mode 100644
index 00000000000..fc7b1ee73ce
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/ut/CMakeLists.txt
@@ -0,0 +1,13 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+if (APPLE)
+ include(CMakeLists.darwin.txt)
+elseif (UNIX AND NOT APPLE)
+ include(CMakeLists.linux.txt)
+endif()
diff --git a/ydb/library/arrow_clickhouse/ut_aggregator.cpp b/ydb/library/arrow_clickhouse/ut_aggregator.cpp
new file mode 100644
index 00000000000..ca25f3aff3b
--- /dev/null
+++ b/ydb/library/arrow_clickhouse/ut_aggregator.cpp
@@ -0,0 +1,365 @@
+#include <array>
+#include <memory>
+#include <vector>
+#include <iostream>
+#include <iomanip>
+
+#include <library/cpp/testing/unittest/registar.h>
+
+#include "Aggregator.h"
+#include "DataStreams/OneBlockInputStream.h"
+#include "DataStreams/AggregatingBlockInputStream.h"
+#include "DataStreams/MergingAggregatedBlockInputStream.h"
+#include "AggregateFunctions/AggregateFunctionCount.h"
+#include "AggregateFunctions/AggregateFunctionMinMaxAny.h"
+#include "AggregateFunctions/AggregateFunctionSum.h"
+#include "AggregateFunctions/AggregateFunctionAvg.h"
+
+namespace CH {
+
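+// Register the wrapped ClickHouse aggregates in Arrow's compute function registry.
+// The returned statuses are deliberately discarded, so calling this from more than
+// one test case does not abort the run.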
+void RegisterAggregates(arrow::compute::FunctionRegistry * registry = nullptr) {
+ if (!registry)
+ registry = arrow::compute::GetFunctionRegistry();
+
+ registry->AddFunction(std::make_shared<CH::WrappedCount>("ch.count")).ok();
+ registry->AddFunction(std::make_shared<CH::WrappedMin>("ch.min")).ok();
+ registry->AddFunction(std::make_shared<CH::WrappedMax>("ch.max")).ok();
+ registry->AddFunction(std::make_shared<CH::WrappedAny>("ch.any")).ok();
+ registry->AddFunction(std::make_shared<CH::WrappedSum>("ch.sum")).ok();
+ registry->AddFunction(std::make_shared<CH::WrappedAvg>("ch.avg")).ok();
+}
+
+// {i16, ui32, s1, s2}
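+// Column value patterns: i16 = i % 9 (9 distinct values), ui32 = i % 7 (7 distinct),
+// s1 = strings[i % 5] (5 distinct), s2 = strings[i % 3] (3 distinct).
+// The group counts asserted in the tests below are products of these cardinalities.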
+Block makeTestBlock(size_t num_rows) {
+ std::vector<std::string> strings = {"abc", "def", "abcd", "defg", "ac"};
+
+ arrow::FieldVector fields;
+ arrow::ArrayVector columns;
+
+ {
+ auto field = std::make_shared<arrow::Field>("i16", arrow::int16());
+ arrow::Int16Builder col;
+ col.Reserve(num_rows).ok();
+
+ for (size_t i = 0; i < num_rows; ++i)
+ col.Append(i % 9).ok();
+
+ fields.emplace_back(std::move(field));
+ columns.emplace_back(std::move(*col.Finish()));
+ }
+
+ {
+ auto field = std::make_shared<arrow::Field>("ui32", arrow::uint32());
+ arrow::UInt32Builder col;
+ col.Reserve(num_rows).ok();
+
+ for (size_t i = 0; i < num_rows; ++i)
+ col.Append(i % 7).ok();
+
+ fields.emplace_back(std::move(field));
+ columns.emplace_back(std::move(*col.Finish()));
+ }
+
+ {
+ auto field = std::make_shared<arrow::Field>("s1", arrow::binary());
+ arrow::BinaryBuilder col;
+ col.Reserve(num_rows).ok();
+
+ for (size_t i = 0; i < num_rows; ++i)
+ col.Append(strings[i % strings.size()]).ok();
+
+ fields.emplace_back(std::move(field));
+ columns.emplace_back(std::move(*col.Finish()));
+ }
+
+ {
+ auto field = std::make_shared<arrow::Field>("s2", arrow::binary());
+ arrow::BinaryBuilder col;
+ col.Reserve(num_rows).ok();
+
+ for (size_t i = 0; i < num_rows; ++i)
+ col.Append(strings[i % 3]).ok();
+
+ fields.emplace_back(std::move(field));
+ columns.emplace_back(std::move(*col.Finish()));
+ }
+
+ return arrow::RecordBatch::Make(std::make_shared<arrow::Schema>(fields), num_rows, columns);
+}
+
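+// The Make*Description helpers look up a registered wrapper by name and unwrap the
+// underlying ClickHouse aggregate via getHouseFunction() for the given argument types.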
+AggregateDescription MakeCountDescription(const std::string & column_name = "cnt")
+{
+ auto * registry = arrow::compute::GetFunctionRegistry();
+ auto func = registry->GetFunction("ch.count");
+ auto wrapped = std::static_pointer_cast<ArrowAggregateFunctionWrapper>(*func);
+
+ DataTypes empty_list_of_types;
+ return AggregateDescription {
+ .function = wrapped->getHouseFunction(empty_list_of_types),
+ .column_name = column_name
+ };
+}
+
+AggregateDescription MakeMinMaxAnyDescription(const std::string & agg_name, DataTypePtr data_type,
+ uint32_t column_id)
+{
+ auto * registry = arrow::compute::GetFunctionRegistry();
+ auto func = registry->GetFunction(agg_name);
+ auto wrapped = std::static_pointer_cast<ArrowAggregateFunctionWrapper>(*func);
+
+ DataTypes list_of_types = {data_type};
+ return AggregateDescription {
+ .function = wrapped->getHouseFunction(list_of_types),
+ .arguments = {column_id},
+ .column_name = "res_" + agg_name
+ };
+}
+
+AggregateDescription MakeSumDescription(DataTypePtr data_type, uint32_t column_id,
+ const std::string & column_name = "res_sum")
+{
+ auto * registry = arrow::compute::GetFunctionRegistry();
+ auto func = registry->GetFunction("ch.sum");
+ auto wrapped = std::static_pointer_cast<ArrowAggregateFunctionWrapper>(*func);
+
+ DataTypes list_of_types = {data_type};
+ return AggregateDescription {
+ .function = wrapped->getHouseFunction(list_of_types),
+ .arguments = {column_id},
+ .column_name = column_name
+ };
+}
+
+AggregateDescription MakeAvgDescription(DataTypePtr data_type, uint32_t column_id,
+ const std::string & column_name = "res_avg")
+{
+ auto * registry = arrow::compute::GetFunctionRegistry();
+ auto func = registry->GetFunction("ch.avg");
+ auto wrapped = std::static_pointer_cast<ArrowAggregateFunctionWrapper>(*func);
+
+ DataTypes list_of_types = {data_type};
+ return AggregateDescription {
+ .function = wrapped->getHouseFunction(list_of_types),
+ .arguments = {column_id},
+ .column_name = column_name
+ };
+}
+
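+// Build a two-phase aggregation pipeline: a partial AggregatingBlockInputStream over the
+// source, followed by a MergingAggregatedBlockInputStream that combines the partial states.
+// Key positions are remapped by name because the intermediate header may reorder columns.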
+BlockInputStreamPtr MakeAggregatingStream(const BlockInputStreamPtr & stream,
+ const ColumnNumbers & agg_keys,
+ const AggregateDescriptions & aggregate_descriptions)
+{
+ Header src_header = stream->getHeader();
+ Aggregator::Params agg_params(false, src_header, agg_keys, aggregate_descriptions, false);
+ BlockInputStreamPtr agg_stream = std::make_shared<AggregatingBlockInputStream>(stream, agg_params, false);
+
+ ColumnNumbers merge_keys;
+ {
+ Header agg_header = agg_stream->getHeader();
+ for (const auto & key : agg_keys)
+ merge_keys.push_back(agg_header->GetFieldIndex(src_header->field(key)->name()));
+ }
+
+ Aggregator::Params merge_params(true, agg_stream->getHeader(), merge_keys, aggregate_descriptions, false);
+ return std::make_shared<MergingAggregatedBlockInputStream>(agg_stream, merge_params, true);
+}
+
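+// Run Aggregator::execute() directly over a single block; success means no exception was thrown.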
+bool TestExecute(const Block & block, const ColumnNumbers & agg_keys)
+{
+ try
+ {
+ BlockInputStreamPtr stream = std::make_shared<OneBlockInputStream>(block);
+
+ AggregateDescription aggregate_description = MakeCountDescription();
+ Aggregator::Params params(false, stream->getHeader(), agg_keys, {aggregate_description}, false);
+ Aggregator aggregator(params);
+
+ AggregatedDataVariants aggregated_data_variants;
+
+ {
+ //Stopwatch stopwatch;
+ //stopwatch.start();
+
+ aggregator.execute(stream, aggregated_data_variants);
+
+ //stopwatch.stop();
+ //std::cout << std::fixed << std::setprecision(2)
+ // << "Elapsed " << stopwatch.elapsedSeconds() << " sec."
+ // << ", " << n / stopwatch.elapsedSeconds() << " rows/sec."
+ // << std::endl;
+ }
+ }
+ catch (const Exception & e)
+ {
+ std::cerr << e.what() << std::endl;
+ return false;
+ }
+
+ return true;
+}
+
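+// Run the full two-phase pipeline and return the total number of result rows,
+// i.e. the number of distinct key combinations.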
+size_t TestAggregate(const Block & block, const ColumnNumbers & agg_keys, const AggregateDescription & description)
+{
+ size_t rows = 0;
+
+ try
+ {
+ std::cerr << "aggregate by keys: ";
+        for (const auto & key : agg_keys) {
+            std::cerr << key << " ";
+        }
+ std::cerr << std::endl;
+
+ auto stream = MakeAggregatingStream(std::make_shared<OneBlockInputStream>(block), agg_keys, {description});
+
+        while (auto result = stream->read()) {
+            std::cerr << "result rows: " << result->num_rows() << std::endl;
+            rows += result->num_rows();
+ }
+ }
+ catch (const Exception & e)
+ {
+ std::cerr << e.what() << std::endl;
+ return 0;
+ }
+
+ return rows;
+}
+
+}
+
+
+Y_UNIT_TEST_SUITE(CH_Aggregator) {
+ Y_UNIT_TEST(ExecuteCount) {
+ CH::RegisterAggregates();
+
+ auto block = CH::makeTestBlock(1000);
+
+ UNIT_ASSERT(CH::TestExecute(block, {0, 1}));
+ UNIT_ASSERT(CH::TestExecute(block, {1, 0}));
+ UNIT_ASSERT(CH::TestExecute(block, {0, 2}));
+ UNIT_ASSERT(CH::TestExecute(block, {2, 0}));
+ UNIT_ASSERT(CH::TestExecute(block, {2, 3}));
+ UNIT_ASSERT(CH::TestExecute(block, {0, 1, 2, 3}));
+ }
+
+ Y_UNIT_TEST(AggregateCount) {
+ CH::RegisterAggregates();
+
+ auto block = CH::makeTestBlock(1000);
+
+ auto agg_count = CH::MakeCountDescription();
+
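+        // Expected group counts are products of the key cardinalities from makeTestBlock:
+        // 9 (i16) * 7 (ui32), 9 * 5 (s1), 5 * 3 (s2), and so on. Grouping by all four keys
+        // gives 9*7*5 because s2 (i % 3) is fully determined by i16 (i % 9).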
+ UNIT_ASSERT_VALUES_EQUAL(CH::TestAggregate(block, {0, 1}, agg_count), 9*7);
+ UNIT_ASSERT_VALUES_EQUAL(CH::TestAggregate(block, {1, 0}, agg_count), 7*9);
+ UNIT_ASSERT_VALUES_EQUAL(CH::TestAggregate(block, {0, 2}, agg_count), 9*5);
+ UNIT_ASSERT_VALUES_EQUAL(CH::TestAggregate(block, {2, 0}, agg_count), 5*9);
+ UNIT_ASSERT_VALUES_EQUAL(CH::TestAggregate(block, {2, 3}, agg_count), 5*3);
+ UNIT_ASSERT_VALUES_EQUAL(CH::TestAggregate(block, {0, 1, 2, 3}, agg_count), 9*7*5);
+ }
+
+ Y_UNIT_TEST(AggregateMin) {
+ CH::RegisterAggregates();
+
+ auto block = CH::makeTestBlock(1000);
+
+ for (int i = 0; i < block->num_columns(); ++i) {
+ auto type = block->column(i)->type();
+ auto agg_descr = CH::MakeMinMaxAnyDescription("ch.min", type, i);
+
+ UNIT_ASSERT(agg_descr.function);
+ UNIT_ASSERT_VALUES_EQUAL(agg_descr.arguments.size(), 1);
+
+ UNIT_ASSERT_VALUES_EQUAL(CH::TestAggregate(block, {0, 1}, agg_descr), 9*7);
+ UNIT_ASSERT_VALUES_EQUAL(CH::TestAggregate(block, {1, 0}, agg_descr), 7*9);
+ UNIT_ASSERT_VALUES_EQUAL(CH::TestAggregate(block, {0, 2}, agg_descr), 9*5);
+ UNIT_ASSERT_VALUES_EQUAL(CH::TestAggregate(block, {2, 0}, agg_descr), 5*9);
+ UNIT_ASSERT_VALUES_EQUAL(CH::TestAggregate(block, {2, 3}, agg_descr), 5*3);
+ UNIT_ASSERT_VALUES_EQUAL(CH::TestAggregate(block, {0, 1, 2, 3}, agg_descr), 9*7*5);
+ }
+ }
+
+ Y_UNIT_TEST(AggregateMax) {
+ CH::RegisterAggregates();
+
+ auto block = CH::makeTestBlock(1000);
+
+ for (int i = 0; i < block->num_columns(); ++i) {
+ auto type = block->column(i)->type();
+ auto agg_descr = CH::MakeMinMaxAnyDescription("ch.max", type, i);
+
+ UNIT_ASSERT(agg_descr.function);
+ UNIT_ASSERT_VALUES_EQUAL(agg_descr.arguments.size(), 1);
+
+ UNIT_ASSERT_VALUES_EQUAL(CH::TestAggregate(block, {0, 1}, agg_descr), 9*7);
+ UNIT_ASSERT_VALUES_EQUAL(CH::TestAggregate(block, {1, 0}, agg_descr), 7*9);
+ UNIT_ASSERT_VALUES_EQUAL(CH::TestAggregate(block, {0, 2}, agg_descr), 9*5);
+ UNIT_ASSERT_VALUES_EQUAL(CH::TestAggregate(block, {2, 0}, agg_descr), 5*9);
+ UNIT_ASSERT_VALUES_EQUAL(CH::TestAggregate(block, {2, 3}, agg_descr), 5*3);
+ UNIT_ASSERT_VALUES_EQUAL(CH::TestAggregate(block, {0, 1, 2, 3}, agg_descr), 9*7*5);
+ }
+ }
+
+ Y_UNIT_TEST(AggregateAny) {
+ CH::RegisterAggregates();
+
+ auto block = CH::makeTestBlock(1000);
+
+ for (int i = 0; i < block->num_columns(); ++i) {
+ auto type = block->column(i)->type();
+ auto agg_descr = CH::MakeMinMaxAnyDescription("ch.any", type, i);
+
+ UNIT_ASSERT(agg_descr.function);
+ UNIT_ASSERT_VALUES_EQUAL(agg_descr.arguments.size(), 1);
+
+ UNIT_ASSERT_VALUES_EQUAL(CH::TestAggregate(block, {0, 1}, agg_descr), 9*7);
+ UNIT_ASSERT_VALUES_EQUAL(CH::TestAggregate(block, {1, 0}, agg_descr), 7*9);
+ UNIT_ASSERT_VALUES_EQUAL(CH::TestAggregate(block, {0, 2}, agg_descr), 9*5);
+ UNIT_ASSERT_VALUES_EQUAL(CH::TestAggregate(block, {2, 0}, agg_descr), 5*9);
+ UNIT_ASSERT_VALUES_EQUAL(CH::TestAggregate(block, {2, 3}, agg_descr), 5*3);
+ UNIT_ASSERT_VALUES_EQUAL(CH::TestAggregate(block, {0, 1, 2, 3}, agg_descr), 9*7*5);
+ }
+ }
+
+ Y_UNIT_TEST(AggregateSum) {
+ CH::RegisterAggregates();
+
+ auto block = CH::makeTestBlock(1000);
+
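+        // Sum is only defined for the numeric columns, so aggregate just i16 and ui32.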
+ for (int i = 0; i < 2; ++i) {
+ auto type = block->column(i)->type();
+ auto agg_descr = CH::MakeSumDescription(type, i);
+
+ UNIT_ASSERT(agg_descr.function);
+ UNIT_ASSERT_VALUES_EQUAL(agg_descr.arguments.size(), 1);
+
+ UNIT_ASSERT_VALUES_EQUAL(CH::TestAggregate(block, {0, 1}, agg_descr), 9*7);
+ UNIT_ASSERT_VALUES_EQUAL(CH::TestAggregate(block, {1, 0}, agg_descr), 7*9);
+ UNIT_ASSERT_VALUES_EQUAL(CH::TestAggregate(block, {0, 2}, agg_descr), 9*5);
+ UNIT_ASSERT_VALUES_EQUAL(CH::TestAggregate(block, {2, 0}, agg_descr), 5*9);
+ UNIT_ASSERT_VALUES_EQUAL(CH::TestAggregate(block, {2, 3}, agg_descr), 5*3);
+ UNIT_ASSERT_VALUES_EQUAL(CH::TestAggregate(block, {0, 1, 2, 3}, agg_descr), 9*7*5);
+ }
+ }
+
+ Y_UNIT_TEST(AggregateAvg) {
+ CH::RegisterAggregates();
+
+ auto block = CH::makeTestBlock(1000);
+
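+        // Avg, like sum, is only applied to the two numeric columns.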
+ for (int i = 0; i < 2; ++i) {
+ auto type = block->column(i)->type();
+ auto agg_descr = CH::MakeAvgDescription(type, i);
+
+ UNIT_ASSERT(agg_descr.function);
+ UNIT_ASSERT_VALUES_EQUAL(agg_descr.arguments.size(), 1);
+
+ UNIT_ASSERT_VALUES_EQUAL(CH::TestAggregate(block, {0, 1}, agg_descr), 9*7);
+ UNIT_ASSERT_VALUES_EQUAL(CH::TestAggregate(block, {1, 0}, agg_descr), 7*9);
+ UNIT_ASSERT_VALUES_EQUAL(CH::TestAggregate(block, {0, 2}, agg_descr), 9*5);
+ UNIT_ASSERT_VALUES_EQUAL(CH::TestAggregate(block, {2, 0}, agg_descr), 5*9);
+ UNIT_ASSERT_VALUES_EQUAL(CH::TestAggregate(block, {2, 3}, agg_descr), 5*3);
+ UNIT_ASSERT_VALUES_EQUAL(CH::TestAggregate(block, {0, 1, 2, 3}, agg_descr), 9*7*5);
+ }
+ }
+}