aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAlexander Smirnov <alex@ydb.tech>2024-03-26 16:09:04 +0000
committerAlexander Smirnov <alex@ydb.tech>2024-03-26 16:09:04 +0000
commite29921c497f3eaf5514e6bbaf0cfaf2a8c12a16b (patch)
treec657b1cf75090c1a4f8f97bada5fbe0a8fc05c4f
parent39a80f0ebe20ed650360de0c38e17da3902b62b9 (diff)
parentce034cd07e7ebbff42723e51067bac58d1943f47 (diff)
downloadydb-e29921c497f3eaf5514e6bbaf0cfaf2a8c12a16b.tar.gz
Merge branch 'rightlib' into mergelibs-240326-1608
-rw-r--r--build/mapping.conf.json6
-rw-r--r--build/platform/test_tool/host.ya.make.inc10
-rw-r--r--build/platform/test_tool/host_os.ya.make.inc10
-rw-r--r--build/ymake.core.conf2
-rw-r--r--contrib/libs/googleapis-common-protos/CHANGELOG.md9
-rw-r--r--contrib/libs/googleapis-common-protos/google/api/client.proto20
-rw-r--r--contrib/libs/googleapis-common-protos/google/api/documentation.proto2
-rw-r--r--contrib/libs/googleapis-common-protos/google/api/error_reason.proto19
-rw-r--r--contrib/libs/googleapis-common-protos/google/api/field_behavior.proto2
-rw-r--r--contrib/libs/googleapis-common-protos/google/api/field_info.proto6
-rw-r--r--contrib/libs/googleapis-common-protos/google/api/monitored_resource.proto2
-rw-r--r--contrib/libs/googleapis-common-protos/ya.make4
-rw-r--r--contrib/python/hypothesis/py3/.dist-info/METADATA2
-rw-r--r--contrib/python/hypothesis/py3/hypothesis/extra/numpy.py186
-rw-r--r--contrib/python/hypothesis/py3/hypothesis/strategies/_internal/strategies.py5
-rw-r--r--contrib/python/hypothesis/py3/hypothesis/version.py2
-rw-r--r--contrib/python/hypothesis/py3/ya.make2
-rw-r--r--library/cpp/dot_product/README.md15
-rw-r--r--library/cpp/dot_product/dot_product.cpp274
-rw-r--r--library/cpp/dot_product/dot_product.h96
-rw-r--r--library/cpp/dot_product/dot_product_avx2.cpp344
-rw-r--r--library/cpp/dot_product/dot_product_avx2.h19
-rw-r--r--library/cpp/dot_product/dot_product_simple.cpp44
-rw-r--r--library/cpp/dot_product/dot_product_simple.h40
-rw-r--r--library/cpp/dot_product/dot_product_sse.cpp219
-rw-r--r--library/cpp/dot_product/dot_product_sse.h19
-rw-r--r--library/cpp/dot_product/ya.make20
-rw-r--r--library/cpp/tld/tlds-alpha-by-domain.txt2
-rwxr-xr-xya20
-rw-r--r--yt/yt/core/actions/future-inl.h26
-rw-r--r--yt/yt/core/actions/future.h24
-rw-r--r--yt/yt/core/rpc/roaming_channel.cpp4
-rw-r--r--yt/yt/core/ytree/ypath_client.cpp2
33 files changed, 1412 insertions, 45 deletions
diff --git a/build/mapping.conf.json b/build/mapping.conf.json
index 4e44f80f953..bb03e59640b 100644
--- a/build/mapping.conf.json
+++ b/build/mapping.conf.json
@@ -161,6 +161,8 @@
"6010598329": "https://devtools-registry.s3.yandex.net/6010598329",
"6033064182": "https://devtools-registry.s3.yandex.net/6033064182",
"6033072818": "https://devtools-registry.s3.yandex.net/6033072818",
+ "6052179215": "https://devtools-registry.s3.yandex.net/6052179215",
+ "6052374369": "https://devtools-registry.s3.yandex.net/6052374369",
"5486731632": "https://devtools-registry.s3.yandex.net/5486731632",
"5514350352": "https://devtools-registry.s3.yandex.net/5514350352",
"5514360398": "https://devtools-registry.s3.yandex.net/5514360398",
@@ -376,6 +378,7 @@
"5647712429": "https://devtools-registry.s3.yandex.net/5647712429",
"5690801745": "https://devtools-registry.s3.yandex.net/5690801745",
"5731299437": "https://devtools-registry.s3.yandex.net/5731299437",
+ "6048579718": "https://devtools-registry.s3.yandex.net/6048579718",
"2980468199": "https://devtools-registry.s3.yandex.net/2980468199",
"5562224408": "https://devtools-registry.s3.yandex.net/5562224408"
},
@@ -540,6 +543,8 @@
"6010598329": "devtools/ya/test/programs/test_tool/bin/test_tool for linux",
"6033064182": "devtools/ya/test/programs/test_tool/bin/test_tool for linux",
"6033072818": "devtools/ya/test/programs/test_tool/bin/test_tool for linux",
+ "6052179215": "devtools/ya/test/programs/test_tool/bin/test_tool for linux",
+ "6052374369": "devtools/ya/test/programs/test_tool/bin/test_tool for linux",
"5486731632": "devtools/ya/test/programs/test_tool/bin3/test_tool3 for linux",
"5514350352": "devtools/ya/test/programs/test_tool/bin3/test_tool3 for linux",
"5514360398": "devtools/ya/test/programs/test_tool/bin3/test_tool3 for linux",
@@ -755,6 +760,7 @@
"5647712429": "ymake.exe for win32-clang-cl",
"5690801745": "ymake.exe for win32-clang-cl",
"5731299437": "ymake.exe for win32-clang-cl",
+ "6048579718": "yt/go/ytrecipe/cmd/ytexec for linux",
"2980468199": "ytexec for linux",
"5562224408": "ytexec for linux"
},
diff --git a/build/platform/test_tool/host.ya.make.inc b/build/platform/test_tool/host.ya.make.inc
index 96bc78107c6..fdf84687f2f 100644
--- a/build/platform/test_tool/host.ya.make.inc
+++ b/build/platform/test_tool/host.ya.make.inc
@@ -1,12 +1,12 @@
IF (HOST_OS_DARWIN AND HOST_ARCH_X86_64)
- DECLARE_EXTERNAL_RESOURCE(TEST_TOOL_HOST sbr:6033069924)
+ DECLARE_EXTERNAL_RESOURCE(TEST_TOOL_HOST sbr:6052371779)
ELSEIF (HOST_OS_DARWIN AND HOST_ARCH_ARM64)
- DECLARE_EXTERNAL_RESOURCE(TEST_TOOL_HOST sbr:6033068666)
+ DECLARE_EXTERNAL_RESOURCE(TEST_TOOL_HOST sbr:6052370170)
ELSEIF (HOST_OS_LINUX AND HOST_ARCH_X86_64)
- DECLARE_EXTERNAL_RESOURCE(TEST_TOOL_HOST sbr:6033072818)
+ DECLARE_EXTERNAL_RESOURCE(TEST_TOOL_HOST sbr:6052374369)
ELSEIF (HOST_OS_LINUX AND HOST_ARCH_AARCH64)
- DECLARE_EXTERNAL_RESOURCE(TEST_TOOL_HOST sbr:6033067034)
+ DECLARE_EXTERNAL_RESOURCE(TEST_TOOL_HOST sbr:6052369086)
ELSEIF (HOST_OS_WINDOWS AND HOST_ARCH_X86_64)
- DECLARE_EXTERNAL_RESOURCE(TEST_TOOL_HOST sbr:6033071154)
+ DECLARE_EXTERNAL_RESOURCE(TEST_TOOL_HOST sbr:6052372920)
ENDIF()
diff --git a/build/platform/test_tool/host_os.ya.make.inc b/build/platform/test_tool/host_os.ya.make.inc
index 4a45d9291e8..f28c64e58f5 100644
--- a/build/platform/test_tool/host_os.ya.make.inc
+++ b/build/platform/test_tool/host_os.ya.make.inc
@@ -1,12 +1,12 @@
IF (HOST_OS_DARWIN AND HOST_ARCH_X86_64)
- DECLARE_EXTERNAL_RESOURCE(TEST_TOOL_HOST sbr:6033061022)
+ DECLARE_EXTERNAL_RESOURCE(TEST_TOOL_HOST sbr:6052177415)
ELSEIF (HOST_OS_DARWIN AND HOST_ARCH_ARM64)
- DECLARE_EXTERNAL_RESOURCE(TEST_TOOL_HOST sbr:6033058759)
+ DECLARE_EXTERNAL_RESOURCE(TEST_TOOL_HOST sbr:6052176424)
ELSEIF (HOST_OS_LINUX AND HOST_ARCH_X86_64)
- DECLARE_EXTERNAL_RESOURCE(TEST_TOOL_HOST sbr:6033064182)
+ DECLARE_EXTERNAL_RESOURCE(TEST_TOOL_HOST sbr:6052179215)
ELSEIF (HOST_OS_LINUX AND HOST_ARCH_AARCH64)
- DECLARE_EXTERNAL_RESOURCE(TEST_TOOL_HOST sbr:6033056567)
+ DECLARE_EXTERNAL_RESOURCE(TEST_TOOL_HOST sbr:6052175367)
ELSEIF (HOST_OS_WINDOWS AND HOST_ARCH_X86_64)
- DECLARE_EXTERNAL_RESOURCE(TEST_TOOL_HOST sbr:6033062527)
+ DECLARE_EXTERNAL_RESOURCE(TEST_TOOL_HOST sbr:6052178311)
ENDIF()
diff --git a/build/ymake.core.conf b/build/ymake.core.conf
index f030e4dd0a2..f5ac24fb715 100644
--- a/build/ymake.core.conf
+++ b/build/ymake.core.conf
@@ -3158,6 +3158,7 @@ macro _SRC("cpp", SRC, SRCFLAGS...) {
# tag:src-processing
macro _SRC("cxx", SRC, SRCFLAGS...) {
.CMD=$_SRC_CPP_CMD_NEW
+ .SEM=target_sources PRIVATE ${input:SRC} ${hide;output;suf=${OBJ_SUF}.o:SRC} $_ADD_EXTRA_FLAGS($SRCFLAGS)
.STRUCT_CMD=yes
}
@@ -3178,6 +3179,7 @@ macro _SRC("auxcpp", SRC, SRCFLAGS...) {
# tag:src-processing
macro _SRC("C", SRC, SRCFLAGS...) {
.CMD=$_SRC_CPP_CMD_NEW
+ .SEM=target_sources PRIVATE ${input:SRC} ${hide;output;suf=${OBJ_SUF}.o:SRC} $_ADD_EXTRA_FLAGS($SRCFLAGS)
.STRUCT_CMD=yes
}
diff --git a/contrib/libs/googleapis-common-protos/CHANGELOG.md b/contrib/libs/googleapis-common-protos/CHANGELOG.md
index 35b025b48bd..a331bd57c59 100644
--- a/contrib/libs/googleapis-common-protos/CHANGELOG.md
+++ b/contrib/libs/googleapis-common-protos/CHANGELOG.md
@@ -1,5 +1,14 @@
# Changelog
+## [1.63.0](https://github.com/googleapis/python-api-common-protos/compare/v1.62.0...v1.63.0) (2024-03-08)
+
+
+### Features
+
+* Add `api_version` field to `ServiceOptions` in `google/api/client.proto` ([6f9c4d2](https://github.com/googleapis/python-api-common-protos/commit/6f9c4d2b4b787d9ed2b447d7b99281aa3dcf97b5))
+* Add `LOCATION_POLICY_VIOLATED` enum to `ErrorReason` in `google/api/error_reason.proto` ([6f9c4d2](https://github.com/googleapis/python-api-common-protos/commit/6f9c4d2b4b787d9ed2b447d7b99281aa3dcf97b5))
+* Add `rest_reference_documentation_uri` field to `ServiceOptions` in `google/api/client.proto` ([6f9c4d2](https://github.com/googleapis/python-api-common-protos/commit/6f9c4d2b4b787d9ed2b447d7b99281aa3dcf97b5))
+
## [1.62.0](https://github.com/googleapis/python-api-common-protos/compare/v1.61.0...v1.62.0) (2023-12-01)
diff --git a/contrib/libs/googleapis-common-protos/google/api/client.proto b/contrib/libs/googleapis-common-protos/google/api/client.proto
index 39bdde82a95..0952e8373c7 100644
--- a/contrib/libs/googleapis-common-protos/google/api/client.proto
+++ b/contrib/libs/googleapis-common-protos/google/api/client.proto
@@ -98,6 +98,22 @@ extend google.protobuf.ServiceOptions {
// ...
// }
string oauth_scopes = 1050;
+
+ // The API version of this service, which should be sent by version-aware
+ // clients to the service. This allows services to abide by the schema and
+ // behavior of the service at the time this API version was deployed.
+ // The format of the API version must be treated as opaque by clients.
+ // Services may use a format with an apparent structure, but clients must
+ // not rely on this to determine components within an API version, or attempt
+ // to construct other valid API versions. Note that this is for upcoming
+ // functionality and may not be implemented for all services.
+ //
+ // Example:
+ //
+ // service Foo {
+ // option (google.api.api_version) = "v1_20230821_preview";
+ // }
+ string api_version = 525000001;
}
// Required information for every language.
@@ -192,6 +208,10 @@ message Publishing {
// Optional link to proto reference documentation. Example:
// https://cloud.google.com/pubsub/lite/docs/reference/rpc
string proto_reference_documentation_uri = 110;
+
+ // Optional link to REST reference documentation. Example:
+ // https://cloud.google.com/pubsub/lite/docs/reference/rest
+ string rest_reference_documentation_uri = 111;
}
// Settings for Java client libraries.
diff --git a/contrib/libs/googleapis-common-protos/google/api/documentation.proto b/contrib/libs/googleapis-common-protos/google/api/documentation.proto
index 0dabdfcf80b..12936c701b4 100644
--- a/contrib/libs/googleapis-common-protos/google/api/documentation.proto
+++ b/contrib/libs/googleapis-common-protos/google/api/documentation.proto
@@ -34,7 +34,7 @@ option objc_class_prefix = "GAPI";
// content: &#40;== include google/foo/overview.md ==&#41;
// - name: Tutorial
// content: &#40;== include google/foo/tutorial.md ==&#41;
-// subpages;
+// subpages:
// - name: Java
// content: &#40;== include google/foo/tutorial_java.md ==&#41;
// rules:
diff --git a/contrib/libs/googleapis-common-protos/google/api/error_reason.proto b/contrib/libs/googleapis-common-protos/google/api/error_reason.proto
index c0509be4fc5..cf806698dac 100644
--- a/contrib/libs/googleapis-common-protos/google/api/error_reason.proto
+++ b/contrib/libs/googleapis-common-protos/google/api/error_reason.proto
@@ -567,4 +567,23 @@ enum ErrorReason {
//
// This response indicates the associated GCP account has been suspended.
GCP_SUSPENDED = 30;
+
+ // The request violates the location policies when creating resources in
+ // the restricted region.
+ //
+ // Example of an ErrorInfo when creating the Cloud Storage Bucket by
+ // "projects/123" for service storage.googleapis.com:
+ //
+ // { "reason": "LOCATION_POLICY_VIOLATED",
+ // "domain": "googleapis.com",
+ // "metadata": {
+ // "consumer": "projects/123",
+ // "service": "storage.googleapis.com",
+ // }
+ // }
+ //
+ // This response indicates creating the Cloud Storage Bucket in
+ // "locations/asia-northeast3" violates at least one location policy.
+ // The troubleshooting guidance is provided in the Help links.
+ LOCATION_POLICY_VIOLATED = 31;
}
diff --git a/contrib/libs/googleapis-common-protos/google/api/field_behavior.proto b/contrib/libs/googleapis-common-protos/google/api/field_behavior.proto
index 344cb0b1fc2..21895bf5527 100644
--- a/contrib/libs/googleapis-common-protos/google/api/field_behavior.proto
+++ b/contrib/libs/googleapis-common-protos/google/api/field_behavior.proto
@@ -37,7 +37,7 @@ extend google.protobuf.FieldOptions {
// google.protobuf.Timestamp expire_time = 1
// [(google.api.field_behavior) = OUTPUT_ONLY,
// (google.api.field_behavior) = IMMUTABLE];
- repeated google.api.FieldBehavior field_behavior = 1052;
+ repeated google.api.FieldBehavior field_behavior = 1052 [packed = false];
}
// An indicator of the behavior of a given field (for example, that a field
diff --git a/contrib/libs/googleapis-common-protos/google/api/field_info.proto b/contrib/libs/googleapis-common-protos/google/api/field_info.proto
index dd66340a025..e62d84579d4 100644
--- a/contrib/libs/googleapis-common-protos/google/api/field_info.proto
+++ b/contrib/libs/googleapis-common-protos/google/api/field_info.proto
@@ -61,9 +61,9 @@ message FieldInfo {
// Internet Protocol v6 value as defined by [RFC
// 2460](https://datatracker.ietf.org/doc/html/rfc2460). The value may be
- // normalized to entirely lowercase letters, and zero-padded partial and
- // empty octets. For example, the value `2001:DB8::` would be normalized to
- // `2001:0db8:0:0`.
+ // normalized to entirely lowercase letters with zeros compressed, following
+ // [RFC 5952](https://datatracker.ietf.org/doc/html/rfc5952). For example,
+ // the value `2001:0DB8:0::0` would be normalized to `2001:db8::`.
IPV6 = 3;
// An IP address in either v4 or v6 format as described by the individual
diff --git a/contrib/libs/googleapis-common-protos/google/api/monitored_resource.proto b/contrib/libs/googleapis-common-protos/google/api/monitored_resource.proto
index c6f9759288e..08bc39b1c23 100644
--- a/contrib/libs/googleapis-common-protos/google/api/monitored_resource.proto
+++ b/contrib/libs/googleapis-common-protos/google/api/monitored_resource.proto
@@ -49,7 +49,7 @@ message MonitoredResourceDescriptor {
// Required. The monitored resource type. For example, the type
// `"cloudsql_database"` represents databases in Google Cloud SQL.
- // For a list of types, see [Monitoring resource
+ // For a list of types, see [Monitored resource
// types](https://cloud.google.com/monitoring/api/resources)
// and [Logging resource
// types](https://cloud.google.com/logging/docs/api/v2/resource-list).
diff --git a/contrib/libs/googleapis-common-protos/ya.make b/contrib/libs/googleapis-common-protos/ya.make
index 78f9707f391..959c90fde4b 100644
--- a/contrib/libs/googleapis-common-protos/ya.make
+++ b/contrib/libs/googleapis-common-protos/ya.make
@@ -6,9 +6,9 @@ LICENSE(Apache-2.0)
LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
-VERSION(1.62.0)
+VERSION(1.63.0)
-ORIGINAL_SOURCE(https://github.com/googleapis/python-api-common-protos/archive/v1.62.0.tar.gz)
+ORIGINAL_SOURCE(https://github.com/googleapis/python-api-common-protos/archive/v1.63.0.tar.gz)
PY_NAMESPACE(.)
diff --git a/contrib/python/hypothesis/py3/.dist-info/METADATA b/contrib/python/hypothesis/py3/.dist-info/METADATA
index d1464866e40..e8d710ced7c 100644
--- a/contrib/python/hypothesis/py3/.dist-info/METADATA
+++ b/contrib/python/hypothesis/py3/.dist-info/METADATA
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: hypothesis
-Version: 6.99.2
+Version: 6.99.4
Summary: A library for property-based testing
Home-page: https://hypothesis.works
Author: David R. MacIver and Zac Hatfield-Dodds
diff --git a/contrib/python/hypothesis/py3/hypothesis/extra/numpy.py b/contrib/python/hypothesis/py3/hypothesis/extra/numpy.py
index e8a9d3cb152..90ecb400b42 100644
--- a/contrib/python/hypothesis/py3/hypothesis/extra/numpy.py
+++ b/contrib/python/hypothesis/py3/hypothesis/extra/numpy.py
@@ -624,11 +624,61 @@ def dtype_factory(kind, sizes, valid_sizes, endianness):
return strat.map((endianness + kind).format)
+@overload
+@defines_dtype_strategy
+def unsigned_integer_dtypes(
+ *,
+ endianness: str = "?",
+ sizes: Literal[8],
+) -> st.SearchStrategy["np.dtype[np.uint8]"]: ...
+
+
+@overload
+@defines_dtype_strategy
+def unsigned_integer_dtypes(
+ *,
+ endianness: str = "?",
+ sizes: Literal[16],
+) -> st.SearchStrategy["np.dtype[np.uint16]"]: ...
+
+
+@overload
+@defines_dtype_strategy
+def unsigned_integer_dtypes(
+ *,
+ endianness: str = "?",
+ sizes: Literal[32],
+) -> st.SearchStrategy["np.dtype[np.uint32]"]: ...
+
+
+@overload
+@defines_dtype_strategy
+def unsigned_integer_dtypes(
+ *,
+ endianness: str = "?",
+ sizes: Literal[64],
+) -> st.SearchStrategy["np.dtype[np.uint64]"]: ...
+
+
+@overload
@defines_dtype_strategy
def unsigned_integer_dtypes(
*,
endianness: str = "?",
sizes: Sequence[Literal[8, 16, 32, 64]] = (8, 16, 32, 64),
+) -> st.SearchStrategy["np.dtype[np.unsignedinteger[Any]]"]: ...
+
+
+@defines_dtype_strategy
+def unsigned_integer_dtypes(
+ *,
+ endianness: str = "?",
+ sizes: Union[Literal[8, 16, 32, 64], Sequence[Literal[8, 16, 32, 64]]] = (
+ 8,
+ 16,
+ 32,
+ 64,
+ ),
) -> st.SearchStrategy["np.dtype[np.unsignedinteger[Any]]"]:
"""Return a strategy for unsigned integer dtypes.
@@ -642,11 +692,61 @@ def unsigned_integer_dtypes(
return dtype_factory("u", sizes, (8, 16, 32, 64), endianness)
+@overload
+@defines_dtype_strategy
+def integer_dtypes(
+ *,
+ endianness: str = "?",
+ sizes: Literal[8],
+) -> st.SearchStrategy["np.dtype[np.int8]"]: ...
+
+
+@overload
+@defines_dtype_strategy
+def integer_dtypes(
+ *,
+ endianness: str = "?",
+ sizes: Literal[16],
+) -> st.SearchStrategy["np.dtype[np.int16]"]: ...
+
+
+@overload
+@defines_dtype_strategy
+def integer_dtypes(
+ *,
+ endianness: str = "?",
+ sizes: Literal[32],
+) -> st.SearchStrategy["np.dtype[np.int32]"]: ...
+
+
+@overload
+@defines_dtype_strategy
+def integer_dtypes(
+ *,
+ endianness: str = "?",
+ sizes: Literal[64],
+) -> st.SearchStrategy["np.dtype[np.int64]"]: ...
+
+
+@overload
@defines_dtype_strategy
def integer_dtypes(
*,
endianness: str = "?",
sizes: Sequence[Literal[8, 16, 32, 64]] = (8, 16, 32, 64),
+) -> st.SearchStrategy["np.dtype[np.signedinteger[Any]]"]: ...
+
+
+@defines_dtype_strategy
+def integer_dtypes(
+ *,
+ endianness: str = "?",
+ sizes: Union[Literal[8, 16, 32, 64], Sequence[Literal[8, 16, 32, 64]]] = (
+ 8,
+ 16,
+ 32,
+ 64,
+ ),
) -> st.SearchStrategy["np.dtype[np.signedinteger[Any]]"]:
"""Return a strategy for signed integer dtypes.
@@ -656,11 +756,58 @@ def integer_dtypes(
return dtype_factory("i", sizes, (8, 16, 32, 64), endianness)
+@overload
+@defines_dtype_strategy
+def floating_dtypes(
+ *,
+ endianness: str = "?",
+ sizes: Literal[16],
+) -> st.SearchStrategy["np.dtype[np.float16]"]: ...
+
+
+@overload
+@defines_dtype_strategy
+def floating_dtypes(
+ *,
+ endianness: str = "?",
+ sizes: Literal[32],
+) -> st.SearchStrategy["np.dtype[np.float32]"]: ...
+
+
+@overload
+@defines_dtype_strategy
+def floating_dtypes(
+ *,
+ endianness: str = "?",
+ sizes: Literal[64],
+) -> st.SearchStrategy["np.dtype[np.float64]"]: ...
+
+
+@overload
+@defines_dtype_strategy
+def floating_dtypes(
+ *,
+ endianness: str = "?",
+ sizes: Literal[128],
+) -> st.SearchStrategy["np.dtype[np.float128]"]: ...
+
+
+@overload
@defines_dtype_strategy
def floating_dtypes(
*,
endianness: str = "?",
sizes: Sequence[Literal[16, 32, 64, 96, 128]] = (16, 32, 64),
+) -> st.SearchStrategy["np.dtype[np.floating[Any]]"]: ...
+
+
+@defines_dtype_strategy
+def floating_dtypes(
+ *,
+ endianness: str = "?",
+ sizes: Union[
+ Literal[16, 32, 64, 96, 128], Sequence[Literal[16, 32, 64, 96, 128]]
+ ] = (16, 32, 64),
) -> st.SearchStrategy["np.dtype[np.floating[Any]]"]:
"""Return a strategy for floating-point dtypes.
@@ -674,11 +821,50 @@ def floating_dtypes(
return dtype_factory("f", sizes, (16, 32, 64, 96, 128), endianness)
+@overload
+@defines_dtype_strategy
+def complex_number_dtypes(
+ *,
+ endianness: str = "?",
+ sizes: Literal[64],
+) -> st.SearchStrategy["np.dtype[np.complex64]"]: ...
+
+
+@overload
+@defines_dtype_strategy
+def complex_number_dtypes(
+ *,
+ endianness: str = "?",
+ sizes: Literal[128],
+) -> st.SearchStrategy["np.dtype[np.complex128]"]: ...
+
+
+@overload
+@defines_dtype_strategy
+def complex_number_dtypes(
+ *,
+ endianness: str = "?",
+ sizes: Literal[256],
+) -> st.SearchStrategy["np.dtype[np.complex256]"]: ...
+
+
+@overload
@defines_dtype_strategy
def complex_number_dtypes(
*,
endianness: str = "?",
sizes: Sequence[Literal[64, 128, 192, 256]] = (64, 128),
+) -> st.SearchStrategy["np.dtype[np.complexfloating[Any, Any]]"]: ...
+
+
+@defines_dtype_strategy
+def complex_number_dtypes(
+ *,
+ endianness: str = "?",
+ sizes: Union[Literal[64, 128, 192, 256], Sequence[Literal[64, 128, 192, 256]]] = (
+ 64,
+ 128,
+ ),
) -> st.SearchStrategy["np.dtype[np.complexfloating[Any, Any]]"]:
"""Return a strategy for complex-number dtypes.
diff --git a/contrib/python/hypothesis/py3/hypothesis/strategies/_internal/strategies.py b/contrib/python/hypothesis/py3/hypothesis/strategies/_internal/strategies.py
index d8d6be91aed..448f7e51ac0 100644
--- a/contrib/python/hypothesis/py3/hypothesis/strategies/_internal/strategies.py
+++ b/contrib/python/hypothesis/py3/hypothesis/strategies/_internal/strategies.py
@@ -52,12 +52,13 @@ from hypothesis.internal.reflection import (
from hypothesis.strategies._internal.utils import defines_strategy
from hypothesis.utils.conventions import UniqueIdentifier
-if sys.version_info >= (3, 13):
+# TODO: Use `(3, 13)` once Python 3.13 is released.
+if sys.version_info >= (3, 13, 0, "final"):
Ex = TypeVar("Ex", covariant=True, default=Any)
elif TYPE_CHECKING:
from typing_extensions import TypeVar # type: ignore[assignment]
- Ex = TypeVar("Ex", covariant=True, default=Any)
+ Ex = TypeVar("Ex", covariant=True, default=Any) # type: ignore[call-arg,misc]
else:
Ex = TypeVar("Ex", covariant=True)
diff --git a/contrib/python/hypothesis/py3/hypothesis/version.py b/contrib/python/hypothesis/py3/hypothesis/version.py
index 3b606ab7d4e..2b269394e55 100644
--- a/contrib/python/hypothesis/py3/hypothesis/version.py
+++ b/contrib/python/hypothesis/py3/hypothesis/version.py
@@ -8,5 +8,5 @@
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.
-__version_info__ = (6, 99, 2)
+__version_info__ = (6, 99, 4)
__version__ = ".".join(map(str, __version_info__))
diff --git a/contrib/python/hypothesis/py3/ya.make b/contrib/python/hypothesis/py3/ya.make
index 0e3245c5275..d365120737c 100644
--- a/contrib/python/hypothesis/py3/ya.make
+++ b/contrib/python/hypothesis/py3/ya.make
@@ -2,7 +2,7 @@
PY3_LIBRARY()
-VERSION(6.99.2)
+VERSION(6.99.4)
LICENSE(MPL-2.0)
diff --git a/library/cpp/dot_product/README.md b/library/cpp/dot_product/README.md
new file mode 100644
index 00000000000..516dcf31de3
--- /dev/null
+++ b/library/cpp/dot_product/README.md
@@ -0,0 +1,15 @@
+Библиотека для вычисления скалярного произведения векторов.
+=====================================================
+
+Данная библиотека содержит функцию DotProduct, вычисляющую скалярное произведение векторов различных типов.
+В отличие от наивной реализации, библиотека использует SSE и работает существенно быстрее. Для сравнения
+можно посмотреть результаты бенчмарка.
+
+Типичное использование - замена кусков кода вроде:
+```
+for (int i = 0; i < len; i++)
+ dot_product += a[i] * b[i];
+```
+на существенно более эффективный вызов ```DotProduct(a, b, len)```.
+
+Работает для типов i8, ui8, i32, float, double.
diff --git a/library/cpp/dot_product/dot_product.cpp b/library/cpp/dot_product/dot_product.cpp
new file mode 100644
index 00000000000..6be4d0a78f3
--- /dev/null
+++ b/library/cpp/dot_product/dot_product.cpp
@@ -0,0 +1,274 @@
+#include "dot_product.h"
+#include "dot_product_sse.h"
+#include "dot_product_avx2.h"
+#include "dot_product_simple.h"
+
+#include <library/cpp/sse/sse.h>
+#include <library/cpp/testing/common/env.h>
+#include <util/system/compiler.h>
+#include <util/generic/utility.h>
+#include <util/system/cpu_id.h>
+#include <util/system/env.h>
+
+namespace NDotProductImpl {
+ i32 (*DotProductI8Impl)(const i8* lhs, const i8* rhs, size_t length) noexcept = &DotProductSimple;
+ ui32 (*DotProductUi8Impl)(const ui8* lhs, const ui8* rhs, size_t length) noexcept = &DotProductSimple;
+ i64 (*DotProductI32Impl)(const i32* lhs, const i32* rhs, size_t length) noexcept = &DotProductSimple;
+ float (*DotProductFloatImpl)(const float* lhs, const float* rhs, size_t length) noexcept = &DotProductSimple;
+ double (*DotProductDoubleImpl)(const double* lhs, const double* rhs, size_t length) noexcept = &DotProductSimple;
+
+ namespace {
+ [[maybe_unused]] const int _ = [] {
+ if (!FromYaTest() && GetEnv("Y_NO_AVX_IN_DOT_PRODUCT") == "" && NX86::HaveAVX2() && NX86::HaveFMA()) {
+ DotProductI8Impl = &DotProductAvx2;
+ DotProductUi8Impl = &DotProductAvx2;
+ DotProductI32Impl = &DotProductAvx2;
+ DotProductFloatImpl = &DotProductAvx2;
+ DotProductDoubleImpl = &DotProductAvx2;
+ } else {
+#ifdef ARCADIA_SSE
+ DotProductI8Impl = &DotProductSse;
+ DotProductUi8Impl = &DotProductSse;
+ DotProductI32Impl = &DotProductSse;
+ DotProductFloatImpl = &DotProductSse;
+ DotProductDoubleImpl = &DotProductSse;
+#endif
+ }
+ return 0;
+ }();
+ }
+}
+
+#ifdef ARCADIA_SSE
+float L2NormSquared(const float* v, size_t length) noexcept {
+ __m128 sum1 = _mm_setzero_ps();
+ __m128 sum2 = _mm_setzero_ps();
+ __m128 a1, a2, m1, m2;
+
+ while (length >= 8) {
+ a1 = _mm_loadu_ps(v);
+ m1 = _mm_mul_ps(a1, a1);
+
+ a2 = _mm_loadu_ps(v + 4);
+ sum1 = _mm_add_ps(sum1, m1);
+
+ m2 = _mm_mul_ps(a2, a2);
+ sum2 = _mm_add_ps(sum2, m2);
+
+ length -= 8;
+ v += 8;
+ }
+
+ if (length >= 4) {
+ a1 = _mm_loadu_ps(v);
+ sum1 = _mm_add_ps(sum1, _mm_mul_ps(a1, a1));
+
+ length -= 4;
+ v += 4;
+ }
+
+ sum1 = _mm_add_ps(sum1, sum2);
+
+ if (length) {
+ switch (length) {
+ case 3:
+ a1 = _mm_set_ps(0.0f, v[2], v[1], v[0]);
+ break;
+
+ case 2:
+ a1 = _mm_set_ps(0.0f, 0.0f, v[1], v[0]);
+ break;
+
+ case 1:
+ a1 = _mm_set_ps(0.0f, 0.0f, 0.0f, v[0]);
+ break;
+
+ default:
+ Y_UNREACHABLE();
+ }
+
+ sum1 = _mm_add_ps(sum1, _mm_mul_ps(a1, a1));
+ }
+
+ alignas(16) float res[4];
+ _mm_store_ps(res, sum1);
+
+ return res[0] + res[1] + res[2] + res[3];
+}
+
+template <bool computeLL, bool computeLR, bool computeRR>
+Y_FORCE_INLINE
+static void TriWayDotProductIteration(__m128& sumLL, __m128& sumLR, __m128& sumRR, const __m128 a, const __m128 b) {
+ if constexpr (computeLL) {
+ sumLL = _mm_add_ps(sumLL, _mm_mul_ps(a, a));
+ }
+ if constexpr (computeLR) {
+ sumLR = _mm_add_ps(sumLR, _mm_mul_ps(a, b));
+ }
+ if constexpr (computeRR) {
+ sumRR = _mm_add_ps(sumRR, _mm_mul_ps(b, b));
+ }
+}
+
+
+template <bool computeLL, bool computeLR, bool computeRR>
+static TTriWayDotProduct<float> TriWayDotProductImpl(const float* lhs, const float* rhs, size_t length) noexcept {
+ __m128 sumLL1 = _mm_setzero_ps();
+ __m128 sumLR1 = _mm_setzero_ps();
+ __m128 sumRR1 = _mm_setzero_ps();
+ __m128 sumLL2 = _mm_setzero_ps();
+ __m128 sumLR2 = _mm_setzero_ps();
+ __m128 sumRR2 = _mm_setzero_ps();
+
+ while (length >= 8) {
+ TriWayDotProductIteration<computeLL, computeLR, computeRR>(sumLL1, sumLR1, sumRR1, _mm_loadu_ps(lhs + 0), _mm_loadu_ps(rhs + 0));
+ TriWayDotProductIteration<computeLL, computeLR, computeRR>(sumLL2, sumLR2, sumRR2, _mm_loadu_ps(lhs + 4), _mm_loadu_ps(rhs + 4));
+ length -= 8;
+ lhs += 8;
+ rhs += 8;
+ }
+
+ if (length >= 4) {
+ TriWayDotProductIteration<computeLL, computeLR, computeRR>(sumLL1, sumLR1, sumRR1, _mm_loadu_ps(lhs + 0), _mm_loadu_ps(rhs + 0));
+ length -= 4;
+ lhs += 4;
+ rhs += 4;
+ }
+
+ if constexpr (computeLL) {
+ sumLL1 = _mm_add_ps(sumLL1, sumLL2);
+ }
+ if constexpr (computeLR) {
+ sumLR1 = _mm_add_ps(sumLR1, sumLR2);
+ }
+ if constexpr (computeRR) {
+ sumRR1 = _mm_add_ps(sumRR1, sumRR2);
+ }
+
+ if (length) {
+ __m128 a, b;
+ switch (length) {
+ case 3:
+ a = _mm_set_ps(0.0f, lhs[2], lhs[1], lhs[0]);
+ b = _mm_set_ps(0.0f, rhs[2], rhs[1], rhs[0]);
+ break;
+ case 2:
+ a = _mm_set_ps(0.0f, 0.0f, lhs[1], lhs[0]);
+ b = _mm_set_ps(0.0f, 0.0f, rhs[1], rhs[0]);
+ break;
+ case 1:
+ a = _mm_set_ps(0.0f, 0.0f, 0.0f, lhs[0]);
+ b = _mm_set_ps(0.0f, 0.0f, 0.0f, rhs[0]);
+ break;
+ default:
+ Y_UNREACHABLE();
+ }
+ TriWayDotProductIteration<computeLL, computeLR, computeRR>(sumLL1, sumLR1, sumRR1, a, b);
+ }
+
+ __m128 t0 = sumLL1;
+ __m128 t1 = sumLR1;
+ __m128 t2 = sumRR1;
+ __m128 t3 = _mm_setzero_ps();
+ _MM_TRANSPOSE4_PS(t0, t1, t2, t3);
+ t0 = _mm_add_ps(t0, t1);
+ t0 = _mm_add_ps(t0, t2);
+ t0 = _mm_add_ps(t0, t3);
+
+ alignas(16) float res[4];
+ _mm_store_ps(res, t0);
+ TTriWayDotProduct<float> result{res[0], res[1], res[2]};
+ static constexpr const TTriWayDotProduct<float> def;
+ // fill skipped fields with default values
+ if constexpr (!computeLL) {
+ result.LL = def.LL;
+ }
+ if constexpr (!computeLR) {
+ result.LR = def.LR;
+ }
+ if constexpr (!computeRR) {
+ result.RR = def.RR;
+ }
+ return result;
+}
+
+
+TTriWayDotProduct<float> TriWayDotProduct(const float* lhs, const float* rhs, size_t length, unsigned mask) noexcept {
+ mask &= 0b111;
+ if (Y_LIKELY(mask == 0b111)) { // compute dot-product and length² of two vectors
+ return TriWayDotProductImpl<true, true, true>(lhs, rhs, length);
+ } else if (Y_LIKELY(mask == 0b110 || mask == 0b011)) { // compute dot-product and length² of one vector
+ const bool computeLL = (mask == 0b110);
+ if (!computeLL) {
+ DoSwap(lhs, rhs);
+ }
+ auto result = TriWayDotProductImpl<true, true, false>(lhs, rhs, length);
+ if (!computeLL) {
+ DoSwap(result.LL, result.RR);
+ }
+ return result;
+ } else {
+ // dispatch unlikely & sparse cases
+ TTriWayDotProduct<float> result{};
+ switch(mask) {
+ case 0b000:
+ break;
+ case 0b100:
+ result.LL = L2NormSquared(lhs, length);
+ break;
+ case 0b010:
+ result.LR = DotProduct(lhs, rhs, length);
+ break;
+ case 0b001:
+ result.RR = L2NormSquared(rhs, length);
+ break;
+ case 0b101:
+ result.LL = L2NormSquared(lhs, length);
+ result.RR = L2NormSquared(rhs, length);
+ break;
+ default:
+ Y_UNREACHABLE();
+ }
+ return result;
+ }
+}
+
+#else
+
+float L2NormSquared(const float* v, size_t length) noexcept {
+ return DotProduct(v, v, length);
+}
+
+TTriWayDotProduct<float> TriWayDotProduct(const float* lhs, const float* rhs, size_t length, unsigned mask) noexcept {
+ TTriWayDotProduct<float> result;
+ if (mask & static_cast<unsigned>(ETriWayDotProductComputeMask::LL)) {
+ result.LL = L2NormSquared(lhs, length);
+ }
+ if (mask & static_cast<unsigned>(ETriWayDotProductComputeMask::LR)) {
+ result.LR = DotProduct(lhs, rhs, length);
+ }
+ if (mask & static_cast<unsigned>(ETriWayDotProductComputeMask::RR)) {
+ result.RR = L2NormSquared(rhs, length);
+ }
+ return result;
+}
+
+#endif // ARCADIA_SSE
+
+namespace NDotProduct {
+ void DisableAvx2() {
+#ifdef ARCADIA_SSE
+ NDotProductImpl::DotProductI8Impl = &DotProductSse;
+ NDotProductImpl::DotProductUi8Impl = &DotProductSse;
+ NDotProductImpl::DotProductI32Impl = &DotProductSse;
+ NDotProductImpl::DotProductFloatImpl = &DotProductSse;
+ NDotProductImpl::DotProductDoubleImpl = &DotProductSse;
+#else
+ NDotProductImpl::DotProductI8Impl = &DotProductSimple;
+ NDotProductImpl::DotProductUi8Impl = &DotProductSimple;
+ NDotProductImpl::DotProductI32Impl = &DotProductSimple;
+ NDotProductImpl::DotProductFloatImpl = &DotProductSimple;
+ NDotProductImpl::DotProductDoubleImpl = &DotProductSimple;
+#endif
+ }
+}
diff --git a/library/cpp/dot_product/dot_product.h b/library/cpp/dot_product/dot_product.h
new file mode 100644
index 00000000000..0765633abdb
--- /dev/null
+++ b/library/cpp/dot_product/dot_product.h
@@ -0,0 +1,96 @@
+#pragma once
+
+#include <util/system/types.h>
+#include <util/system/compiler.h>
+
+#include <numeric>
+
+/**
+ * Dot product (Inner product or scalar product) implementation using SSE when possible.
+ */
+namespace NDotProductImpl {
+ // Function pointers the public DotProduct() overloads call through, one per element type.
+ // They are defined elsewhere and can be reassigned at run time (see NDotProduct::DisableAvx2).
+ extern i32 (*DotProductI8Impl)(const i8* lhs, const i8* rhs, size_t length) noexcept;
+ extern ui32 (*DotProductUi8Impl)(const ui8* lhs, const ui8* rhs, size_t length) noexcept;
+ extern i64 (*DotProductI32Impl)(const i32* lhs, const i32* rhs, size_t length) noexcept;
+ extern float (*DotProductFloatImpl)(const float* lhs, const float* rhs, size_t length) noexcept;
+ extern double (*DotProductDoubleImpl)(const double* lhs, const double* rhs, size_t length) noexcept;
+}
+
+// Public entry points. Each overload forwards to the matching function pointer in
+// NDotProductImpl; `length` is the number of elements in each of the two arrays.
+// Integer inputs return a wider type than their elements (i8 -> i32, ui8 -> ui32, i32 -> i64).
+Y_PURE_FUNCTION
+inline i32 DotProduct(const i8* lhs, const i8* rhs, size_t length) noexcept {
+ return NDotProductImpl::DotProductI8Impl(lhs, rhs, length);
+}
+
+Y_PURE_FUNCTION
+inline ui32 DotProduct(const ui8* lhs, const ui8* rhs, size_t length) noexcept {
+ return NDotProductImpl::DotProductUi8Impl(lhs, rhs, length);
+}
+
+Y_PURE_FUNCTION
+inline i64 DotProduct(const i32* lhs, const i32* rhs, size_t length) noexcept {
+ return NDotProductImpl::DotProductI32Impl(lhs, rhs, length);
+}
+
+Y_PURE_FUNCTION
+inline float DotProduct(const float* lhs, const float* rhs, size_t length) noexcept {
+ return NDotProductImpl::DotProductFloatImpl(lhs, rhs, length);
+}
+
+Y_PURE_FUNCTION
+inline double DotProduct(const double* lhs, const double* rhs, size_t length) noexcept {
+ return NDotProductImpl::DotProductDoubleImpl(lhs, rhs, length);
+}
+
+/**
+ * Dot product to itself
+ */
+Y_PURE_FUNCTION
+float L2NormSquared(const float* v, size_t length) noexcept;
+
+// TODO(yazevnul): make `L2NormSquared` for double, this should be faster than `DotProduct`
+// where `lhs == rhs` because it will save N load instructions.
+
+// Result of TriWayDotProduct for two vectors L and R: LL = L·L, LR = L·R, RR = R·R.
+// The default member initializers below are the values a component keeps when its
+// computation is not requested through the compute mask.
+template <typename T>
+struct TTriWayDotProduct {
+ T LL = 1;
+ T LR = 0;
+ T RR = 1;
+};
+
+enum class ETriWayDotProductComputeMask: unsigned {
+ // basic
+ LL = 0b100,
+ LR = 0b010,
+ RR = 0b001,
+
+ // useful combinations
+ All = 0b111,
+ Left = 0b110, // skip computation of R·R
+ Right = 0b011, // skip computation of L·L
+};
+
+Y_PURE_FUNCTION
+TTriWayDotProduct<float> TriWayDotProduct(const float* lhs, const float* rhs, size_t length, unsigned mask) noexcept;
+
+/**
+ * For two vectors L and R computes 3 dot-products: L·L, L·R, R·R
+ */
+Y_PURE_FUNCTION
+static inline TTriWayDotProduct<float> TriWayDotProduct(const float* lhs, const float* rhs, size_t length, ETriWayDotProductComputeMask mask = ETriWayDotProductComputeMask::All) noexcept {
+ return TriWayDotProduct(lhs, rhs, length, static_cast<unsigned>(mask));
+}
+
+namespace NDotProduct {
+ // Simple functor wrapper that allows these functions to be used as a template argument.
+ // TResult is whatever the DotProduct overload for `const T*` returns.
+ template <typename T>
+ struct TDotProduct {
+ using TResult = decltype(DotProduct(static_cast<const T*>(nullptr), static_cast<const T*>(nullptr), 0));
+ Y_PURE_FUNCTION
+ inline TResult operator()(const T* l, const T* r, size_t length) const {
+ return DotProduct(l, r, length);
+ }
+ };
+
+ // Switches the DotProduct dispatch pointers to the non-AVX2 implementations.
+ void DisableAvx2();
+}
+
diff --git a/library/cpp/dot_product/dot_product_avx2.cpp b/library/cpp/dot_product/dot_product_avx2.cpp
new file mode 100644
index 00000000000..a0f7c169ee7
--- /dev/null
+++ b/library/cpp/dot_product/dot_product_avx2.cpp
@@ -0,0 +1,344 @@
+#include "dot_product_avx2.h"
+#include "dot_product_simple.h"
+#include "dot_product_sse.h"
+
+#if defined(_avx2_) && defined(_fma_)
+
+#include <util/system/platform.h>
+#include <util/system/compiler.h>
+#include <util/generic/utility.h>
+
+#include <immintrin.h>
+
+namespace {
+ // Returns an i64 whose `n` most significant bits are set and whose remaining bits are clear.
+ // Used below with n in {8, 16, ..., 64}; Bits(64) is all ones.
+ constexpr i64 Bits(int n) {
+ return i64(-1) ^ ((i64(1) << (64 - n)) - 1);
+ }
+
+ // Blend masks for the leading partial block of a vector.
+ // In entry `k` the first `k` lanes are clear and every following lane is set, so
+ // blendv(data, zero, mask[k]) keeps the first `k` elements and zeroes the rest.
+ // The dot-product routines index these tables with a non-zero `length % lanes`.
+ //
+ // 64-bit lanes (double). Only the first four entries are spelled out; the remaining
+ // array elements are value-initialized.
+ constexpr __m256 BlendMask64[8] = {
+ __m256i{Bits(64), Bits(64), Bits(64), Bits(64)},
+ __m256i{0, Bits(64), Bits(64), Bits(64)},
+ __m256i{0, 0, Bits(64), Bits(64)},
+ __m256i{0, 0, 0, Bits(64)},
+ };
+
+ // 32-bit lanes (float / i32): Bits(32) clears the lower 32-bit lane of a 64-bit pair
+ // and sets the upper one.
+ constexpr __m256 BlendMask32[8] = {
+ __m256i{Bits(64), Bits(64), Bits(64), Bits(64)},
+ __m256i{Bits(32), Bits(64), Bits(64), Bits(64)},
+ __m256i{0, Bits(64), Bits(64), Bits(64)},
+ __m256i{0, Bits(32), Bits(64), Bits(64)},
+ __m256i{0, 0, Bits(64), Bits(64)},
+ __m256i{0, 0, Bits(32), Bits(64)},
+ __m256i{0, 0, 0, Bits(64)},
+ __m256i{0, 0, 0, Bits(32)},
+ };
+
+ // 8-bit lanes (i8 / ui8) within a 128-bit register.
+ constexpr __m128 BlendMask8[16] = {
+ __m128i{Bits(64), Bits(64)},
+ __m128i{Bits(56), Bits(64)},
+ __m128i{Bits(48), Bits(64)},
+ __m128i{Bits(40), Bits(64)},
+ __m128i{Bits(32), Bits(64)},
+ __m128i{Bits(24), Bits(64)},
+ __m128i{Bits(16), Bits(64)},
+ __m128i{Bits(8), Bits(64)},
+ __m128i{0, Bits(64)},
+ __m128i{0, Bits(56)},
+ __m128i{0, Bits(48)},
+ __m128i{0, Bits(40)},
+ __m128i{0, Bits(32)},
+ __m128i{0, Bits(24)},
+ __m128i{0, Bits(16)},
+ __m128i{0, Bits(8)},
+ };
+
+ // See https://stackoverflow.com/a/60109639
+ // Horizontal sum of eight i32 values in an avx register
+ i32 HsumI32(__m256i v) {
+ // Fold the upper 128-bit half onto the lower one: 8 partial sums -> 4.
+ __m128i x = _mm_add_epi32(_mm256_castsi256_si128(v), _mm256_extracti128_si256(v, 1));
+ // Fold the upper 64 bits onto the lower 64 bits: 4 -> 2.
+ __m128i hi64 = _mm_unpackhi_epi64(x, x);
+ __m128i sum64 = _mm_add_epi32(hi64, x);
+ // Swap adjacent 32-bit elements and add: 2 -> 1; the total ends up in element 0.
+ __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
+ __m128i sum32 = _mm_add_epi32(sum64, hi32);
+ return _mm_cvtsi128_si32(sum32);
+ }
+
+ // Horizontal sum of four i64 values in an avx register
+ i64 HsumI64(__m256i v) {
+ // Add the two 128-bit halves, then add the two remaining 64-bit elements in scalar code.
+ __m128i x = _mm_add_epi64(_mm256_castsi256_si128(v), _mm256_extracti128_si256(v, 1));
+ return _mm_cvtsi128_si64(x) + _mm_extract_epi64(x, 1);
+ }
+
+ // Horizontal sum of eight float values in an avx register
+ float HsumFloat(__m256 v) {
+ // `y` is `v` with its two 128-bit halves swapped, so after the add each half
+ // holds the four pairwise sums of the original halves.
+ __m256 y = _mm256_permute2f128_ps(v, v, 1);
+ v = _mm256_add_ps(v, y);
+ // Two horizontal adds reduce the four sums to one; element 0 is returned.
+ v = _mm256_hadd_ps(v, v);
+ return _mm256_cvtss_f32(_mm256_hadd_ps(v, v));
+ }
+
+ // Horizontal sum of four double values in an avx register.
+ // The parameter is typed as __m256d (the type the caller passes and the type every
+ // intrinsic below expects), so no implicit float<->double vector conversion is needed.
+ double HsumDouble(__m256d v) {
+     // Add the two 128-bit halves: 4 partial sums -> 2.
+     __m128d x = _mm_add_pd(_mm256_castpd256_pd128(v), _mm256_extractf128_pd(v, 1));
+     // Swap the two remaining elements and add; element 0 holds the total.
+     x = _mm_add_pd(x, _mm_shuffle_pd(x, x, 1));
+     return _mm_cvtsd_f64(x);
+ }
+
+ // Unaligned load of 16 bytes starting at `ptr`.
+ __m128i Load128i(const void* ptr) {
+     const auto* source = reinterpret_cast<const __m128i*>(ptr);
+     return _mm_loadu_si128(source);
+ }
+
+ // Unaligned load of 32 bytes starting at `ptr`.
+ __m256i Load256i(const void* ptr) {
+     const auto* source = reinterpret_cast<const __m256i*>(ptr);
+     return _mm256_loadu_si256(source);
+ }
+
+ // Unrolled dot product for relatively small sizes
+ // The loop with known upper bound is unrolled by the compiler, no need to do anything special about it
+ // `size` is the element count (a multiple of 16); `extend` widens sixteen 8-bit values
+ // in a __m128i to sixteen 16-bit values in a __m256i.
+ template <size_t size, class TInput, class TExtend>
+ i32 DotProductInt8Avx2_Unroll(const TInput* lhs, const TInput* rhs, TExtend extend) noexcept {
+     static_assert(size % 16 == 0);
+     // Integer accumulator: use the integer zero so that no implicit float<->int
+     // vector conversion is required.
+     __m256i sum = _mm256_setzero_si256();
+     for (size_t i = 0; i != size; i += 16) {
+         // madd multiplies 16-bit pairs and adds neighbours into eight 32-bit sums.
+         sum = _mm256_add_epi32(sum, _mm256_madd_epi16(extend(Load128i(lhs + i)), extend(Load128i(rhs + i))));
+     }
+
+     return HsumI32(sum);
+ }
+
+ // Dot product of `length` 8-bit values. `extend` widens sixteen 8-bit values in a
+ // __m128i to sixteen 16-bit values in a __m256i (sign or zero extension, chosen by
+ // the caller). Callers must pass length >= 16: the leading partial block is read
+ // with a full 16-byte load and the unwanted tail lanes are masked to zero.
+ template <class TInput, class TExtend>
+ i32 DotProductInt8Avx2(const TInput* lhs, const TInput* rhs, size_t length, TExtend extend) noexcept {
+     // Fully unrolled versions for small multiples of 16
+     switch (length) {
+         case 16: return DotProductInt8Avx2_Unroll<16>(lhs, rhs, extend);
+         case 32: return DotProductInt8Avx2_Unroll<32>(lhs, rhs, extend);
+         case 48: return DotProductInt8Avx2_Unroll<48>(lhs, rhs, extend);
+         case 64: return DotProductInt8Avx2_Unroll<64>(lhs, rhs, extend);
+     }
+
+     // Integer accumulator of eight 32-bit partial sums.
+     __m256i sum = _mm256_setzero_si256();
+
+     if (const auto leftover = length % 16; leftover != 0) {
+         // Keep the first `leftover` bytes and zero the rest; the zeroed bytes are
+         // re-read (and counted once) by the full blocks that follow.
+         const __m128i zero = _mm_setzero_si128();
+         const __m128i tailMask = _mm_castps_si128(BlendMask8[leftover]);
+         const __m128i a = _mm_blendv_epi8(Load128i(lhs), zero, tailMask);
+         const __m128i b = _mm_blendv_epi8(Load128i(rhs), zero, tailMask);
+
+         sum = _mm256_madd_epi16(extend(a), extend(b));
+
+         lhs += leftover;
+         rhs += leftover;
+         length -= leftover;
+     }
+
+     // Main loop: two 16-element blocks per iteration.
+     while (length >= 32) {
+         const auto l0 = extend(Load128i(lhs));
+         const auto r0 = extend(Load128i(rhs));
+         const auto l1 = extend(Load128i(lhs + 16));
+         const auto r1 = extend(Load128i(rhs + 16));
+
+         const auto s0 = _mm256_madd_epi16(l0, r0);
+         const auto s1 = _mm256_madd_epi16(l1, r1);
+
+         sum = _mm256_add_epi32(sum, _mm256_add_epi32(s0, s1));
+
+         lhs += 32;
+         rhs += 32;
+         length -= 32;
+     }
+
+     // `length` is a multiple of 16 here, so at most one 16-element block remains.
+     if (length > 0) {
+         auto l = extend(Load128i(lhs));
+         auto r = extend(Load128i(rhs));
+
+         sum = _mm256_add_epi32(sum, _mm256_madd_epi16(l, r));
+     }
+
+     return HsumI32(sum);
+ }
+}
+
+// AVX2 dot product of signed 8-bit vectors.
+// Inputs shorter than 16 elements are delegated to the SSE implementation; longer ones
+// go through DotProductInt8Avx2 with each 8-bit value sign-extended to 16 bits.
+i32 DotProductAvx2(const i8* lhs, const i8* rhs, size_t length) noexcept {
+ if (length < 16) {
+ return DotProductSse(lhs, rhs, length);
+ }
+ return DotProductInt8Avx2(lhs, rhs, length, [](const __m128i x) {
+ return _mm256_cvtepi8_epi16(x);
+ });
+}
+
+// AVX2 dot product of unsigned 8-bit vectors.
+// Inputs shorter than 16 elements are delegated to the SSE implementation; longer ones
+// go through DotProductInt8Avx2 with each 8-bit value zero-extended to 16 bits.
+ui32 DotProductAvx2(const ui8* lhs, const ui8* rhs, size_t length) noexcept {
+ if (length < 16) {
+ return DotProductSse(lhs, rhs, length);
+ }
+ return DotProductInt8Avx2(lhs, rhs, length, [](const __m128i x) {
+ return _mm256_cvtepu8_epi16(x);
+ });
+}
+
+// AVX2 dot product of i32 vectors with 64-bit accumulation.
+// Inputs shorter than 16 elements are delegated to the SSE implementation.
+i64 DotProductAvx2(const i32* lhs, const i32* rhs, size_t length) noexcept {
+    if (length < 16) {
+        return DotProductSse(lhs, rhs, length);
+    }
+    // Four 64-bit partial sums.
+    __m256i res = _mm256_setzero_si256();
+
+    if (const auto leftover = length % 8; leftover != 0) {
+        // Use floating-point blendv purely as a 32-bit lane select: keep the first
+        // `leftover` elements, zero the rest. The explicit casts are bit-preserving
+        // reinterpretations between the integer and float register types.
+        const __m256 zero = _mm256_setzero_ps();
+        __m256i a = _mm256_castps_si256(
+            _mm256_blendv_ps(_mm256_castsi256_ps(Load256i(lhs)), zero, BlendMask32[leftover]));
+        __m256i b = _mm256_castps_si256(
+            _mm256_blendv_ps(_mm256_castsi256_ps(Load256i(rhs)), zero, BlendMask32[leftover]));
+
+        // Products of the even-indexed elements (low 32 bits of every 64-bit lane) ...
+        res = _mm256_mul_epi32(a, b);
+        // ... then rotate by 4 bytes within each 128-bit half to bring the odd-indexed
+        // elements into the low positions and multiply those.
+        a = _mm256_alignr_epi8(a, a, 4);
+        b = _mm256_alignr_epi8(b, b, 4);
+        res = _mm256_add_epi64(_mm256_mul_epi32(a, b), res);
+
+        lhs += leftover;
+        rhs += leftover;
+        length -= leftover;
+    }
+
+    while (length >= 8) {
+        __m256i a = Load256i(lhs);
+        __m256i b = Load256i(rhs);
+        res = _mm256_add_epi64(_mm256_mul_epi32(a, b), res); // This is lower parts multiplication
+        a = _mm256_alignr_epi8(a, a, 4);
+        b = _mm256_alignr_epi8(b, b, 4);
+        res = _mm256_add_epi64(_mm256_mul_epi32(a, b), res);
+        rhs += 8;
+        lhs += 8;
+        length -= 8;
+    }
+
+    return HsumI64(res);
+}
+
+// AVX2 + FMA dot product of float vectors.
+// Inputs shorter than 16 elements are delegated to the SSE implementation.
+float DotProductAvx2(const float* lhs, const float* rhs, size_t length) noexcept {
+ if (length < 16) {
+ return DotProductSse(lhs, rhs, length);
+ }
+ // Two independent accumulators of eight partial sums each.
+ __m256 sum1 = _mm256_setzero_ps();
+ __m256 sum2 = _mm256_setzero_ps();
+ __m256 a1, b1, a2, b2;
+
+ // Leading partial block: load eight floats, keep the first `leftover` and zero the
+ // rest, then advance by `leftover` only so the zeroed elements are re-read below.
+ if (const auto leftover = length % 8; leftover != 0) {
+ a1 = _mm256_blendv_ps(
+ _mm256_loadu_ps(lhs), _mm256_setzero_ps(), BlendMask32[leftover]);
+ b1 = _mm256_blendv_ps(
+ _mm256_loadu_ps(rhs), _mm256_setzero_ps(), BlendMask32[leftover]);
+ sum1 = _mm256_mul_ps(a1, b1);
+ lhs += leftover;
+ rhs += leftover;
+ length -= leftover;
+ }
+
+ // Main loop: sixteen elements per iteration, eight per accumulator.
+ while (length >= 16) {
+ a1 = _mm256_loadu_ps(lhs);
+ b1 = _mm256_loadu_ps(rhs);
+ a2 = _mm256_loadu_ps(lhs + 8);
+ b2 = _mm256_loadu_ps(rhs + 8);
+
+ sum1 = _mm256_fmadd_ps(a1, b1, sum1);
+ sum2 = _mm256_fmadd_ps(a2, b2, sum2);
+
+ length -= 16;
+ lhs += 16;
+ rhs += 16;
+ }
+
+ // `length` is a multiple of 8 here, so at most one block of eight remains.
+ if (length > 0) {
+ a1 = _mm256_loadu_ps(lhs);
+ b1 = _mm256_loadu_ps(rhs);
+ sum1 = _mm256_fmadd_ps(a1, b1, sum1);
+ }
+
+ return HsumFloat(_mm256_add_ps(sum1, sum2));
+}
+
+// AVX2 + FMA dot product of double vectors.
+// Inputs shorter than 16 elements are delegated to the SSE implementation.
+double DotProductAvx2(const double* lhs, const double* rhs, size_t length) noexcept {
+    if (length < 16) {
+        return DotProductSse(lhs, rhs, length);
+    }
+    // Two independent accumulators of four partial sums each.
+    __m256d sum1 = _mm256_setzero_pd();
+    __m256d sum2 = _mm256_setzero_pd();
+    __m256d a1, b1, a2, b2;
+
+    // Leading partial block: load four doubles, keep the first `leftover` and zero the
+    // rest, then advance by `leftover` only so the zeroed elements are re-read below.
+    if (const auto leftover = length % 4; leftover != 0) {
+        // Double-typed zero and mask, so blendv_pd receives the operand types it declares
+        // (the cast is a bit-preserving reinterpretation of the mask table entry).
+        const __m256d zero = _mm256_setzero_pd();
+        const __m256d tailMask = _mm256_castps_pd(BlendMask64[leftover]);
+        a1 = _mm256_blendv_pd(_mm256_loadu_pd(lhs), zero, tailMask);
+        b1 = _mm256_blendv_pd(_mm256_loadu_pd(rhs), zero, tailMask);
+        sum1 = _mm256_mul_pd(a1, b1);
+        lhs += leftover;
+        rhs += leftover;
+        length -= leftover;
+    }
+
+    // Main loop: eight elements per iteration, four per accumulator.
+    while (length >= 8) {
+        a1 = _mm256_loadu_pd(lhs);
+        b1 = _mm256_loadu_pd(rhs);
+        a2 = _mm256_loadu_pd(lhs + 4);
+        b2 = _mm256_loadu_pd(rhs + 4);
+
+        sum1 = _mm256_fmadd_pd(a1, b1, sum1);
+        sum2 = _mm256_fmadd_pd(a2, b2, sum2);
+
+        length -= 8;
+        lhs += 8;
+        rhs += 8;
+    }
+
+    // `length` is a multiple of 4 here, so at most one block of four remains.
+    if (length > 0) {
+        a1 = _mm256_loadu_pd(lhs);
+        b1 = _mm256_loadu_pd(rhs);
+        sum1 = _mm256_fmadd_pd(a1, b1, sum1);
+    }
+
+    return HsumDouble(_mm256_add_pd(sum1, sum2));
+}
+
+#elif defined(ARCADIA_SSE)
+
+// Built when `_avx2_` and `_fma_` are not both defined but ARCADIA_SSE is:
+// every DotProductAvx2 overload simply forwards to the SSE implementation.
+i32 DotProductAvx2(const i8* lhs, const i8* rhs, size_t length) noexcept {
+ return DotProductSse(lhs, rhs, length);
+}
+
+ui32 DotProductAvx2(const ui8* lhs, const ui8* rhs, size_t length) noexcept {
+ return DotProductSse(lhs, rhs, length);
+}
+
+i64 DotProductAvx2(const i32* lhs, const i32* rhs, size_t length) noexcept {
+ return DotProductSse(lhs, rhs, length);
+}
+
+float DotProductAvx2(const float* lhs, const float* rhs, size_t length) noexcept {
+ return DotProductSse(lhs, rhs, length);
+}
+
+double DotProductAvx2(const double* lhs, const double* rhs, size_t length) noexcept {
+ return DotProductSse(lhs, rhs, length);
+}
+
+#else
+
+// Built when neither the AVX2+FMA branch nor ARCADIA_SSE is available:
+// every DotProductAvx2 overload forwards to the plain DotProductSimple implementation.
+i32 DotProductAvx2(const i8* lhs, const i8* rhs, size_t length) noexcept {
+ return DotProductSimple(lhs, rhs, length);
+}
+
+ui32 DotProductAvx2(const ui8* lhs, const ui8* rhs, size_t length) noexcept {
+ return DotProductSimple(lhs, rhs, length);
+}
+
+i64 DotProductAvx2(const i32* lhs, const i32* rhs, size_t length) noexcept {
+ return DotProductSimple(lhs, rhs, length);
+}
+
+float DotProductAvx2(const float* lhs, const float* rhs, size_t length) noexcept {
+ return DotProductSimple(lhs, rhs, length);
+}
+
+double DotProductAvx2(const double* lhs, const double* rhs, size_t length) noexcept {
+ return DotProductSimple(lhs, rhs, length);
+}
+
+#endif
diff --git a/library/cpp/dot_product/dot_product_avx2.h b/library/cpp/dot_product/dot_product_avx2.h
new file mode 100644
index 00000000000..715f151f448
--- /dev/null
+++ b/library/cpp/dot_product/dot_product_avx2.h
@@ -0,0 +1,19 @@
+#pragma once
+
+#include <util/system/types.h>
+#include <util/system/compiler.h>
+
+Y_PURE_FUNCTION
+i32 DotProductAvx2(const i8* lhs, const i8* rhs, size_t length) noexcept;
+
+Y_PURE_FUNCTION
+ui32 DotProductAvx2(const ui8* lhs, const ui8* rhs, size_t length) noexcept;
+
+Y_PURE_FUNCTION
+i64 DotProductAvx2(const i32* lhs, const i32* rhs, size_t length) noexcept;
+
+Y_PURE_FUNCTION
+float DotProductAvx2(const float* lhs, const float* rhs, size_t length) noexcept;
+
+Y_PURE_FUNCTION
+double DotProductAvx2(const double* lhs, const double* rhs, size_t length) noexcept;
diff --git a/library/cpp/dot_product/dot_product_simple.cpp b/library/cpp/dot_product/dot_product_simple.cpp
new file mode 100644
index 00000000000..02891c8a228
--- /dev/null
+++ b/library/cpp/dot_product/dot_product_simple.cpp
@@ -0,0 +1,44 @@
+#include "dot_product_simple.h"
+
+namespace {
+    // Scalar dot product with four independent accumulators.
+    // Elements are converted to `Res` before multiplying. Full groups of four feed
+    // accumulators 0..3 in order; the remaining tail elements are added to accumulator 0.
+    // The result is acc0 + acc1 + acc2 + acc3, summed left to right.
+    template <typename Res, typename Number>
+    static Res DotProductSimpleImpl(const Number* lhs, const Number* rhs, size_t length) noexcept {
+        Res acc[4] = {};
+
+        const size_t groupedEnd = length - length % 4;
+        size_t idx = 0;
+        for (; idx < groupedEnd; idx += 4) {
+            acc[0] += static_cast<Res>(lhs[idx]) * static_cast<Res>(rhs[idx]);
+            acc[1] += static_cast<Res>(lhs[idx + 1]) * static_cast<Res>(rhs[idx + 1]);
+            acc[2] += static_cast<Res>(lhs[idx + 2]) * static_cast<Res>(rhs[idx + 2]);
+            acc[3] += static_cast<Res>(lhs[idx + 3]) * static_cast<Res>(rhs[idx + 3]);
+        }
+
+        for (; idx < length; ++idx) {
+            acc[0] += static_cast<Res>(lhs[idx]) * static_cast<Res>(rhs[idx]);
+        }
+
+        return acc[0] + acc[1] + acc[2] + acc[3];
+    }
+}
+
+float DotProductSimple(const float* lhs, const float* rhs, size_t length) noexcept {
+ return DotProductSimpleImpl<float, float>(lhs, rhs, length);
+}
+
+double DotProductSimple(const double* lhs, const double* rhs, size_t length) noexcept {
+ return DotProductSimpleImpl<double, double>(lhs, rhs, length);
+}
+
+// Dot product of two vectors of unsigned 4-bit values packed two per byte.
+// For every byte the product of the low nibbles and the product of the high
+// nibbles are both added to the result; `lengtInBytes` is the number of bytes.
+ui32 DotProductUI4Simple(const ui8* lhs, const ui8* rhs, size_t lengtInBytes) noexcept {
+    ui32 total = 0;
+    for (size_t pos = 0; pos != lengtInBytes; ++pos) {
+        const ui32 lhsLow = lhs[pos] & 0x0f;
+        const ui32 rhsLow = rhs[pos] & 0x0f;
+        const ui32 lhsHigh = lhs[pos] >> 4;
+        const ui32 rhsHigh = rhs[pos] >> 4;
+        total += lhsLow * rhsLow;
+        total += lhsHigh * rhsHigh;
+    }
+    return total;
+}
diff --git a/library/cpp/dot_product/dot_product_simple.h b/library/cpp/dot_product/dot_product_simple.h
new file mode 100644
index 00000000000..dd13dd7592c
--- /dev/null
+++ b/library/cpp/dot_product/dot_product_simple.h
@@ -0,0 +1,40 @@
+#pragma once
+
+#include <util/system/compiler.h>
+#include <util/system/types.h>
+
+#include <numeric>
+
+/**
+ * Dot product implementation without SSE optimizations.
+ */
+// Integer overloads built on std::inner_product. The accumulator is wider than the
+// elements (ui32 / i32 / i64). In the 8-bit overloads the lambda parameters use 16-bit
+// types: the product of two 8-bit values always fits into 16 bits of matching signedness.
+Y_PURE_FUNCTION
+inline ui32 DotProductSimple(const ui8* lhs, const ui8* rhs, size_t length) noexcept {
+ return std::inner_product(lhs, lhs + length, rhs, static_cast<ui32>(0u),
+ [](ui32 x1, ui16 x2) {return x1 + x2;},
+ [](ui16 x1, ui8 x2) {return x1 * x2;});
+}
+
+Y_PURE_FUNCTION
+inline i32 DotProductSimple(const i8* lhs, const i8* rhs, size_t length) noexcept {
+ return std::inner_product(lhs, lhs + length, rhs, static_cast<i32>(0),
+ [](i32 x1, i16 x2) {return x1 + x2;},
+ [](i16 x1, i8 x2) {return x1 * x2;});
+}
+
+// The first factor is widened to i64 by the lambda parameter, so the multiplication
+// itself is carried out in 64 bits.
+Y_PURE_FUNCTION
+inline i64 DotProductSimple(const i32* lhs, const i32* rhs, size_t length) noexcept {
+ return std::inner_product(lhs, lhs + length, rhs, static_cast<i64>(0),
+ [](i64 x1, i64 x2) {return x1 + x2;},
+ [](i64 x1, i32 x2) {return x1 * x2;});
+}
+
+Y_PURE_FUNCTION
+float DotProductSimple(const float* lhs, const float* rhs, size_t length) noexcept;
+
+Y_PURE_FUNCTION
+double DotProductSimple(const double* lhs, const double* rhs, size_t length) noexcept;
+
+Y_PURE_FUNCTION
+ui32 DotProductUI4Simple(const ui8* lhs, const ui8* rhs, size_t lengtInBytes) noexcept;
+
diff --git a/library/cpp/dot_product/dot_product_sse.cpp b/library/cpp/dot_product/dot_product_sse.cpp
new file mode 100644
index 00000000000..5256cfe98af
--- /dev/null
+++ b/library/cpp/dot_product/dot_product_sse.cpp
@@ -0,0 +1,219 @@
+#include "dot_product_sse.h"
+
+#include <library/cpp/sse/sse.h>
+#include <util/system/platform.h>
+#include <util/system/compiler.h>
+
+#ifdef ARCADIA_SSE
+// SSE dot product of signed 8-bit vectors: 16 elements per iteration in SIMD,
+// the remaining (< 16) elements in a scalar loop.
+i32 DotProductSse(const i8* lhs, const i8* rhs, size_t length) noexcept {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i resVec = zero;
+ while (length >= 16) {
+ __m128i lVec = _mm_loadu_si128((const __m128i*)lhs);
+ __m128i rVec = _mm_loadu_si128((const __m128i*)rhs);
+
+#ifdef _sse4_1_
+ // Sign-extend the low 8 bytes directly; alignr by 8 brings the high 8 bytes down first.
+ __m128i lLo = _mm_cvtepi8_epi16(lVec);
+ __m128i rLo = _mm_cvtepi8_epi16(rVec);
+ __m128i lHi = _mm_cvtepi8_epi16(_mm_alignr_epi8(lVec, lVec, 8));
+ __m128i rHi = _mm_cvtepi8_epi16(_mm_alignr_epi8(rVec, rVec, 8));
+#else
+ // Without SSE4.1: place each byte in the high half of a 16-bit lane, then an
+ // arithmetic shift right by 8 produces the sign-extended value.
+ __m128i lLo = _mm_srai_epi16(_mm_unpacklo_epi8(zero, lVec), 8);
+ __m128i rLo = _mm_srai_epi16(_mm_unpacklo_epi8(zero, rVec), 8);
+ __m128i lHi = _mm_srai_epi16(_mm_unpackhi_epi8(zero, lVec), 8);
+ __m128i rHi = _mm_srai_epi16(_mm_unpackhi_epi8(zero, rVec), 8);
+#endif
+ // madd multiplies 16-bit pairs and adds neighbours into four 32-bit sums.
+ resVec = _mm_add_epi32(resVec,
+ _mm_add_epi32(_mm_madd_epi16(lLo, rLo), _mm_madd_epi16(lHi, rHi)));
+
+ lhs += 16;
+ rhs += 16;
+ length -= 16;
+ }
+
+ // Horizontal sum of the four lanes, then the scalar tail.
+ alignas(16) i32 res[4];
+ _mm_store_si128((__m128i*)res, resVec);
+ i32 sum = res[0] + res[1] + res[2] + res[3];
+ for (size_t i = 0; i < length; ++i) {
+ sum += static_cast<i32>(lhs[i]) * static_cast<i32>(rhs[i]);
+ }
+
+ return sum;
+}
+
+// SSE dot product of unsigned 8-bit vectors: 16 elements per iteration in SIMD,
+// the remaining (< 16) elements in a scalar loop. The result is accumulated modulo 2^32.
+ui32 DotProductSse(const ui8* lhs, const ui8* rhs, size_t length) noexcept {
+    const __m128i zero = _mm_setzero_si128();
+    __m128i resVec = zero;
+    while (length >= 16) {
+        __m128i lVec = _mm_loadu_si128((const __m128i*)lhs);
+        __m128i rVec = _mm_loadu_si128((const __m128i*)rhs);
+
+        // Zero-extend the bytes to 16 bits by interleaving them with zero.
+        __m128i lLo = _mm_unpacklo_epi8(lVec, zero);
+        __m128i rLo = _mm_unpacklo_epi8(rVec, zero);
+        __m128i lHi = _mm_unpackhi_epi8(lVec, zero);
+        __m128i rHi = _mm_unpackhi_epi8(rVec, zero);
+
+        // madd multiplies 16-bit pairs and adds neighbours into four 32-bit sums.
+        resVec = _mm_add_epi32(resVec,
+                               _mm_add_epi32(_mm_madd_epi16(lLo, rLo), _mm_madd_epi16(lHi, rHi)));
+
+        lhs += 16;
+        rhs += 16;
+        length -= 16;
+    }
+
+    // Finish in unsigned arithmetic: the function returns ui32, and for long inputs the
+    // lane totals can exceed the i32 range, where signed addition would be undefined
+    // behaviour. Unsigned addition wraps modulo 2^32, matching the SIMD lanes.
+    alignas(16) ui32 res[4];
+    _mm_store_si128((__m128i*)res, resVec);
+    ui32 sum = res[0] + res[1] + res[2] + res[3];
+    for (size_t i = 0; i < length; ++i) {
+        sum += static_cast<ui32>(lhs[i]) * static_cast<ui32>(rhs[i]);
+    }
+
+    return sum;
+}
+#ifdef _sse4_1_
+
+// SSE4.1 dot product of i32 vectors with 64-bit accumulation: four elements per
+// iteration in SIMD, the remaining (< 4) elements in a scalar loop.
+i64 DotProductSse(const i32* lhs, const i32* rhs, size_t length) noexcept {
+ __m128i zero = _mm_setzero_si128();
+ __m128i res = zero;
+
+ while (length >= 4) {
+ __m128i a = _mm_loadu_si128((const __m128i*)lhs);
+ __m128i b = _mm_loadu_si128((const __m128i*)rhs);
+ res = _mm_add_epi64(_mm_mul_epi32(a, b), res); // This is lower parts multiplication
+ // Rotate by 4 bytes so the odd-indexed elements move into the positions mul_epi32 reads.
+ a = _mm_alignr_epi8(a, a, 4);
+ b = _mm_alignr_epi8(b, b, 4);
+ res = _mm_add_epi64(_mm_mul_epi32(a, b), res);
+ rhs += 4;
+ lhs += 4;
+ length -= 4;
+ }
+
+ // Horizontal sum of the two 64-bit lanes, then the scalar tail.
+ alignas(16) i64 r[2];
+ _mm_store_si128((__m128i*)r, res);
+ i64 sum = r[0] + r[1];
+
+ for (size_t i = 0; i < length; ++i) {
+ sum += static_cast<i64>(lhs[i]) * static_cast<i64>(rhs[i]);
+ }
+
+ return sum;
+}
+
+#else
+#include "dot_product_simple.h"
+
+i64 DotProductSse(const i32* lhs, const i32* rhs, size_t length) noexcept {
+ return DotProductSimple(lhs, rhs, length);
+}
+
+#endif
+
+// SSE dot product of float vectors: eight elements per iteration with two
+// accumulators, then one optional block of four, then a 1..3 element tail.
+float DotProductSse(const float* lhs, const float* rhs, size_t length) noexcept {
+ __m128 sum1 = _mm_setzero_ps();
+ __m128 sum2 = _mm_setzero_ps();
+ __m128 a1, b1, a2, b2, m1, m2;
+
+ // Loads, multiplies and adds of the two halves are interleaved in this loop.
+ while (length >= 8) {
+ a1 = _mm_loadu_ps(lhs);
+ b1 = _mm_loadu_ps(rhs);
+ m1 = _mm_mul_ps(a1, b1);
+
+ a2 = _mm_loadu_ps(lhs + 4);
+ sum1 = _mm_add_ps(sum1, m1);
+
+ b2 = _mm_loadu_ps(rhs + 4);
+ m2 = _mm_mul_ps(a2, b2);
+
+ sum2 = _mm_add_ps(sum2, m2);
+
+ length -= 8;
+ lhs += 8;
+ rhs += 8;
+ }
+
+ if (length >= 4) {
+ a1 = _mm_loadu_ps(lhs);
+ b1 = _mm_loadu_ps(rhs);
+ sum1 = _mm_add_ps(sum1, _mm_mul_ps(a1, b1));
+
+ length -= 4;
+ lhs += 4;
+ rhs += 4;
+ }
+
+ sum1 = _mm_add_ps(sum1, sum2);
+
+ // Tail of 1..3 elements: build zero-padded vectors from individual elements so
+ // that nothing past the end of the arrays is read.
+ if (length) {
+ switch (length) {
+ case 3:
+ a1 = _mm_set_ps(0.0f, lhs[2], lhs[1], lhs[0]);
+ b1 = _mm_set_ps(0.0f, rhs[2], rhs[1], rhs[0]);
+ break;
+
+ case 2:
+ a1 = _mm_set_ps(0.0f, 0.0f, lhs[1], lhs[0]);
+ b1 = _mm_set_ps(0.0f, 0.0f, rhs[1], rhs[0]);
+ break;
+
+ case 1:
+ a1 = _mm_set_ps(0.0f, 0.0f, 0.0f, lhs[0]);
+ b1 = _mm_set_ps(0.0f, 0.0f, 0.0f, rhs[0]);
+ break;
+
+ default:
+ Y_UNREACHABLE();
+ }
+
+ sum1 = _mm_add_ps(sum1, _mm_mul_ps(a1, b1));
+ }
+
+ // Horizontal sum of the four lanes.
+ alignas(16) float res[4];
+ _mm_store_ps(res, sum1);
+
+ return res[0] + res[1] + res[2] + res[3];
+}
+
+// SSE dot product of double vectors: four elements per iteration with two
+// accumulators, then one optional block of two, then an optional single element.
+double DotProductSse(const double* lhs, const double* rhs, size_t length) noexcept {
+ __m128d sum1 = _mm_setzero_pd();
+ __m128d sum2 = _mm_setzero_pd();
+ __m128d a1, b1, a2, b2;
+
+ while (length >= 4) {
+ a1 = _mm_loadu_pd(lhs);
+ b1 = _mm_loadu_pd(rhs);
+ sum1 = _mm_add_pd(sum1, _mm_mul_pd(a1, b1));
+
+ a2 = _mm_loadu_pd(lhs + 2);
+ b2 = _mm_loadu_pd(rhs + 2);
+ sum2 = _mm_add_pd(sum2, _mm_mul_pd(a2, b2));
+
+ length -= 4;
+ lhs += 4;
+ rhs += 4;
+ }
+
+ if (length >= 2) {
+ a1 = _mm_loadu_pd(lhs);
+ b1 = _mm_loadu_pd(rhs);
+ sum1 = _mm_add_pd(sum1, _mm_mul_pd(a1, b1));
+
+ length -= 2;
+ lhs += 2;
+ rhs += 2;
+ }
+
+ sum1 = _mm_add_pd(sum1, sum2);
+
+ // Last single element: _mm_set_pd takes the high lane first, so the element goes
+ // to the high lane and the low lane is zero.
+ if (length > 0) {
+ a1 = _mm_set_pd(lhs[0], 0.0);
+ b1 = _mm_set_pd(rhs[0], 0.0);
+ sum1 = _mm_add_pd(sum1, _mm_mul_pd(a1, b1));
+ }
+
+ // Horizontal sum of the two lanes.
+ alignas(16) double res[2];
+ _mm_store_pd(res, sum1);
+
+ return res[0] + res[1];
+}
+
+#endif // ARCADIA_SSE
diff --git a/library/cpp/dot_product/dot_product_sse.h b/library/cpp/dot_product/dot_product_sse.h
new file mode 100644
index 00000000000..814736007d0
--- /dev/null
+++ b/library/cpp/dot_product/dot_product_sse.h
@@ -0,0 +1,19 @@
+#pragma once
+
+#include <util/system/types.h>
+#include <util/system/compiler.h>
+
+Y_PURE_FUNCTION
+i32 DotProductSse(const i8* lhs, const i8* rhs, size_t length) noexcept;
+
+Y_PURE_FUNCTION
+ui32 DotProductSse(const ui8* lhs, const ui8* rhs, size_t length) noexcept;
+
+Y_PURE_FUNCTION
+i64 DotProductSse(const i32* lhs, const i32* rhs, size_t length) noexcept;
+
+Y_PURE_FUNCTION
+float DotProductSse(const float* lhs, const float* rhs, size_t length) noexcept;
+
+Y_PURE_FUNCTION
+double DotProductSse(const double* lhs, const double* rhs, size_t length) noexcept;
diff --git a/library/cpp/dot_product/ya.make b/library/cpp/dot_product/ya.make
new file mode 100644
index 00000000000..b308967b4be
--- /dev/null
+++ b/library/cpp/dot_product/ya.make
@@ -0,0 +1,20 @@
+LIBRARY()
+
+SRCS(
+ dot_product.cpp
+ dot_product_sse.cpp
+ dot_product_simple.cpp
+)
+
+IF (USE_SSE4 == "yes" AND OS_LINUX == "yes")
+ SRC_C_AVX2(dot_product_avx2.cpp -mfma)
+ELSE()
+ SRC(dot_product_avx2.cpp)
+ENDIF()
+
+PEERDIR(
+ library/cpp/sse
+ library/cpp/testing/common
+)
+
+END()
diff --git a/library/cpp/tld/tlds-alpha-by-domain.txt b/library/cpp/tld/tlds-alpha-by-domain.txt
index f723a347349..a9ad25f62a2 100644
--- a/library/cpp/tld/tlds-alpha-by-domain.txt
+++ b/library/cpp/tld/tlds-alpha-by-domain.txt
@@ -1,4 +1,4 @@
-# Version 2024032200, Last Updated Fri Mar 22 07:07:01 2024 UTC
+# Version 2024032400, Last Updated Sun Mar 24 07:07:02 2024 UTC
AAA
AARP
ABB
diff --git a/ya b/ya
index ea638f295f6..dce09e129f3 100755
--- a/ya
+++ b/ya
@@ -39,33 +39,33 @@ REGISTRY_ENDPOINT = os.environ.get("YA_REGISTRY_ENDPOINT", "https://devtools-reg
PLATFORM_MAP = {
"data": {
"darwin": {
- "md5": "07caee915ae4e1e6fedb9ff9998352f7",
+ "md5": "7ef94d3d94718cee2994ba26ddbd1e9b",
"urls": [
- f"{REGISTRY_ENDPOINT}/6033068511"
+ f"{REGISTRY_ENDPOINT}/6052444589"
]
},
"darwin-arm64": {
- "md5": "4bfc77ab0b0da49f22059c953128e800",
+ "md5": "8b8318e6ac887bd6d10767e77a98f11c",
"urls": [
- f"{REGISTRY_ENDPOINT}/6033067202"
+ f"{REGISTRY_ENDPOINT}/6052442462"
]
},
"linux-aarch64": {
- "md5": "18972f2c4b41f1907b66de1ba85b2185",
+ "md5": "96c0f21d7d46d4c35e13e840f7d61c17",
"urls": [
- f"{REGISTRY_ENDPOINT}/6033065936"
+ f"{REGISTRY_ENDPOINT}/6052440417"
]
},
"win32-clang-cl": {
- "md5": "23bea185276aed4d4c4ccd7238fbaa60",
+ "md5": "af6764b934bd28f11a2f19497bb25c56",
"urls": [
- f"{REGISTRY_ENDPOINT}/6033069998"
+ f"{REGISTRY_ENDPOINT}/6052446212"
]
},
"linux": {
- "md5": "94c9f8f880c86b67c5f780f57a12c325",
+ "md5": "d45402f8465e12e5009aa9fdc1221830",
"urls": [
- f"{REGISTRY_ENDPOINT}/6033071738"
+ f"{REGISTRY_ENDPOINT}/6052447833"
]
}
}
diff --git a/yt/yt/core/actions/future-inl.h b/yt/yt/core/actions/future-inl.h
index 369d6f57fee..3b82586642f 100644
--- a/yt/yt/core/actions/future-inl.h
+++ b/yt/yt/core/actions/future-inl.h
@@ -860,7 +860,11 @@ TFuture<R> ApplyUniqueHelper(TFutureBase<T> this_, TCallback<S> callback)
}
template <class T, class D>
-TFuture<T> ApplyTimeoutHelper(TFutureBase<T> this_, D timeoutOrDeadline, IInvokerPtr invoker)
+TFuture<T> ApplyTimeoutHelper(
+ TFutureBase<T> this_,
+ D timeoutOrDeadline,
+ TFutureTimeoutOptions options,
+ IInvokerPtr invoker)
{
auto promise = NewPromise<T>();
@@ -878,6 +882,9 @@ TFuture<T> ApplyTimeoutHelper(TFutureBase<T> this_, D timeoutOrDeadline, IInvoke
error = error << NYT::TErrorAttribute("deadline", timeoutOrDeadline);
}
}
+ if (!options.Error.IsOK()) {
+ error = options.Error << std::move(error);
+ }
promise.TrySet(error);
cancelable.Cancel(error);
}),
@@ -1125,7 +1132,10 @@ TFuture<T> TFutureBase<T>::ToImmediatelyCancelable() const
}
template <class T>
-TFuture<T> TFutureBase<T>::WithDeadline(TInstant deadline, IInvokerPtr invoker) const
+TFuture<T> TFutureBase<T>::WithDeadline(
+ TInstant deadline,
+ TFutureTimeoutOptions options,
+ IInvokerPtr invoker) const
{
YT_ASSERT(Impl_);
@@ -1133,11 +1143,14 @@ TFuture<T> TFutureBase<T>::WithDeadline(TInstant deadline, IInvokerPtr invoker)
return TFuture<T>(Impl_);
}
- return NYT::NDetail::ApplyTimeoutHelper(*this, deadline, std::move(invoker));
+ return NYT::NDetail::ApplyTimeoutHelper(*this, deadline, std::move(options), std::move(invoker));
}
template <class T>
-TFuture<T> TFutureBase<T>::WithTimeout(TDuration timeout, IInvokerPtr invoker) const
+TFuture<T> TFutureBase<T>::WithTimeout(
+ TDuration timeout,
+ TFutureTimeoutOptions options,
+ IInvokerPtr invoker) const
{
YT_ASSERT(Impl_);
@@ -1145,15 +1158,16 @@ TFuture<T> TFutureBase<T>::WithTimeout(TDuration timeout, IInvokerPtr invoker) c
return TFuture<T>(Impl_);
}
- return NYT::NDetail::ApplyTimeoutHelper(*this, timeout, std::move(invoker));
+ return NYT::NDetail::ApplyTimeoutHelper(*this, timeout, std::move(options), std::move(invoker));
}
template <class T>
TFuture<T> TFutureBase<T>::WithTimeout(
std::optional<TDuration> timeout,
+ TFutureTimeoutOptions options,
IInvokerPtr invoker) const
{
- return timeout ? WithTimeout(*timeout, std::move(invoker)) : TFuture<T>(Impl_);
+ return timeout ? WithTimeout(*timeout, std::move(options), std::move(invoker)) : TFuture<T>(Impl_);
}
template <class T>
diff --git a/yt/yt/core/actions/future.h b/yt/yt/core/actions/future.h
index 6656dd47e94..4baadc2f8cf 100644
--- a/yt/yt/core/actions/future.h
+++ b/yt/yt/core/actions/future.h
@@ -164,6 +164,15 @@ constexpr TFutureCallbackCookie NullFutureCallbackCookie = -1;
////////////////////////////////////////////////////////////////////////////////
+struct TFutureTimeoutOptions
+{
+ //! If set to a non-trivial error, timeout or cancelation errors
+ //! are enveloped into this error.
+ TError Error;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
//! A base class for both TFuture<T> and its specialization TFuture<void>.
/*!
* The resulting value can be accessed by either subscribing (#Subscribe)
@@ -275,13 +284,22 @@ public:
//! Returns a future that is either set to an actual value (if the original one is set in timely manner)
//! or to |EErrorCode::Timeout| (in case the deadline is reached).
//! The timeout event is handled in #invoker (DelayedExecutor is null).
- TFuture<T> WithDeadline(TInstant deadline, IInvokerPtr invoker = nullptr) const;
+ TFuture<T> WithDeadline(
+ TInstant deadline,
+ TFutureTimeoutOptions options = {},
+ IInvokerPtr invoker = nullptr) const;
//! Returns a future that is either set to an actual value (if the original one is set in timely manner)
//! or to |EErrorCode::Timeout| (in case of timeout).
//! The timeout event is handled in #invoker (DelayedExecutor is null).
- TFuture<T> WithTimeout(TDuration timeout, IInvokerPtr invoker = nullptr) const;
- TFuture<T> WithTimeout(std::optional<TDuration> timeout, IInvokerPtr invoker = nullptr) const;
+ TFuture<T> WithTimeout(
+ TDuration timeout,
+ TFutureTimeoutOptions options = {},
+ IInvokerPtr invoker = nullptr) const;
+ TFuture<T> WithTimeout(
+ std::optional<TDuration> timeout,
+ TFutureTimeoutOptions options = {},
+ IInvokerPtr invoker = nullptr) const;
//! Chains the asynchronous computation with another one.
template <class R>
diff --git a/yt/yt/core/rpc/roaming_channel.cpp b/yt/yt/core/rpc/roaming_channel.cpp
index 47c9767c8fc..2c2bccc7337 100644
--- a/yt/yt/core/rpc/roaming_channel.cpp
+++ b/yt/yt/core/rpc/roaming_channel.cpp
@@ -27,7 +27,9 @@ public:
, StartTime_(TInstant::Now())
{
if (Options_.Timeout) {
- asyncChannel = asyncChannel.WithTimeout(*Options_.Timeout);
+ asyncChannel = asyncChannel.WithTimeout(*Options_.Timeout, TFutureTimeoutOptions{
+ .Error = TError("Error getting channel")
+ });
}
asyncChannel.Subscribe(BIND(&TRoamingRequestControl::OnGotChannel, MakeStrong(this)));
diff --git a/yt/yt/core/ytree/ypath_client.cpp b/yt/yt/core/ytree/ypath_client.cpp
index 3185b9ec107..d01b4a7859a 100644
--- a/yt/yt/core/ytree/ypath_client.cpp
+++ b/yt/yt/core/ytree/ypath_client.cpp
@@ -389,7 +389,7 @@ void ExecuteVerb(
SetRequestTargetYPath(requestHeader.get(), suffixPath);
context->SetRequestHeader(std::move(requestHeader));
- // This should never throw or yield.
+ // This should never throw.
suffixService->Invoke(context);
}