author     Devtools Arcadia <arcadia-devtools@yandex-team.ru>  2022-02-07 18:08:42 +0300
committer  Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net>  2022-02-07 18:08:42 +0300
commit     1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
tree       e26c9fed0de5d9873cce7e00bc214573dc2195b7 /contrib/libs/tbb/include
intermediate changes
ref: cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'contrib/libs/tbb/include')
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb.h | 73
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/blocked_range.h | 163
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/blocked_range2d.h | 108
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/blocked_range3d.h | 127
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/blocked_rangeNd.h | 144
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/cache_aligned_allocator.h | 189
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/combinable.h | 69
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/concurrent_hash_map.h | 1524
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/concurrent_lru_cache.h | 364
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/concurrent_map.h | 342
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/concurrent_priority_queue.h | 490
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/concurrent_queue.h | 592
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/concurrent_set.h | 259
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/concurrent_unordered_map.h | 387
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/concurrent_unordered_set.h | 306
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/concurrent_vector.h | 1114
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/detail/_aggregator.h | 173
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/detail/_aligned_space.h | 46
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/detail/_allocator_traits.h | 107
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/detail/_assert.h | 52
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/detail/_concurrent_queue_base.h | 659
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/detail/_concurrent_skip_list.h | 1252
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/detail/_concurrent_unordered_base.h | 1500
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/detail/_config.h | 483
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/detail/_containers_helpers.h | 67
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/detail/_exception.h | 88
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_body_impl.h | 371
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_cache_impl.h | 435
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_impl.h | 488
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_indexer_impl.h | 351
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_item_buffer_impl.h | 279
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_join_impl.h | 1706
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_node_impl.h | 769
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_node_set_impl.h | 265
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_nodes_deduction.h | 277
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_tagged_buffer_impl.h | 256
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_trace_impl.h | 364
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_types_impl.h | 407
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/detail/_hash_compare.h | 127
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/detail/_machine.h | 366
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/detail/_namespace_injection.h | 24
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/detail/_node_handle.h | 162
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/detail/_pipeline_filters.h | 453
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/detail/_pipeline_filters_deduction.h | 46
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/detail/_range_common.h | 76
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/detail/_rtm_mutex.h | 162
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/detail/_rtm_rw_mutex.h | 209
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/detail/_segment_table.h | 563
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/detail/_small_object_pool.h | 108
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/detail/_string_resource.h | 78
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/detail/_task.h | 243
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/detail/_template_helpers.h | 394
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/detail/_utils.h | 329
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/enumerable_thread_specific.h | 1113
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/flow_graph.h | 3221
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/flow_graph_abstractions.h | 51
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/global_control.h | 188
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/info.h | 137
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/memory_pool.h | 272
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/null_mutex.h | 79
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/null_rw_mutex.h | 84
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/parallel_for.h | 416
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/parallel_for_each.h | 644
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/parallel_invoke.h | 227
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/parallel_pipeline.h | 153
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/parallel_reduce.h | 689
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/parallel_scan.h | 590
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/parallel_sort.h | 247
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/partitioner.h | 688
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/profiling.h | 243
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/queuing_mutex.h | 197
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/queuing_rw_mutex.h | 199
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/scalable_allocator.h | 332
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/spin_mutex.h | 179
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/spin_rw_mutex.h | 307
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/task.h | 37
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/task_arena.h | 452
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/task_group.h | 556
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/task_scheduler_observer.h | 116
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/tbb_allocator.h | 126
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/tbbmalloc_proxy.h | 65
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/tick_count.h | 99
-rw-r--r--  contrib/libs/tbb/include/oneapi/tbb/version.h | 108
-rw-r--r--  contrib/libs/tbb/include/tbb/blocked_range.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/blocked_range2d.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/blocked_range3d.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/blocked_rangeNd.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/cache_aligned_allocator.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/combinable.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/concurrent_hash_map.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/concurrent_lru_cache.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/concurrent_map.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/concurrent_priority_queue.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/concurrent_queue.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/concurrent_set.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/concurrent_unordered_map.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/concurrent_unordered_set.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/concurrent_vector.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/enumerable_thread_specific.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/flow_graph.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/flow_graph_abstractions.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/global_control.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/info.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/memory_pool.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/null_mutex.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/null_rw_mutex.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/parallel_for.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/parallel_for_each.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/parallel_invoke.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/parallel_pipeline.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/parallel_reduce.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/parallel_scan.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/parallel_sort.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/partitioner.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/profiling.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/queuing_mutex.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/queuing_rw_mutex.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/scalable_allocator.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/spin_mutex.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/spin_rw_mutex.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/task.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/task_arena.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/task_group.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/task_scheduler_observer.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/tbb.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/tbb_allocator.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/tbbmalloc_proxy.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/tick_count.h | 17
-rw-r--r--  contrib/libs/tbb/include/tbb/version.h | 17
129 files changed, 32583 insertions, 0 deletions
diff --git a/contrib/libs/tbb/include/oneapi/tbb.h b/contrib/libs/tbb/include/oneapi/tbb.h
new file mode 100644
index 0000000000..1ca41dc516
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb.h
@@ -0,0 +1,73 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_tbb_H
+#define __TBB_tbb_H
+
+/**
+ This header bulk-includes declarations or definitions of all the functionality
+ provided by TBB (save for tbbmalloc and 3rd party dependent headers).
+
+ If you use only a few TBB constructs, consider including specific headers only.
+ Any header listed below can be included independently of others.
+**/
+
+#include "oneapi/tbb/blocked_range.h"
+#include "oneapi/tbb/blocked_range2d.h"
+#include "oneapi/tbb/blocked_range3d.h"
+#if TBB_PREVIEW_BLOCKED_RANGE_ND
+#include "tbb/blocked_rangeNd.h"
+#endif
+#include "oneapi/tbb/cache_aligned_allocator.h"
+#include "oneapi/tbb/combinable.h"
+#include "oneapi/tbb/concurrent_hash_map.h"
+#if TBB_PREVIEW_CONCURRENT_LRU_CACHE
+#include "tbb/concurrent_lru_cache.h"
+#endif
+#include "oneapi/tbb/concurrent_priority_queue.h"
+#include "oneapi/tbb/concurrent_queue.h"
+#include "oneapi/tbb/concurrent_unordered_map.h"
+#include "oneapi/tbb/concurrent_unordered_set.h"
+#include "oneapi/tbb/concurrent_map.h"
+#include "oneapi/tbb/concurrent_set.h"
+#include "oneapi/tbb/concurrent_vector.h"
+#include "oneapi/tbb/enumerable_thread_specific.h"
+#include "oneapi/tbb/flow_graph.h"
+#include "oneapi/tbb/global_control.h"
+#include "oneapi/tbb/info.h"
+#include "oneapi/tbb/null_mutex.h"
+#include "oneapi/tbb/null_rw_mutex.h"
+#include "oneapi/tbb/parallel_for.h"
+#include "oneapi/tbb/parallel_for_each.h"
+#include "oneapi/tbb/parallel_invoke.h"
+#include "oneapi/tbb/parallel_pipeline.h"
+#include "oneapi/tbb/parallel_reduce.h"
+#include "oneapi/tbb/parallel_scan.h"
+#include "oneapi/tbb/parallel_sort.h"
+#include "oneapi/tbb/partitioner.h"
+#include "oneapi/tbb/queuing_mutex.h"
+#include "oneapi/tbb/queuing_rw_mutex.h"
+#include "oneapi/tbb/spin_mutex.h"
+#include "oneapi/tbb/spin_rw_mutex.h"
+#include "oneapi/tbb/task.h"
+#include "oneapi/tbb/task_arena.h"
+#include "oneapi/tbb/task_group.h"
+#include "oneapi/tbb/task_scheduler_observer.h"
+#include "oneapi/tbb/tbb_allocator.h"
+#include "oneapi/tbb/tick_count.h"
+#include "oneapi/tbb/version.h"
+
+#endif /* __TBB_tbb_H */
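
As the header comment above suggests, the umbrella oneapi/tbb.h is convenient but optional; a translation unit that only needs one or two constructs can include just those headers. A minimal sketch of that advice (the vector contents and loop body are illustrative, not part of the commit):

```cpp
// Include only the headers that are actually used instead of the umbrella oneapi/tbb.h.
#include <oneapi/tbb/blocked_range.h>
#include <oneapi/tbb/parallel_for.h>
#include <vector>

int main() {
    std::vector<int> data(1000, 1);
    // parallel_for and blocked_range are the only TBB constructs needed here.
    tbb::parallel_for(tbb::blocked_range<std::size_t>(0, data.size()),
        [&](const tbb::blocked_range<std::size_t>& r) {
            for (std::size_t i = r.begin(); i != r.end(); ++i)
                data[i] *= 2;
        });
    return 0;
}
```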
diff --git a/contrib/libs/tbb/include/oneapi/tbb/blocked_range.h b/contrib/libs/tbb/include/oneapi/tbb/blocked_range.h
new file mode 100644
index 0000000000..f6612fb4e3
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/blocked_range.h
@@ -0,0 +1,163 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_blocked_range_H
+#define __TBB_blocked_range_H
+
+#include <cstddef>
+
+#include "detail/_range_common.h"
+#include "detail/_namespace_injection.h"
+
+#include "version.h"
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+/** \page range_req Requirements on range concept
+ Class \c R implementing the concept of range must define:
+ - \code R::R( const R& ); \endcode Copy constructor
+ - \code R::~R(); \endcode Destructor
+ - \code bool R::is_divisible() const; \endcode True if range can be partitioned into two subranges
+ - \code bool R::empty() const; \endcode True if range is empty
+ - \code R::R( R& r, split ); \endcode Split range \c r into two subranges.
+**/
+
+//! A range over which to iterate.
+/** @ingroup algorithms */
+template<typename Value>
+class blocked_range {
+public:
+ //! Type of a value
+ /** Called a const_iterator for sake of algorithms that need to treat a blocked_range
+ as an STL container. */
+ using const_iterator = Value;
+
+ //! Type for size of a range
+ using size_type = std::size_t;
+
+ //! Construct range over half-open interval [begin,end), with the given grainsize.
+ blocked_range( Value begin_, Value end_, size_type grainsize_=1 ) :
+ my_end(end_), my_begin(begin_), my_grainsize(grainsize_)
+ {
+ __TBB_ASSERT( my_grainsize>0, "grainsize must be positive" );
+ }
+
+ //! Beginning of range.
+ const_iterator begin() const { return my_begin; }
+
+ //! One past last value in range.
+ const_iterator end() const { return my_end; }
+
+ //! Size of the range
+ /** Unspecified if end()<begin(). */
+ size_type size() const {
+ __TBB_ASSERT( !(end()<begin()), "size() unspecified if end()<begin()" );
+ return size_type(my_end-my_begin);
+ }
+
+ //! The grain size for this range.
+ size_type grainsize() const { return my_grainsize; }
+
+ //------------------------------------------------------------------------
+ // Methods that implement Range concept
+ //------------------------------------------------------------------------
+
+ //! True if range is empty.
+ bool empty() const { return !(my_begin<my_end); }
+
+ //! True if range is divisible.
+ /** Unspecified if end()<begin(). */
+ bool is_divisible() const { return my_grainsize<size(); }
+
+ //! Split range.
+ /** The new Range *this has the second part, the old range r has the first part.
+ Unspecified if end()<begin() or !is_divisible(). */
+ blocked_range( blocked_range& r, split ) :
+ my_end(r.my_end),
+ my_begin(do_split(r, split())),
+ my_grainsize(r.my_grainsize)
+ {
+ // only comparison 'less than' is required from values of blocked_range objects
+ __TBB_ASSERT( !(my_begin < r.my_end) && !(r.my_end < my_begin), "blocked_range has been split incorrectly" );
+ }
+
+ //! Split range.
+ /** The new Range *this has the second part split according to specified proportion, the old range r has the first part.
+ Unspecified if end()<begin() or !is_divisible(). */
+ blocked_range( blocked_range& r, proportional_split& proportion ) :
+ my_end(r.my_end),
+ my_begin(do_split(r, proportion)),
+ my_grainsize(r.my_grainsize)
+ {
+ // only comparison 'less than' is required from values of blocked_range objects
+ __TBB_ASSERT( !(my_begin < r.my_end) && !(r.my_end < my_begin), "blocked_range has been split incorrectly" );
+ }
+
+private:
+ /** NOTE: my_end MUST be declared before my_begin, otherwise the splitting constructor will break. */
+ Value my_end;
+ Value my_begin;
+ size_type my_grainsize;
+
+ //! Auxiliary function used by the splitting constructor.
+ static Value do_split( blocked_range& r, split )
+ {
+ __TBB_ASSERT( r.is_divisible(), "cannot split blocked_range that is not divisible" );
+ Value middle = r.my_begin + (r.my_end - r.my_begin) / 2u;
+ r.my_end = middle;
+ return middle;
+ }
+
+ static Value do_split( blocked_range& r, proportional_split& proportion )
+ {
+ __TBB_ASSERT( r.is_divisible(), "cannot split blocked_range that is not divisible" );
+
+ // 32-bit floating point arithmetic is not accurate enough to handle ranges of
+ // more than 2^24 iterations. However, even on ranges with 2^64 iterations the
+ // computational error is approximately 0.000001%, which has little impact on the
+ // uniform distribution of the range's iterations (assuming all iterations take
+ // equal time to complete). See 'test_partitioner_whitebox' for an implementation
+ // of an exact split algorithm.
+ size_type right_part = size_type(float(r.size()) * float(proportion.right())
+ / float(proportion.left() + proportion.right()) + 0.5f);
+ return r.my_end = Value(r.my_end - right_part);
+ }
+
+ template<typename RowValue, typename ColValue>
+ friend class blocked_range2d;
+
+ template<typename RowValue, typename ColValue, typename PageValue>
+ friend class blocked_range3d;
+
+ template<typename DimValue, unsigned int N, typename>
+ friend class blocked_rangeNd_impl;
+};
+
+} // namespace d1
+} // namespace detail
+
+inline namespace v1 {
+using detail::d1::blocked_range;
+// Split types
+using detail::split;
+using detail::proportional_split;
+} // namespace v1
+
+} // namespace tbb
+
+#endif /* __TBB_blocked_range_H */
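
The Range concept documented above (copy constructor, destructor, empty(), is_divisible(), and the splitting constructor) is what lets TBB algorithms recursively subdivide work until subranges fall below the grainsize. A hedged usage sketch of blocked_range with parallel_reduce; the grainsize of 1024 and the summing body are illustrative choices, not values taken from this commit:

```cpp
#include <oneapi/tbb/blocked_range.h>
#include <oneapi/tbb/parallel_reduce.h>
#include <vector>

double sum(const std::vector<double>& a) {
    return tbb::parallel_reduce(
        // Half-open interval [0, a.size()) with an explicit grainsize of 1024 elements;
        // the range keeps splitting while more than grainsize iterations remain.
        tbb::blocked_range<std::size_t>(0, a.size(), 1024),
        0.0,
        [&](const tbb::blocked_range<std::size_t>& r, double partial) {
            for (std::size_t i = r.begin(); i != r.end(); ++i)
                partial += a[i];
            return partial;
        },
        [](double x, double y) { return x + y; });
}
```

The proportional splitting constructor shown in the header is used by partitioners that want unequal halves; the comment in do_split above documents the accepted floating-point rounding error of that path.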
diff --git a/contrib/libs/tbb/include/oneapi/tbb/blocked_range2d.h b/contrib/libs/tbb/include/oneapi/tbb/blocked_range2d.h
new file mode 100644
index 0000000000..01ed17d859
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/blocked_range2d.h
@@ -0,0 +1,108 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_blocked_range2d_H
+#define __TBB_blocked_range2d_H
+
+#include <cstddef>
+
+#include "detail/_config.h"
+#include "detail/_namespace_injection.h"
+
+#include "blocked_range.h"
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+//! A 2-dimensional range that models the Range concept.
+/** @ingroup algorithms */
+template<typename RowValue, typename ColValue = RowValue>
+class blocked_range2d {
+public:
+ //! Type for size of an iteration range
+ using row_range_type = blocked_range<RowValue>;
+ using col_range_type = blocked_range<ColValue>;
+
+private:
+ row_range_type my_rows;
+ col_range_type my_cols;
+
+public:
+ blocked_range2d( RowValue row_begin, RowValue row_end, typename row_range_type::size_type row_grainsize,
+ ColValue col_begin, ColValue col_end, typename col_range_type::size_type col_grainsize ) :
+ my_rows(row_begin,row_end,row_grainsize),
+ my_cols(col_begin,col_end,col_grainsize)
+ {}
+
+ blocked_range2d( RowValue row_begin, RowValue row_end,
+ ColValue col_begin, ColValue col_end ) :
+ my_rows(row_begin,row_end),
+ my_cols(col_begin,col_end)
+ {}
+
+ //! True if range is empty
+ bool empty() const {
+ // Range is empty if at least one dimension is empty.
+ return my_rows.empty() || my_cols.empty();
+ }
+
+ //! True if range is divisible into two pieces.
+ bool is_divisible() const {
+ return my_rows.is_divisible() || my_cols.is_divisible();
+ }
+
+ blocked_range2d( blocked_range2d& r, split ) :
+ my_rows(r.my_rows),
+ my_cols(r.my_cols)
+ {
+ split split_obj;
+ do_split(r, split_obj);
+ }
+
+ blocked_range2d( blocked_range2d& r, proportional_split& proportion ) :
+ my_rows(r.my_rows),
+ my_cols(r.my_cols)
+ {
+ do_split(r, proportion);
+ }
+
+ //! The rows of the iteration space
+ const row_range_type& rows() const { return my_rows; }
+
+ //! The columns of the iteration space
+ const col_range_type& cols() const { return my_cols; }
+
+private:
+ template <typename Split>
+ void do_split( blocked_range2d& r, Split& split_obj ) {
+ if ( my_rows.size()*double(my_cols.grainsize()) < my_cols.size()*double(my_rows.grainsize()) ) {
+ my_cols.my_begin = col_range_type::do_split(r.my_cols, split_obj);
+ } else {
+ my_rows.my_begin = row_range_type::do_split(r.my_rows, split_obj);
+ }
+ }
+};
+
+} // namespace d1
+} // namespace detail
+
+inline namespace v1 {
+using detail::d1::blocked_range2d;
+} // namespace v1
+} // namespace tbb
+
+#endif /* __TBB_blocked_range2d_H */
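
A sketch of how a 2-D range is typically consumed: rows() and cols() expose the two underlying blocked_range objects, and the splitting constructor bisects whichever dimension is larger relative to its grainsize. The row-major indexing below is an illustrative assumption, not something the header prescribes:

```cpp
#include <oneapi/tbb/blocked_range2d.h>
#include <oneapi/tbb/parallel_for.h>
#include <vector>

void fill(std::vector<float>& m, std::size_t nrows, std::size_t ncols, float value) {
    tbb::parallel_for(
        tbb::blocked_range2d<std::size_t>(0, nrows, 0, ncols),
        [&](const tbb::blocked_range2d<std::size_t>& r) {
            for (std::size_t i = r.rows().begin(); i != r.rows().end(); ++i)
                for (std::size_t j = r.cols().begin(); j != r.cols().end(); ++j)
                    m[i * ncols + j] = value;   // assumes row-major storage
        });
}
```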
diff --git a/contrib/libs/tbb/include/oneapi/tbb/blocked_range3d.h b/contrib/libs/tbb/include/oneapi/tbb/blocked_range3d.h
new file mode 100644
index 0000000000..d4178050a8
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/blocked_range3d.h
@@ -0,0 +1,127 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_blocked_range3d_H
+#define __TBB_blocked_range3d_H
+
+#include <cstddef>
+
+#include "detail/_config.h"
+#include "detail/_namespace_injection.h"
+
+#include "blocked_range.h"
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+//! A 3-dimensional range that models the Range concept.
+/** @ingroup algorithms */
+template<typename PageValue, typename RowValue = PageValue, typename ColValue = RowValue>
+class blocked_range3d {
+public:
+ //! Type for size of an iteration range
+ using page_range_type = blocked_range<PageValue>;
+ using row_range_type = blocked_range<RowValue>;
+ using col_range_type = blocked_range<ColValue>;
+
+private:
+ page_range_type my_pages;
+ row_range_type my_rows;
+ col_range_type my_cols;
+
+public:
+
+ blocked_range3d( PageValue page_begin, PageValue page_end,
+ RowValue row_begin, RowValue row_end,
+ ColValue col_begin, ColValue col_end ) :
+ my_pages(page_begin,page_end),
+ my_rows(row_begin,row_end),
+ my_cols(col_begin,col_end)
+ {}
+
+ blocked_range3d( PageValue page_begin, PageValue page_end, typename page_range_type::size_type page_grainsize,
+ RowValue row_begin, RowValue row_end, typename row_range_type::size_type row_grainsize,
+ ColValue col_begin, ColValue col_end, typename col_range_type::size_type col_grainsize ) :
+ my_pages(page_begin,page_end,page_grainsize),
+ my_rows(row_begin,row_end,row_grainsize),
+ my_cols(col_begin,col_end,col_grainsize)
+ {}
+
+ //! True if range is empty
+ bool empty() const {
+ // Range is empty if at least one dimension is empty.
+ return my_pages.empty() || my_rows.empty() || my_cols.empty();
+ }
+
+ //! True if range is divisible into two pieces.
+ bool is_divisible() const {
+ return my_pages.is_divisible() || my_rows.is_divisible() || my_cols.is_divisible();
+ }
+
+ blocked_range3d( blocked_range3d& r, split split_obj ) :
+ my_pages(r.my_pages),
+ my_rows(r.my_rows),
+ my_cols(r.my_cols)
+ {
+ do_split(r, split_obj);
+ }
+
+ blocked_range3d( blocked_range3d& r, proportional_split& proportion ) :
+ my_pages(r.my_pages),
+ my_rows(r.my_rows),
+ my_cols(r.my_cols)
+ {
+ do_split(r, proportion);
+ }
+
+ //! The pages of the iteration space
+ const page_range_type& pages() const { return my_pages; }
+
+ //! The rows of the iteration space
+ const row_range_type& rows() const { return my_rows; }
+
+ //! The columns of the iteration space
+ const col_range_type& cols() const { return my_cols; }
+
+private:
+ template <typename Split>
+ void do_split( blocked_range3d& r, Split& split_obj) {
+ if ( my_pages.size()*double(my_rows.grainsize()) < my_rows.size()*double(my_pages.grainsize()) ) {
+ if ( my_rows.size()*double(my_cols.grainsize()) < my_cols.size()*double(my_rows.grainsize()) ) {
+ my_cols.my_begin = col_range_type::do_split(r.my_cols, split_obj);
+ } else {
+ my_rows.my_begin = row_range_type::do_split(r.my_rows, split_obj);
+ }
+ } else {
+ if ( my_pages.size()*double(my_cols.grainsize()) < my_cols.size()*double(my_pages.grainsize()) ) {
+ my_cols.my_begin = col_range_type::do_split(r.my_cols, split_obj);
+ } else {
+ my_pages.my_begin = page_range_type::do_split(r.my_pages, split_obj);
+ }
+ }
+ }
+};
+
+} // namespace d1
+} // namespace detail
+
+inline namespace v1 {
+using detail::d1::blocked_range3d;
+} // namespace v1
+} // namespace tbb
+
+#endif /* __TBB_blocked_range3d_H */
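
The same pattern extends to three dimensions, with the outermost dimension exposed as pages(). A brief sketch (the flattened indexing is an assumption about how the caller stores its data):

```cpp
#include <oneapi/tbb/blocked_range3d.h>
#include <oneapi/tbb/parallel_for.h>
#include <vector>

void clear(std::vector<float>& v, std::size_t np, std::size_t nr, std::size_t nc) {
    tbb::parallel_for(
        tbb::blocked_range3d<std::size_t>(0, np, 0, nr, 0, nc),
        [&](const tbb::blocked_range3d<std::size_t>& r) {
            for (std::size_t p = r.pages().begin(); p != r.pages().end(); ++p)
                for (std::size_t i = r.rows().begin(); i != r.rows().end(); ++i)
                    for (std::size_t j = r.cols().begin(); j != r.cols().end(); ++j)
                        v[(p * nr + i) * nc + j] = 0.0f;
        });
}
```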
diff --git a/contrib/libs/tbb/include/oneapi/tbb/blocked_rangeNd.h b/contrib/libs/tbb/include/oneapi/tbb/blocked_rangeNd.h
new file mode 100644
index 0000000000..37b71da8fe
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/blocked_rangeNd.h
@@ -0,0 +1,144 @@
+/*
+ Copyright (c) 2017-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_blocked_rangeNd_H
+#define __TBB_blocked_rangeNd_H
+
+#if !TBB_PREVIEW_BLOCKED_RANGE_ND
+ #error Set TBB_PREVIEW_BLOCKED_RANGE_ND to include blocked_rangeNd.h
+#endif
+
+#include <algorithm> // std::any_of
+#include <array>
+#include <cstddef>
+#include <type_traits> // std::is_same, std::enable_if
+
+#include "detail/_config.h"
+#include "detail/_template_helpers.h" // index_sequence, make_index_sequence
+
+#include "blocked_range.h"
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+/*
+ The blocked_rangeNd_impl uses make_index_sequence<N> to automatically generate a ctor with
+ exactly N arguments of the type tbb::blocked_range<Value>. Such ctor provides an opportunity
+ to use braced-init-list parameters to initialize each dimension.
+ Passing arguments as braced-init-lists (which are neither std::initializer_list
+ objects nor references to one) produces a non-deduced context during template
+ argument deduction.
+
+ NOTE: blocked_rangeNd must be exactly a templated alias to the blocked_rangeNd_impl
+ (and not e.g. a derived class), otherwise it would need to declare its own ctor
+ facing the same problem that the impl class solves.
+*/
+
+template<typename Value, unsigned int N, typename = detail::make_index_sequence<N>>
+class blocked_rangeNd_impl;
+
+template<typename Value, unsigned int N, std::size_t... Is>
+class blocked_rangeNd_impl<Value, N, detail::index_sequence<Is...>> {
+public:
+ //! Type of a value.
+ using value_type = Value;
+
+private:
+ //! Helper type to construct range with N tbb::blocked_range<value_type> objects.
+ template<std::size_t>
+ using dim_type_helper = tbb::blocked_range<value_type>;
+
+public:
+ blocked_rangeNd_impl() = delete;
+
+ //! Constructs N-dimensional range over N half-open intervals each represented as tbb::blocked_range<Value>.
+ blocked_rangeNd_impl(const dim_type_helper<Is>&... args) : my_dims{ {args...} } {}
+
+ //! Dimensionality of a range.
+ static constexpr unsigned int ndims() { return N; }
+
+ //! Range in certain dimension.
+ const tbb::blocked_range<value_type>& dim(unsigned int dimension) const {
+ __TBB_ASSERT(dimension < N, "out of bound");
+ return my_dims[dimension];
+ }
+
+ //------------------------------------------------------------------------
+ // Methods that implement Range concept
+ //------------------------------------------------------------------------
+
+ //! True if at least one dimension is empty.
+ bool empty() const {
+ return std::any_of(my_dims.begin(), my_dims.end(), [](const tbb::blocked_range<value_type>& d) {
+ return d.empty();
+ });
+ }
+
+ //! True if at least one dimension is divisible.
+ bool is_divisible() const {
+ return std::any_of(my_dims.begin(), my_dims.end(), [](const tbb::blocked_range<value_type>& d) {
+ return d.is_divisible();
+ });
+ }
+
+ blocked_rangeNd_impl(blocked_rangeNd_impl& r, proportional_split proportion) : my_dims(r.my_dims) {
+ do_split(r, proportion);
+ }
+
+ blocked_rangeNd_impl(blocked_rangeNd_impl& r, split proportion) : my_dims(r.my_dims) {
+ do_split(r, proportion);
+ }
+
+private:
+ static_assert(N != 0, "zero dimensional blocked_rangeNd can't be constructed");
+
+ //! Ranges in each dimension.
+ std::array<tbb::blocked_range<value_type>, N> my_dims;
+
+ template<typename split_type>
+ void do_split(blocked_rangeNd_impl& r, split_type proportion) {
+ static_assert((std::is_same<split_type, split>::value || std::is_same<split_type, proportional_split>::value), "type of split object is incorrect");
+ __TBB_ASSERT(r.is_divisible(), "can't split not divisible range");
+
+ auto my_it = std::max_element(my_dims.begin(), my_dims.end(), [](const tbb::blocked_range<value_type>& first, const tbb::blocked_range<value_type>& second) {
+ return (first.size() * second.grainsize() < second.size() * first.grainsize());
+ });
+
+ auto r_it = r.my_dims.begin() + (my_it - my_dims.begin());
+
+ my_it->my_begin = tbb::blocked_range<value_type>::do_split(*r_it, proportion);
+
+ // (!(my_it->my_begin < r_it->my_end) && !(r_it->my_end < my_it->my_begin)) is equivalent to
+ // (my_it->my_begin == r_it->my_end), but we can't use operator== because the Value concept does not require it
+ __TBB_ASSERT(!(my_it->my_begin < r_it->my_end) && !(r_it->my_end < my_it->my_begin),
+ "blocked_range has been split incorrectly");
+ }
+};
+
+template<typename Value, unsigned int N>
+using blocked_rangeNd = blocked_rangeNd_impl<Value, N>;
+
+} // namespace d1
+} // namespace detail
+
+inline namespace v1 {
+using detail::d1::blocked_rangeNd;
+} // namespace v1
+} // namespace tbb
+
+#endif /* __TBB_blocked_rangeNd_H */
+
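
Because blocked_rangeNd is a preview feature, TBB_PREVIEW_BLOCKED_RANGE_ND must be defined before the include, and, as the comment above explains, each dimension can be given as a braced-init-list that constructs a tbb::blocked_range. A sketch under those assumptions (the 64x64 row-major grid is illustrative):

```cpp
#define TBB_PREVIEW_BLOCKED_RANGE_ND 1
#include <oneapi/tbb/blocked_rangeNd.h>
#include <oneapi/tbb/parallel_for.h>
#include <vector>

void scale(std::vector<float>& grid, float factor) {   // grid holds a 64x64 matrix, row-major
    // Each braced-init-list initializes one tbb::blocked_range<int> dimension.
    tbb::blocked_rangeNd<int, 2> range({0, 64}, {0, 64});
    tbb::parallel_for(range, [&](const tbb::blocked_rangeNd<int, 2>& r) {
        for (int i = r.dim(0).begin(); i != r.dim(0).end(); ++i)
            for (int j = r.dim(1).begin(); j != r.dim(1).end(); ++j)
                grid[i * 64 + j] *= factor;
    });
}
```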
diff --git a/contrib/libs/tbb/include/oneapi/tbb/cache_aligned_allocator.h b/contrib/libs/tbb/include/oneapi/tbb/cache_aligned_allocator.h
new file mode 100644
index 0000000000..645f3fbd2e
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/cache_aligned_allocator.h
@@ -0,0 +1,189 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_cache_aligned_allocator_H
+#define __TBB_cache_aligned_allocator_H
+
+#include "detail/_utils.h"
+#include "detail/_namespace_injection.h"
+#include <cstdlib>
+#include <utility>
+
+#if __TBB_CPP17_MEMORY_RESOURCE_PRESENT
+#error #include <memory_resource>
+#endif
+
+namespace tbb {
+namespace detail {
+
+namespace r1 {
+void* __TBB_EXPORTED_FUNC cache_aligned_allocate(std::size_t size);
+void __TBB_EXPORTED_FUNC cache_aligned_deallocate(void* p);
+std::size_t __TBB_EXPORTED_FUNC cache_line_size();
+}
+
+namespace d1 {
+
+template<typename T>
+class cache_aligned_allocator {
+public:
+ using value_type = T;
+ using propagate_on_container_move_assignment = std::true_type;
+
+ //! Always defined for TBB containers (supported since C++17 for std containers)
+ using is_always_equal = std::true_type;
+
+ cache_aligned_allocator() = default;
+ template<typename U> cache_aligned_allocator(const cache_aligned_allocator<U>&) noexcept {}
+
+ //! Allocate space for n objects, starting on a cache/sector line.
+ __TBB_nodiscard T* allocate(std::size_t n) {
+ return static_cast<T*>(r1::cache_aligned_allocate(n * sizeof(value_type)));
+ }
+
+ //! Free block of memory that starts on a cache line
+ void deallocate(T* p, std::size_t) {
+ r1::cache_aligned_deallocate(p);
+ }
+
+ //! Largest value for which method allocate might succeed.
+ std::size_t max_size() const noexcept {
+ return (~std::size_t(0) - r1::cache_line_size()) / sizeof(value_type);
+ }
+
+#if TBB_ALLOCATOR_TRAITS_BROKEN
+ using pointer = value_type*;
+ using const_pointer = const value_type*;
+ using reference = value_type&;
+ using const_reference = const value_type&;
+ using difference_type = std::ptrdiff_t;
+ using size_type = std::size_t;
+ template<typename U> struct rebind {
+ using other = cache_aligned_allocator<U>;
+ };
+ template<typename U, typename... Args>
+ void construct(U *p, Args&&... args)
+ { ::new (p) U(std::forward<Args>(args)...); }
+ void destroy(pointer p) { p->~value_type(); }
+ pointer address(reference x) const { return &x; }
+ const_pointer address(const_reference x) const { return &x; }
+#endif // TBB_ALLOCATOR_TRAITS_BROKEN
+};
+
+#if TBB_ALLOCATOR_TRAITS_BROKEN
+ template<>
+ class cache_aligned_allocator<void> {
+ public:
+ using pointer = void*;
+ using const_pointer = const void*;
+ using value_type = void;
+ template<typename U> struct rebind {
+ using other = cache_aligned_allocator<U>;
+ };
+ };
+#endif
+
+template<typename T, typename U>
+bool operator==(const cache_aligned_allocator<T>&, const cache_aligned_allocator<U>&) noexcept { return true; }
+
+#if !__TBB_CPP20_COMPARISONS_PRESENT
+template<typename T, typename U>
+bool operator!=(const cache_aligned_allocator<T>&, const cache_aligned_allocator<U>&) noexcept { return false; }
+#endif
+
+#if __TBB_CPP17_MEMORY_RESOURCE_PRESENT
+
+//! C++17 memory resource wrapper to ensure cache line size alignment
+class cache_aligned_resource : public std::pmr::memory_resource {
+public:
+ cache_aligned_resource() : cache_aligned_resource(std::pmr::get_default_resource()) {}
+ explicit cache_aligned_resource(std::pmr::memory_resource* upstream) : m_upstream(upstream) {}
+
+ std::pmr::memory_resource* upstream_resource() const {
+ return m_upstream;
+ }
+
+private:
+ //! We don't know which memory resource is set. Use padding to guarantee alignment.
+ void* do_allocate(std::size_t bytes, std::size_t alignment) override {
+ // TODO: make it common with tbb_allocator.cpp
+ std::size_t cache_line_alignment = correct_alignment(alignment);
+ std::size_t space = correct_size(bytes) + cache_line_alignment;
+ std::uintptr_t base = reinterpret_cast<std::uintptr_t>(m_upstream->allocate(space));
+ __TBB_ASSERT(base != 0, "Upstream resource returned NULL.");
+
+ // Round up to the next cache line (align the base address)
+ std::uintptr_t result = (base + cache_line_alignment) & ~(cache_line_alignment - 1);
+ __TBB_ASSERT((result - base) >= sizeof(std::uintptr_t), "Can't store a base pointer to the header");
+ __TBB_ASSERT(space - (result - base) >= bytes, "Not enough space for the storage");
+
+ // Record where block actually starts.
+ (reinterpret_cast<std::uintptr_t*>(result))[-1] = base;
+ return reinterpret_cast<void*>(result);
+ }
+
+ void do_deallocate(void* ptr, std::size_t bytes, std::size_t alignment) override {
+ if (ptr) {
+ // Recover where block actually starts
+ std::uintptr_t base = (reinterpret_cast<std::uintptr_t*>(ptr))[-1];
+ m_upstream->deallocate(reinterpret_cast<void*>(base), correct_size(bytes) + correct_alignment(alignment));
+ }
+ }
+
+ bool do_is_equal(const std::pmr::memory_resource& other) const noexcept override {
+ if (this == &other) { return true; }
+#if __TBB_USE_OPTIONAL_RTTI
+ const cache_aligned_resource* other_res = dynamic_cast<const cache_aligned_resource*>(&other);
+ return other_res && (upstream_resource() == other_res->upstream_resource());
+#else
+ return false;
+#endif
+ }
+
+ std::size_t correct_alignment(std::size_t alignment) {
+ __TBB_ASSERT(tbb::detail::is_power_of_two(alignment), "Alignment is not a power of 2");
+#if __TBB_CPP17_HW_INTERFERENCE_SIZE_PRESENT
+ std::size_t cache_line_size = std::hardware_destructive_interference_size;
+#else
+ std::size_t cache_line_size = r1::cache_line_size();
+#endif
+ return alignment < cache_line_size ? cache_line_size : alignment;
+ }
+
+ std::size_t correct_size(std::size_t bytes) {
+ // Handles the case when a small size is requested: there might not be
+ // enough space to store the original pointer otherwise.
+ return bytes < sizeof(std::uintptr_t) ? sizeof(std::uintptr_t) : bytes;
+ }
+
+ std::pmr::memory_resource* m_upstream;
+};
+
+#endif // __TBB_CPP17_MEMORY_RESOURCE_PRESENT
+
+} // namespace d1
+} // namespace detail
+
+inline namespace v1 {
+using detail::d1::cache_aligned_allocator;
+#if __TBB_CPP17_MEMORY_RESOURCE_PRESENT
+using detail::d1::cache_aligned_resource;
+#endif
+} // namespace v1
+} // namespace tbb
+
+#endif /* __TBB_cache_aligned_allocator_H */
+
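
cache_aligned_allocator is a drop-in standard allocator whose allocations start on a cache line, which mainly helps keep independently updated objects from false-sharing. A minimal sketch; the counter struct and element count are illustrative:

```cpp
#include <oneapi/tbb/cache_aligned_allocator.h>
#include <vector>

struct counter { long value = 0; };

int main() {
    // The vector's storage comes from cache_aligned_allocate(), so it begins on a
    // cache-line boundary and does not share a line with unrelated data.
    std::vector<counter, tbb::cache_aligned_allocator<counter>> counters(8);
    counters[0].value = 42;
    return 0;
}
```

Note that only the start of the block is cache-aligned; adjacent elements inside one container can still share a line unless they are individually padded. The cache_aligned_resource wrapper above offers the same start-of-block guarantee for C++17 std::pmr containers.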
diff --git a/contrib/libs/tbb/include/oneapi/tbb/combinable.h b/contrib/libs/tbb/include/oneapi/tbb/combinable.h
new file mode 100644
index 0000000000..b676a30cc0
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/combinable.h
@@ -0,0 +1,69 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_combinable_H
+#define __TBB_combinable_H
+
+#include "detail/_namespace_injection.h"
+
+#include "enumerable_thread_specific.h"
+#include "cache_aligned_allocator.h"
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+/** \name combinable **/
+//@{
+//! Thread-local storage with optional reduction
+/** @ingroup containers */
+template <typename T>
+class combinable {
+ using my_alloc = typename tbb::cache_aligned_allocator<T>;
+ using my_ets_type = typename tbb::enumerable_thread_specific<T, my_alloc, ets_no_key>;
+ my_ets_type my_ets;
+
+public:
+ combinable() = default;
+
+ template <typename Finit>
+ explicit combinable(Finit _finit) : my_ets(_finit) { }
+
+ void clear() { my_ets.clear(); }
+
+ T& local() { return my_ets.local(); }
+
+ T& local(bool& exists) { return my_ets.local(exists); }
+
+ // combine_func_t has signature T(T,T) or T(const T&, const T&)
+ template <typename CombineFunc>
+ T combine(CombineFunc f_combine) { return my_ets.combine(f_combine); }
+
+ // combine_func_t has signature void(T) or void(const T&)
+ template <typename CombineFunc>
+ void combine_each(CombineFunc f_combine) { my_ets.combine_each(f_combine); }
+};
+
+} // namespace d1
+} // namespace detail
+
+inline namespace v1 {
+using detail::d1::combinable;
+} // inline namespace v1
+
+} // namespace tbb
+
+#endif /* __TBB_combinable_H */
+
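
combinable provides one lazily created value per thread plus a reduction step, a common way to accumulate results from a parallel loop without locking. A sketch with an explicit initializer lambda so each thread-local slot starts at zero (the function and variable names are illustrative):

```cpp
#include <oneapi/tbb/blocked_range.h>
#include <oneapi/tbb/combinable.h>
#include <oneapi/tbb/parallel_for.h>
#include <vector>

long total(const std::vector<int>& v) {
    tbb::combinable<long> partial([] { return 0L; });    // zero-initialized per-thread slots
    tbb::parallel_for(
        tbb::blocked_range<std::size_t>(0, v.size()),
        [&](const tbb::blocked_range<std::size_t>& r) {
            long& local = partial.local();               // this thread's private accumulator
            for (std::size_t i = r.begin(); i != r.end(); ++i)
                local += v[i];
        });
    // Fold all thread-local values into one result.
    return partial.combine([](long a, long b) { return a + b; });
}
```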
diff --git a/contrib/libs/tbb/include/oneapi/tbb/concurrent_hash_map.h b/contrib/libs/tbb/include/oneapi/tbb/concurrent_hash_map.h
new file mode 100644
index 0000000000..510557e9f2
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/concurrent_hash_map.h
@@ -0,0 +1,1524 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_concurrent_hash_map_H
+#define __TBB_concurrent_hash_map_H
+
+#include "detail/_namespace_injection.h"
+#include "detail/_utils.h"
+#include "detail/_assert.h"
+#include "detail/_allocator_traits.h"
+#include "detail/_containers_helpers.h"
+#include "detail/_template_helpers.h"
+#include "detail/_hash_compare.h"
+#include "detail/_range_common.h"
+#include "tbb_allocator.h"
+#include "spin_rw_mutex.h"
+
+#include <atomic>
+#include <initializer_list>
+#include <tuple>
+#include <iterator>
+#include <utility> // Need std::pair
+#include <cstring> // Need std::memset
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+struct hash_map_node_base : no_copy {
+ using mutex_type = spin_rw_mutex;
+ // Scoped lock type for mutex
+ using scoped_type = mutex_type::scoped_lock;
+ // Next node in chain
+ hash_map_node_base* next;
+ mutex_type mutex;
+};
+
+// Incompleteness flag value
+static hash_map_node_base* const rehash_req = reinterpret_cast<hash_map_node_base*>(std::size_t(3));
+// Rehashed empty bucket flag
+static hash_map_node_base* const empty_rehashed = reinterpret_cast<hash_map_node_base*>(std::size_t(0));
+
+// base class of concurrent_hash_map
+
+template <typename Allocator>
+class hash_map_base {
+public:
+ using size_type = std::size_t;
+ using hashcode_type = std::size_t;
+ using segment_index_type = std::size_t;
+ using node_base = hash_map_node_base;
+
+ struct bucket : no_copy {
+ using mutex_type = spin_rw_mutex;
+ using scoped_type = mutex_type::scoped_lock;
+
+ bucket() : node_list(nullptr) {}
+ bucket( node_base* ptr ) : node_list(ptr) {}
+
+ mutex_type mutex;
+ std::atomic<node_base*> node_list;
+ };
+
+ using allocator_type = Allocator;
+ using allocator_traits_type = tbb::detail::allocator_traits<allocator_type>;
+ using bucket_allocator_type = typename allocator_traits_type::template rebind_alloc<bucket>;
+ using bucket_allocator_traits = tbb::detail::allocator_traits<bucket_allocator_type>;
+
+ // Count of embedded segments
+ static constexpr size_type embedded_block = 1;
+ // Count of embedded buckets
+ static constexpr size_type embedded_buckets = 1 << embedded_block;
+ // Count of segments in the first block
+ static constexpr size_type first_block = 8; // including embedded_block; works well with a bucket size of 16, so the allocations are powers of 4096
+ // Size of a pointer / table size
+ static constexpr size_type pointers_per_table = sizeof(segment_index_type) * 8; // one segment per bit
+
+ using segment_ptr_type = bucket*;
+ using atomic_segment_type = std::atomic<segment_ptr_type>;
+ using segments_table_type = atomic_segment_type[pointers_per_table];
+
+ hash_map_base( const allocator_type& alloc ) : my_allocator(alloc), my_mask(embedded_buckets - 1), my_size(0) {
+ for (size_type i = 0; i != embedded_buckets; ++i) {
+ my_embedded_segment[i].node_list.store(nullptr, std::memory_order_relaxed);
+ }
+
+ for (size_type segment_index = 0; segment_index < pointers_per_table; ++segment_index) {
+ auto argument = segment_index < embedded_block ? my_embedded_segment + segment_base(segment_index) : nullptr;
+ my_table[segment_index].store(argument, std::memory_order_relaxed);
+ }
+
+ __TBB_ASSERT( embedded_block <= first_block, "The first block number must include embedded blocks");
+ }
+
+ // segment index of given index in the array
+ static segment_index_type segment_index_of( size_type index ) {
+ return segment_index_type(tbb::detail::log2( index|1 ));
+ }
+
+ // the first array index of given segment
+ static segment_index_type segment_base( segment_index_type k ) {
+ return (segment_index_type(1) << k & ~segment_index_type(1));
+ }
+
+ // segment size except for k == 0
+ static size_type segment_size( segment_index_type k ) {
+ return size_type(1) << k; // fake value for k==0
+ }
+
+ // true if ptr is valid pointer
+ static bool is_valid( void* ptr ) {
+ return reinterpret_cast<uintptr_t>(ptr) > uintptr_t(63);
+ }
+
+ template <typename... Args>
+ void init_buckets_impl( segment_ptr_type ptr, size_type sz, Args&&... args ) {
+ for (size_type i = 0; i < sz; ++i) {
+ bucket_allocator_traits::construct(my_allocator, ptr + i, std::forward<Args>(args)...);
+ }
+ }
+
+ // Initialize buckets
+ void init_buckets( segment_ptr_type ptr, size_type sz, bool is_initial ) {
+ if (is_initial) {
+ init_buckets_impl(ptr, sz);
+ } else {
+ init_buckets_impl(ptr, sz, reinterpret_cast<node_base*>(rehash_req));
+ }
+ }
+
+ // Add node n to bucket b
+ static void add_to_bucket( bucket* b, node_base* n ) {
+ __TBB_ASSERT(b->node_list.load(std::memory_order_relaxed) != rehash_req, nullptr);
+ n->next = b->node_list.load(std::memory_order_relaxed);
+ b->node_list.store(n, std::memory_order_relaxed); // it's under lock and the flag is set
+ }
+
+ const bucket_allocator_type& get_allocator() const {
+ return my_allocator;
+ }
+
+ bucket_allocator_type& get_allocator() {
+ return my_allocator;
+ }
+
+ // Enable segment
+ void enable_segment( segment_index_type k, bool is_initial = false ) {
+ __TBB_ASSERT( k, "Zero segment must be embedded" );
+ size_type sz;
+ __TBB_ASSERT( !is_valid(my_table[k].load(std::memory_order_relaxed)), "Wrong concurrent assignment");
+ if (k >= first_block) {
+ sz = segment_size(k);
+ segment_ptr_type ptr = nullptr;
+ try_call( [&] {
+ ptr = bucket_allocator_traits::allocate(my_allocator, sz);
+ } ).on_exception( [&] {
+ my_table[k].store(nullptr, std::memory_order_relaxed);
+ });
+
+ __TBB_ASSERT(ptr, nullptr);
+ init_buckets(ptr, sz, is_initial);
+ my_table[k].store(ptr, std::memory_order_release);
+ sz <<= 1;// double it to get entire capacity of the container
+ } else { // the first block
+ __TBB_ASSERT( k == embedded_block, "Wrong segment index" );
+ sz = segment_size(first_block);
+ segment_ptr_type ptr = nullptr;
+ try_call( [&] {
+ ptr = bucket_allocator_traits::allocate(my_allocator, sz - embedded_buckets);
+ } ).on_exception( [&] {
+ my_table[k].store(nullptr, std::memory_order_relaxed);
+ });
+
+ __TBB_ASSERT(ptr, nullptr);
+ init_buckets(ptr, sz - embedded_buckets, is_initial);
+ ptr -= segment_base(embedded_block);
+ for(segment_index_type i = embedded_block; i < first_block; i++) // calc the offsets
+ my_table[i].store(ptr + segment_base(i), std::memory_order_release);
+ }
+ my_mask.store(sz-1, std::memory_order_release);
+ }
+
+ void delete_segment( segment_index_type s ) {
+ segment_ptr_type buckets_ptr = my_table[s].load(std::memory_order_relaxed);
+ size_type sz = segment_size( s ? s : 1 );
+
+ size_type deallocate_size = 0;
+
+ if (s >= first_block) { // the first segment or the next
+ deallocate_size = sz;
+ } else if (s == embedded_block && embedded_block != first_block) {
+ deallocate_size = segment_size(first_block) - embedded_buckets;
+ }
+
+ for (size_type i = 0; i < deallocate_size; ++i) {
+ bucket_allocator_traits::destroy(my_allocator, buckets_ptr + i);
+ }
+ if (deallocate_size != 0) {
+ bucket_allocator_traits::deallocate(my_allocator, buckets_ptr, deallocate_size);
+ }
+
+ if (s >= embedded_block) my_table[s].store(nullptr, std::memory_order_relaxed);
+ }
+
+ // Get bucket by (masked) hashcode
+ bucket *get_bucket( hashcode_type h ) const noexcept {
+ segment_index_type s = segment_index_of( h );
+ h -= segment_base(s);
+ segment_ptr_type seg = my_table[s].load(std::memory_order_acquire);
+ __TBB_ASSERT( is_valid(seg), "hashcode must be cut by valid mask for allocated segments" );
+ return &seg[h];
+ }
+
+ // detail serial rehashing helper
+ void mark_rehashed_levels( hashcode_type h ) noexcept {
+ segment_index_type s = segment_index_of( h );
+ while (segment_ptr_type seg = my_table[++s].load(std::memory_order_relaxed))
+ if( seg[h].node_list.load(std::memory_order_relaxed) == rehash_req ) {
+ seg[h].node_list.store(empty_rehashed, std::memory_order_relaxed);
+ mark_rehashed_levels( h + ((hashcode_type)1<<s) ); // optimized segment_base(s)
+ }
+ }
+
+ // Check for mask race
+ // Splitting into two functions should help inlining
+ inline bool check_mask_race( const hashcode_type h, hashcode_type &m ) const {
+ hashcode_type m_now, m_old = m;
+ m_now = my_mask.load(std::memory_order_acquire);
+ if (m_old != m_now) {
+ return check_rehashing_collision(h, m_old, m = m_now);
+ }
+ return false;
+ }
+
+ // Process mask race, check for rehashing collision
+ bool check_rehashing_collision( const hashcode_type h, hashcode_type m_old, hashcode_type m ) const {
+ __TBB_ASSERT(m_old != m, nullptr); // TODO?: m arg could be optimized out by passing h = h&m
+ if( (h & m_old) != (h & m) ) { // mask changed for this hashcode, rare event
+ // condition above proves that 'h' has some other bits set beside 'm_old'
+ // find next applicable mask after m_old //TODO: look at bsl instruction
+ for( ++m_old; !(h & m_old); m_old <<= 1 ) // at most a few rounds, depending on the first block size
+ ;
+ m_old = (m_old<<1) - 1; // get full mask from a bit
+ __TBB_ASSERT((m_old&(m_old+1))==0 && m_old <= m, nullptr);
+ // check whether it is rehashing/ed
+ if( get_bucket(h & m_old)->node_list.load(std::memory_order_acquire) != rehash_req ) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ // Insert a node and check for load factor. @return segment index to enable.
+ segment_index_type insert_new_node( bucket *b, node_base *n, hashcode_type mask ) {
+ size_type sz = ++my_size; // prefix form is to enforce allocation after the first item inserted
+ add_to_bucket( b, n );
+ // check load factor
+ if( sz >= mask ) { // TODO: add custom load_factor
+ segment_index_type new_seg = tbb::detail::log2( mask+1 ); //optimized segment_index_of
+ __TBB_ASSERT( is_valid(my_table[new_seg-1].load(std::memory_order_relaxed)), "new allocations must not publish new mask until segment has allocated");
+ static const segment_ptr_type is_allocating = segment_ptr_type(2);
+ segment_ptr_type disabled = nullptr;
+ if (!(my_table[new_seg].load(std::memory_order_acquire))
+ && my_table[new_seg].compare_exchange_strong(disabled, is_allocating))
+ return new_seg; // The value must be processed
+ }
+ return 0;
+ }
+
+ // Prepare enough segments for number of buckets
+ void reserve(size_type buckets) {
+ if( !buckets-- ) return;
+ bool is_initial = !my_size.load(std::memory_order_relaxed);
+ for (size_type m = my_mask.load(std::memory_order_relaxed); buckets > m;
+ m = my_mask.load(std::memory_order_relaxed))
+ {
+ enable_segment( segment_index_of( m+1 ), is_initial );
+ }
+ }
+
+ // Swap hash_map_bases
+ void internal_swap_content(hash_map_base &table) {
+ using std::swap;
+ swap_atomics_relaxed(my_mask, table.my_mask);
+ swap_atomics_relaxed(my_size, table.my_size);
+
+ for(size_type i = 0; i < embedded_buckets; i++) {
+ auto temp = my_embedded_segment[i].node_list.load(std::memory_order_relaxed);
+ my_embedded_segment[i].node_list.store(table.my_embedded_segment[i].node_list.load(std::memory_order_relaxed),
+ std::memory_order_relaxed);
+ table.my_embedded_segment[i].node_list.store(temp, std::memory_order_relaxed);
+ }
+ for(size_type i = embedded_block; i < pointers_per_table; i++) {
+ auto temp = my_table[i].load(std::memory_order_relaxed);
+ my_table[i].store(table.my_table[i].load(std::memory_order_relaxed),
+ std::memory_order_relaxed);
+ table.my_table[i].store(temp, std::memory_order_relaxed);
+ }
+ }
+
+ void internal_move(hash_map_base&& other) {
+ my_mask.store(other.my_mask.load(std::memory_order_relaxed), std::memory_order_relaxed);
+ other.my_mask.store(embedded_buckets - 1, std::memory_order_relaxed);
+
+ my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed);
+ other.my_size.store(0, std::memory_order_relaxed);
+
+ for (size_type i = 0; i < embedded_buckets; ++i) {
+ my_embedded_segment[i].node_list.store(other.my_embedded_segment[i].node_list, std::memory_order_relaxed);
+ other.my_embedded_segment[i].node_list.store(nullptr, std::memory_order_relaxed);
+ }
+
+ for (size_type i = embedded_block; i < pointers_per_table; ++i) {
+ my_table[i].store(other.my_table[i].load(std::memory_order_relaxed),
+ std::memory_order_relaxed);
+ other.my_table[i].store(nullptr, std::memory_order_relaxed);
+ }
+ }
+
+protected:
+
+ bucket_allocator_type my_allocator;
+ // Hash mask = sum of allocated segment sizes - 1
+ std::atomic<hashcode_type> my_mask;
+ // Size of container in stored items
+ std::atomic<size_type> my_size; // It must be in separate cache line from my_mask due to performance effects
+ // Zero segment
+ bucket my_embedded_segment[embedded_buckets];
+ // Segment pointers table. Also prevents false sharing between my_mask and my_size
+ segments_table_type my_table;
+};
+
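
The segment arithmetic above (segment_index_of, segment_base, segment_size) lays the bucket array out as geometrically growing segments, so the table can grow without relocating existing buckets. A standalone worked sketch that mirrors those formulas for illustration only; msb_log2 stands in for tbb::detail::log2 and is not part of the header:

```cpp
#include <cassert>
#include <cstddef>

// Index of the most significant set bit (what tbb::detail::log2 computes).
static std::size_t msb_log2(std::size_t x) { std::size_t r = 0; while (x >>= 1) ++r; return r; }

static std::size_t segment_index_of(std::size_t index) { return msb_log2(index | 1); }
static std::size_t segment_base(std::size_t k) { return (std::size_t(1) << k) & ~std::size_t(1); }
static std::size_t segment_size(std::size_t k) { return std::size_t(1) << k; }

int main() {
    // Buckets 0-1 live in the embedded segment 0; for k >= 1, segment k holds the
    // 2^k buckets starting at flat index 1 << k (indices 2-3, 4-7, 8-15, ...).
    assert(segment_index_of(0) == 0 && segment_index_of(3) == 1);
    assert(segment_index_of(7) == 2 && segment_index_of(8) == 3);
    assert(segment_base(3) == 8 && segment_size(3) == 8);
    // get_bucket(h) locates segment s = segment_index_of(h) and then uses the
    // local offset h - segment_base(s) inside that segment.
    std::size_t h = 13, s = segment_index_of(h);
    assert(s == 3 && h - segment_base(s) == 5);
    return 0;
}
```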
+template <typename Iterator>
+class hash_map_range;
+
+// Meets requirements of a forward iterator for STL
+// Value is either the T or const T type of the container.
+template <typename Container, typename Value>
+class hash_map_iterator {
+ using map_type = Container;
+ using node = typename Container::node;
+ using map_base = typename Container::base_type;
+ using node_base = typename map_base::node_base;
+ using bucket = typename map_base::bucket;
+public:
+ using value_type = Value;
+ using size_type = typename Container::size_type;
+ using difference_type = typename Container::difference_type;
+ using pointer = value_type*;
+ using reference = value_type&;
+ using iterator_category = std::forward_iterator_tag;
+
+ // Construct undefined iterator
+ hash_map_iterator(): my_map(), my_index(), my_bucket(), my_node() {}
+ hash_map_iterator( const hash_map_iterator<Container, typename Container::value_type>& other ) :
+ my_map(other.my_map),
+ my_index(other.my_index),
+ my_bucket(other.my_bucket),
+ my_node(other.my_node)
+ {}
+
+ hash_map_iterator& operator=( const hash_map_iterator<Container, typename Container::value_type>& other ) {
+ my_map = other.my_map;
+ my_index = other.my_index;
+ my_bucket = other.my_bucket;
+ my_node = other.my_node;
+ return *this;
+ }
+
+ Value& operator*() const {
+ __TBB_ASSERT( map_base::is_valid(my_node), "iterator uninitialized or at end of container?" );
+ return my_node->value();
+ }
+
+ Value* operator->() const {return &operator*();}
+
+ hash_map_iterator& operator++() {
+ my_node = static_cast<node*>( my_node->next );
+ if( !my_node ) advance_to_next_bucket();
+ return *this;
+ }
+
+ // Post increment
+ hash_map_iterator operator++(int) {
+ hash_map_iterator old(*this);
+ operator++();
+ return old;
+ }
+private:
+ template <typename C, typename T, typename U>
+ friend bool operator==( const hash_map_iterator<C,T>& i, const hash_map_iterator<C,U>& j );
+
+ template <typename C, typename T, typename U>
+ friend bool operator!=( const hash_map_iterator<C,T>& i, const hash_map_iterator<C,U>& j );
+
+ template <typename C, typename T, typename U>
+ friend ptrdiff_t operator-( const hash_map_iterator<C,T>& i, const hash_map_iterator<C,U>& j );
+
+ template <typename C, typename U>
+ friend class hash_map_iterator;
+
+ template <typename I>
+ friend class hash_map_range;
+
+ void advance_to_next_bucket() { // TODO?: refactor to iterator_base class
+ size_t k = my_index+1;
+ __TBB_ASSERT( my_bucket, "advancing an invalid iterator?");
+ while (k <= my_map->my_mask.load(std::memory_order_relaxed)) {
+ // Following test uses 2's-complement wizardry
+ if( k&(k-2) ) // not the beginning of a segment
+ ++my_bucket;
+ else my_bucket = my_map->get_bucket( k );
+ my_node = static_cast<node*>( my_bucket->node_list.load(std::memory_order_relaxed) );
+ if( map_base::is_valid(my_node) ) {
+ my_index = k; return;
+ }
+ ++k;
+ }
+ my_bucket = 0; my_node = 0; my_index = k; // the end
+ }
+
+ template <typename Key, typename T, typename HashCompare, typename A>
+ friend class concurrent_hash_map;
+
+ hash_map_iterator( const Container &map, std::size_t index, const bucket *b, node_base *n ) :
+ my_map(&map), my_index(index), my_bucket(b), my_node(static_cast<node*>(n))
+ {
+ if( b && !map_base::is_valid(n) )
+ advance_to_next_bucket();
+ }
+
+ // concurrent_hash_map over which we are iterating.
+ const Container *my_map;
+ // Index in hash table for current item
+ size_t my_index;
+ // Pointer to bucket
+ const bucket* my_bucket;
+ // Pointer to node that has current item
+ node* my_node;
+};
+
+template <typename Container, typename T, typename U>
+bool operator==( const hash_map_iterator<Container,T>& i, const hash_map_iterator<Container,U>& j ) {
+ return i.my_node == j.my_node && i.my_map == j.my_map;
+}
+
+template <typename Container, typename T, typename U>
+bool operator!=( const hash_map_iterator<Container,T>& i, const hash_map_iterator<Container,U>& j ) {
+ return i.my_node != j.my_node || i.my_map != j.my_map;
+}
+
+// Range class used with concurrent_hash_map
+template <typename Iterator>
+class hash_map_range {
+ using map_type = typename Iterator::map_type;
+public:
+ // Type for size of a range
+ using size_type = std::size_t;
+ using value_type = typename Iterator::value_type;
+ using reference = typename Iterator::reference;
+ using difference_type = typename Iterator::difference_type;
+ using iterator = Iterator;
+
+ // True if range is empty.
+ bool empty() const {return my_begin == my_end;}
+
+ // True if range can be partitioned into two subranges.
+ bool is_divisible() const {
+ return my_midpoint != my_end;
+ }
+
+ // Split range.
+ hash_map_range( hash_map_range& r, split ) :
+ my_end(r.my_end),
+ my_grainsize(r.my_grainsize)
+ {
+ r.my_end = my_begin = r.my_midpoint;
+        __TBB_ASSERT( !empty(), "Splitting while the range is not divisible" );
+        __TBB_ASSERT( !r.empty(), "Splitting while the range is not divisible" );
+ set_midpoint();
+ r.set_midpoint();
+ }
+
+    // Initialize the range with the given container and grain size
+ hash_map_range( const map_type &map, size_type grainsize_ = 1 ) :
+ my_begin( Iterator( map, 0, map.my_embedded_segment, map.my_embedded_segment->node_list.load(std::memory_order_relaxed) ) ),
+ my_end( Iterator( map, map.my_mask.load(std::memory_order_relaxed) + 1, 0, 0 ) ),
+ my_grainsize( grainsize_ )
+ {
+ __TBB_ASSERT( grainsize_>0, "grainsize must be positive" );
+ set_midpoint();
+ }
+
+ const Iterator begin() const { return my_begin; }
+ const Iterator end() const { return my_end; }
+ // The grain size for this range.
+ size_type grainsize() const { return my_grainsize; }
+
+private:
+ Iterator my_begin;
+ Iterator my_end;
+ mutable Iterator my_midpoint;
+ size_t my_grainsize;
+ // Set my_midpoint to point approximately half way between my_begin and my_end.
+ void set_midpoint() const;
+ template <typename U> friend class hash_map_range;
+};
+
+template <typename Iterator>
+void hash_map_range<Iterator>::set_midpoint() const {
+ // Split by groups of nodes
+ size_t m = my_end.my_index-my_begin.my_index;
+ if( m > my_grainsize ) {
+ m = my_begin.my_index + m/2u;
+ auto b = my_begin.my_map->get_bucket(m);
+ my_midpoint = Iterator(*my_begin.my_map,m,b,b->node_list.load(std::memory_order_relaxed));
+ } else {
+ my_midpoint = my_end;
+ }
+ __TBB_ASSERT( my_begin.my_index <= my_midpoint.my_index,
+ "my_begin is after my_midpoint" );
+ __TBB_ASSERT( my_midpoint.my_index <= my_end.my_index,
+ "my_midpoint is after my_end" );
+ __TBB_ASSERT( my_begin != my_midpoint || my_begin == my_end,
+ "[my_begin, my_midpoint) range should not be empty" );
+}
+
+template <typename Key, typename T,
+ typename HashCompare = tbb_hash_compare<Key>,
+ typename Allocator = tbb_allocator<std::pair<const Key, T>>>
+class concurrent_hash_map : protected hash_map_base<Allocator> {
+ template <typename Container, typename Value>
+ friend class hash_map_iterator;
+
+ template <typename I>
+ friend class hash_map_range;
+ using allocator_traits_type = tbb::detail::allocator_traits<Allocator>;
+public:
+ using base_type = hash_map_base<Allocator>;
+ using key_type = Key;
+ using mapped_type = T;
+    // type_identity is needed to disable implicit deduction guides for the std::initializer_list constructors
+    // and for the copy/move constructors that take an explicit allocator argument
+ using allocator_type = tbb::detail::type_identity_t<Allocator>;
+ using hash_compare_type = tbb::detail::type_identity_t<HashCompare>;
+ using value_type = std::pair<const Key, T>;
+ using size_type = typename base_type::size_type;
+ using difference_type = std::ptrdiff_t;
+
+ using pointer = typename allocator_traits_type::pointer;
+ using const_pointer = typename allocator_traits_type::const_pointer;
+
+ using reference = value_type&;
+ using const_reference = const value_type&;
+ using iterator = hash_map_iterator<concurrent_hash_map, value_type>;
+ using const_iterator = hash_map_iterator<concurrent_hash_map, const value_type>;
+ using range_type = hash_map_range<iterator>;
+ using const_range_type = hash_map_range<const_iterator>;
+
+protected:
+ static_assert(std::is_same<value_type, typename Allocator::value_type>::value,
+ "value_type of the container must be the same as its allocator's");
+
+ friend class const_accessor;
+ class node;
+ using segment_index_type = typename base_type::segment_index_type;
+ using segment_ptr_type = typename base_type::segment_ptr_type;
+ using node_base = typename base_type::node_base;
+ using bucket = typename base_type::bucket;
+ using hashcode_type = typename base_type::hashcode_type;
+ using bucket_allocator_type = typename base_type::bucket_allocator_type;
+ using node_allocator_type = typename base_type::allocator_traits_type::template rebind_alloc<node>;
+ using node_allocator_traits = tbb::detail::allocator_traits<node_allocator_type>;
+ hash_compare_type my_hash_compare;
+
+ class node : public node_base {
+ public:
+ node() {}
+ ~node() {}
+ pointer storage() { return &my_value; }
+ value_type& value() { return *storage(); }
+ private:
+ union {
+ value_type my_value;
+ };
+ };
+
+ void delete_node( node_base *n ) {
+ node_allocator_type node_allocator(this->get_allocator());
+ node_allocator_traits::destroy(node_allocator, static_cast<node*>(n)->storage());
+ node_allocator_traits::destroy(node_allocator, static_cast<node*>(n));
+ node_allocator_traits::deallocate(node_allocator, static_cast<node*>(n), 1);
+ }
+
+ template <typename... Args>
+ static node* create_node(bucket_allocator_type& allocator, Args&&... args) {
+ node_allocator_type node_allocator(allocator);
+ node* node_ptr = node_allocator_traits::allocate(node_allocator, 1);
+ auto guard = make_raii_guard([&] {
+ node_allocator_traits::destroy(node_allocator, node_ptr);
+ node_allocator_traits::deallocate(node_allocator, node_ptr, 1);
+ });
+
+ node_allocator_traits::construct(node_allocator, node_ptr);
+ node_allocator_traits::construct(node_allocator, node_ptr->storage(), std::forward<Args>(args)...);
+ guard.dismiss();
+ return node_ptr;
+ }
+
+ static node* allocate_node_copy_construct(bucket_allocator_type& allocator, const Key &key, const T * t){
+ return create_node(allocator, key, *t);
+ }
+
+ static node* allocate_node_move_construct(bucket_allocator_type& allocator, const Key &key, const T * t){
+ return create_node(allocator, key, std::move(*const_cast<T*>(t)));
+ }
+
+ static node* allocate_node_default_construct(bucket_allocator_type& allocator, const Key &key, const T * ){
+        // Emplace-construct a value-initialized T object inside the pair
+ return create_node(allocator, std::piecewise_construct,
+ std::forward_as_tuple(key), std::forward_as_tuple());
+ }
+
+ static node* do_not_allocate_node(bucket_allocator_type& , const Key &, const T * ){
+ __TBB_ASSERT(false,"this dummy function should not be called");
+ return nullptr;
+ }
+
+ node *search_bucket( const key_type &key, bucket *b ) const {
+ node *n = static_cast<node*>( b->node_list.load(std::memory_order_relaxed) );
+ while (this->is_valid(n) && !my_hash_compare.equal(key, n->value().first))
+ n = static_cast<node*>( n->next );
+ __TBB_ASSERT(n != rehash_req, "Search can be executed only for rehashed bucket");
+ return n;
+ }
+
+    // Bucket accessor: finds a bucket, rehashes it if required, acquires a lock, and provides access to it
+ class bucket_accessor : public bucket::scoped_type {
+ bucket *my_b;
+ public:
+ bucket_accessor( concurrent_hash_map *base, const hashcode_type h, bool writer = false ) { acquire( base, h, writer ); }
+ // find a bucket by masked hashcode, optionally rehash, and acquire the lock
+ inline void acquire( concurrent_hash_map *base, const hashcode_type h, bool writer = false ) {
+ my_b = base->get_bucket( h );
+ // TODO: actually, notification is unnecessary here, just hiding double-check
+ if( my_b->node_list.load(std::memory_order_acquire) == rehash_req
+ && bucket::scoped_type::try_acquire( my_b->mutex, /*write=*/true ) )
+ {
+ if( my_b->node_list.load(std::memory_order_relaxed) == rehash_req ) base->rehash_bucket( my_b, h ); //recursive rehashing
+ }
+ else bucket::scoped_type::acquire( my_b->mutex, writer );
+ __TBB_ASSERT( my_b->node_list.load(std::memory_order_relaxed) != rehash_req, nullptr);
+ }
+ // check whether bucket is locked for write
+ bool is_writer() { return bucket::scoped_type::m_is_writer; }
+ // get bucket pointer
+ bucket *operator() () { return my_b; }
+ };
+
+ // TODO refactor to hash_base
+ void rehash_bucket( bucket *b_new, const hashcode_type hash ) {
+ __TBB_ASSERT( *(intptr_t*)(&b_new->mutex), "b_new must be locked (for write)");
+ __TBB_ASSERT( hash > 1, "The lowermost buckets can't be rehashed" );
+ b_new->node_list.store(empty_rehashed, std::memory_order_release); // mark rehashed
+ hashcode_type mask = (1u << tbb::detail::log2(hash)) - 1; // get parent mask from the topmost bit
+ bucket_accessor b_old( this, hash & mask );
+
+ mask = (mask<<1) | 1; // get full mask for new bucket
+ __TBB_ASSERT( (mask&(mask+1))==0 && (hash & mask) == hash, nullptr );
+ restart:
+ node_base* prev = nullptr;
+ node_base* curr = b_old()->node_list.load(std::memory_order_acquire);
+ while (this->is_valid(curr)) {
+ hashcode_type curr_node_hash = my_hash_compare.hash(static_cast<node*>(curr)->value().first);
+
+ if ((curr_node_hash & mask) == hash) {
+ if (!b_old.is_writer()) {
+ if (!b_old.upgrade_to_writer()) {
+ goto restart; // node ptr can be invalid due to concurrent erase
+ }
+ }
+ node_base* next = curr->next;
+ // exclude from b_old
+ if (prev == nullptr) {
+ b_old()->node_list.store(curr->next, std::memory_order_relaxed);
+ } else {
+ prev->next = curr->next;
+ }
+ this->add_to_bucket(b_new, curr);
+ curr = next;
+ } else {
+ prev = curr;
+ curr = curr->next;
+ }
+ }
+ }
+
+public:
+
+ class accessor;
+ // Combines data access, locking, and garbage collection.
+ class const_accessor : private node::scoped_type /*which derived from no_copy*/ {
+ friend class concurrent_hash_map<Key,T,HashCompare,Allocator>;
+ friend class accessor;
+ public:
+ // Type of value
+ using value_type = const typename concurrent_hash_map::value_type;
+
+ // True if result is empty.
+ bool empty() const { return !my_node; }
+
+ // Set to null
+ void release() {
+ if( my_node ) {
+ node::scoped_type::release();
+ my_node = 0;
+ }
+ }
+
+ // Return reference to associated value in hash table.
+ const_reference operator*() const {
+ __TBB_ASSERT( my_node, "attempt to dereference empty accessor" );
+ return my_node->value();
+ }
+
+ // Return pointer to associated value in hash table.
+ const_pointer operator->() const {
+ return &operator*();
+ }
+
+ // Create empty result
+ const_accessor() : my_node(nullptr) {}
+
+ // Destroy result after releasing the underlying reference.
+ ~const_accessor() {
+ my_node = nullptr; // scoped lock's release() is called in its destructor
+ }
+ protected:
+ bool is_writer() { return node::scoped_type::m_is_writer; }
+ node *my_node;
+ hashcode_type my_hash;
+ };
+
+ // Allows write access to elements and combines data access, locking, and garbage collection.
+ class accessor: public const_accessor {
+ public:
+ // Type of value
+ using value_type = typename concurrent_hash_map::value_type;
+
+ // Return reference to associated value in hash table.
+ reference operator*() const {
+ __TBB_ASSERT( this->my_node, "attempt to dereference empty accessor" );
+ return this->my_node->value();
+ }
+
+ // Return pointer to associated value in hash table.
+ pointer operator->() const {
+ return &operator*();
+ }
+ };
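+    // Illustrative usage sketch (not part of the original header), assuming a table
+    // that maps std::string to int. The element lock is held for as long as the
+    // accessor object is alive:
+    //
+    //     tbb::concurrent_hash_map<std::string, int> table;
+    //     {
+    //         decltype(table)::accessor acc;          // exclusive (write) access
+    //         table.insert(acc, "hits");              // inserts {"hits", 0} if absent
+    //         ++acc->second;                          // safe: write lock is held
+    //     }                                           // lock released here
+    //     int current = 0;
+    //     {
+    //         decltype(table)::const_accessor cacc;   // shared (read) access
+    //         if (table.find(cacc, "hits"))
+    //             current = cacc->second;             // safe: read lock is held
+    //     }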
+
+ explicit concurrent_hash_map( const hash_compare_type& compare, const allocator_type& a = allocator_type() )
+ : base_type(a)
+ , my_hash_compare(compare)
+ {}
+
+ concurrent_hash_map() : concurrent_hash_map(hash_compare_type()) {}
+
+ explicit concurrent_hash_map( const allocator_type& a )
+ : concurrent_hash_map(hash_compare_type(), a)
+ {}
+
+    // Construct an empty table with n preallocated buckets. This number also serves as the initial concurrency level.
+ concurrent_hash_map( size_type n, const allocator_type &a = allocator_type() )
+ : concurrent_hash_map(a)
+ {
+ this->reserve(n);
+ }
+
+ concurrent_hash_map( size_type n, const hash_compare_type& compare, const allocator_type& a = allocator_type() )
+ : concurrent_hash_map(compare, a)
+ {
+ this->reserve(n);
+ }
+
+ // Copy constructor
+ concurrent_hash_map( const concurrent_hash_map &table )
+ : concurrent_hash_map(node_allocator_traits::select_on_container_copy_construction(table.get_allocator()))
+ {
+ try_call( [&] {
+ internal_copy(table);
+ }).on_exception( [&] {
+ this->clear();
+ });
+ }
+
+ concurrent_hash_map( const concurrent_hash_map &table, const allocator_type &a)
+ : concurrent_hash_map(a)
+ {
+ try_call( [&] {
+ internal_copy(table);
+ }).on_exception( [&] {
+ this->clear();
+ });
+ }
+
+ // Move constructor
+ concurrent_hash_map( concurrent_hash_map &&table )
+ : concurrent_hash_map(std::move(table.get_allocator()))
+ {
+ this->internal_move(std::move(table));
+ }
+
+    // Move constructor with a given allocator instance
+ concurrent_hash_map( concurrent_hash_map &&table, const allocator_type &a )
+ : concurrent_hash_map(a)
+ {
+ using is_equal_type = typename node_allocator_traits::is_always_equal;
+ internal_move_construct_with_allocator(std::move(table), a, is_equal_type());
+ }
+
+    // Construct by copying the iteration range [first, last) with a given allocator instance
+ template <typename I>
+ concurrent_hash_map( I first, I last, const allocator_type &a = allocator_type() )
+ : concurrent_hash_map(a)
+ {
+ try_call( [&] {
+ internal_copy(first, last, std::distance(first, last));
+ }).on_exception( [&] {
+ this->clear();
+ });
+ }
+
+ template <typename I>
+ concurrent_hash_map( I first, I last, const hash_compare_type& compare, const allocator_type& a = allocator_type() )
+ : concurrent_hash_map(compare, a)
+ {
+ try_call( [&] {
+ internal_copy(first, last, std::distance(first, last));
+ }).on_exception( [&] {
+ this->clear();
+ });
+ }
+
+ concurrent_hash_map( std::initializer_list<value_type> il, const hash_compare_type& compare = hash_compare_type(), const allocator_type& a = allocator_type() )
+ : concurrent_hash_map(compare, a)
+ {
+ try_call( [&] {
+ internal_copy(il.begin(), il.end(), il.size());
+ }).on_exception( [&] {
+ this->clear();
+ });
+ }
+
+ concurrent_hash_map( std::initializer_list<value_type> il, const allocator_type& a )
+ : concurrent_hash_map(il, hash_compare_type(), a) {}
+
+ // Assignment
+ concurrent_hash_map& operator=( const concurrent_hash_map &table ) {
+ if( this != &table ) {
+ clear();
+ copy_assign_allocators(this->my_allocator, table.my_allocator);
+ internal_copy(table);
+ }
+ return *this;
+ }
+
+ // Move Assignment
+ concurrent_hash_map& operator=( concurrent_hash_map &&table ) {
+ if( this != &table ) {
+ using pocma_type = typename node_allocator_traits::propagate_on_container_move_assignment;
+ using is_equal_type = typename node_allocator_traits::is_always_equal;
+ move_assign_allocators(this->my_allocator, table.my_allocator);
+ internal_move_assign(std::move(table), tbb::detail::disjunction<is_equal_type, pocma_type>());
+ }
+ return *this;
+ }
+
+ // Assignment
+ concurrent_hash_map& operator=( std::initializer_list<value_type> il ) {
+ clear();
+ internal_copy(il.begin(), il.end(), il.size());
+ return *this;
+ }
+
+ // Rehashes and optionally resizes the whole table.
+ /** Useful to optimize performance before or after concurrent operations.
+        Also enables the use of the concurrent find() and count() methods in a serial context. */
+ void rehash(size_type sz = 0) {
+ this->reserve(sz); // TODO: add reduction of number of buckets as well
+ hashcode_type mask = this->my_mask.load(std::memory_order_relaxed);
+ hashcode_type b = (mask+1)>>1; // size or first index of the last segment
+ __TBB_ASSERT((b&(b-1))==0, nullptr); // zero or power of 2
+ bucket *bp = this->get_bucket( b ); // only the last segment should be scanned for rehashing
+ for(; b <= mask; b++, bp++ ) {
+ node_base *n = bp->node_list.load(std::memory_order_relaxed);
+            __TBB_ASSERT( this->is_valid(n) || n == empty_rehashed || n == rehash_req, "Broken internal structure" );
+ __TBB_ASSERT( *reinterpret_cast<intptr_t*>(&bp->mutex) == 0, "concurrent or unexpectedly terminated operation during rehash() execution" );
+ if( n == rehash_req ) { // rehash bucket, conditional because rehashing of a previous bucket may affect this one
+ hashcode_type h = b; bucket *b_old = bp;
+ do {
+ __TBB_ASSERT( h > 1, "The lowermost buckets can't be rehashed" );
+ hashcode_type m = ( 1u<<tbb::detail::log2( h ) ) - 1; // get parent mask from the topmost bit
+ b_old = this->get_bucket( h &= m );
+ } while( b_old->node_list.load(std::memory_order_relaxed) == rehash_req );
+                // now h is the index of the root rehashed bucket b_old
+ this->mark_rehashed_levels( h ); // mark all non-rehashed children recursively across all segments
+ node_base* prev = nullptr;
+ node_base* curr = b_old->node_list.load(std::memory_order_relaxed);
+ while (this->is_valid(curr)) {
+ hashcode_type curr_node_hash = my_hash_compare.hash(static_cast<node*>(curr)->value().first);
+
+ if ((curr_node_hash & mask) != h) { // should be rehashed
+ node_base* next = curr->next;
+ // exclude from b_old
+ if (prev == nullptr) {
+ b_old->node_list.store(curr->next, std::memory_order_relaxed);
+ } else {
+ prev->next = curr->next;
+ }
+ bucket *b_new = this->get_bucket(curr_node_hash & mask);
+                    __TBB_ASSERT(b_new->node_list.load(std::memory_order_relaxed) != rehash_req, "hash() function changed for key in table or internal error" );
+ this->add_to_bucket(b_new, curr);
+ curr = next;
+ } else {
+ prev = curr;
+ curr = curr->next;
+ }
+ }
+ }
+ }
+ }
+
+ // Clear table
+ void clear() {
+ hashcode_type m = this->my_mask.load(std::memory_order_relaxed);
+ __TBB_ASSERT((m&(m+1))==0, "data structure is invalid");
+ this->my_size.store(0, std::memory_order_relaxed);
+ segment_index_type s = this->segment_index_of( m );
+ __TBB_ASSERT( s+1 == this->pointers_per_table || !this->my_table[s+1].load(std::memory_order_relaxed), "wrong mask or concurrent grow" );
+ do {
+ __TBB_ASSERT(this->is_valid(this->my_table[s].load(std::memory_order_relaxed)), "wrong mask or concurrent grow" );
+ segment_ptr_type buckets_ptr = this->my_table[s].load(std::memory_order_relaxed);
+ size_type sz = this->segment_size( s ? s : 1 );
+ for( segment_index_type i = 0; i < sz; i++ )
+ for( node_base *n = buckets_ptr[i].node_list.load(std::memory_order_relaxed);
+ this->is_valid(n); n = buckets_ptr[i].node_list.load(std::memory_order_relaxed) )
+ {
+ buckets_ptr[i].node_list.store(n->next, std::memory_order_relaxed);
+ delete_node( n );
+ }
+ this->delete_segment(s);
+ } while(s-- > 0);
+ this->my_mask.store(this->embedded_buckets - 1, std::memory_order_relaxed);
+ }
+
+ // Clear table and destroy it.
+ ~concurrent_hash_map() { clear(); }
+
+ //------------------------------------------------------------------------
+ // Parallel algorithm support
+ //------------------------------------------------------------------------
+ range_type range( size_type grainsize=1 ) {
+ return range_type( *this, grainsize );
+ }
+ const_range_type range( size_type grainsize=1 ) const {
+ return const_range_type( *this, grainsize );
+ }
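+    // Illustrative sketch (not part of the original header): range() is intended for
+    // TBB parallel algorithms. Assuming oneapi/tbb/parallel_for.h is included and
+    // process() is a user-provided function:
+    //
+    //     tbb::concurrent_hash_map<int, int> table;
+    //     using range_t = decltype(table)::range_type;
+    //     tbb::parallel_for(table.range(), [](const range_t& r) {
+    //         for (auto it = r.begin(); it != r.end(); ++it)
+    //             process(it->first, it->second);
+    //     });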
+
+ //------------------------------------------------------------------------
+ // STL support - not thread-safe methods
+ //------------------------------------------------------------------------
+ iterator begin() { return iterator( *this, 0, this->my_embedded_segment, this->my_embedded_segment->node_list.load(std::memory_order_relaxed) ); }
+ const_iterator begin() const { return const_iterator( *this, 0, this->my_embedded_segment, this->my_embedded_segment->node_list.load(std::memory_order_relaxed) ); }
+ const_iterator cbegin() const { return const_iterator( *this, 0, this->my_embedded_segment, this->my_embedded_segment->node_list.load(std::memory_order_relaxed) ); }
+ iterator end() { return iterator( *this, 0, 0, 0 ); }
+ const_iterator end() const { return const_iterator( *this, 0, 0, 0 ); }
+ const_iterator cend() const { return const_iterator( *this, 0, 0, 0 ); }
+ std::pair<iterator, iterator> equal_range( const Key& key ) { return internal_equal_range( key, end() ); }
+ std::pair<const_iterator, const_iterator> equal_range( const Key& key ) const { return internal_equal_range( key, end() ); }
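+    // Illustrative sketch (not part of the original header), continuing the table
+    // from the accessor example above: these iterators are for serial traversal only
+    // and must not race with concurrent modifications.
+    //
+    //     std::size_t total = 0;
+    //     for (auto it = table.begin(); it != table.end(); ++it)
+    //         total += static_cast<std::size_t>(it->second);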
+
+ // Number of items in table.
+ size_type size() const { return this->my_size.load(std::memory_order_acquire); }
+
+ // True if size()==0.
+ __TBB_nodiscard bool empty() const { return size() == 0; }
+
+ // Upper bound on size.
+ size_type max_size() const {
+ return allocator_traits_type::max_size(base_type::get_allocator());
+ }
+
+ // Returns the current number of buckets
+ size_type bucket_count() const { return this->my_mask.load(std::memory_order_relaxed) + 1; }
+
+ // return allocator object
+ allocator_type get_allocator() const { return base_type::get_allocator(); }
+
+ // swap two instances. Iterators are invalidated
+ void swap(concurrent_hash_map& table) {
+ using pocs_type = typename node_allocator_traits::propagate_on_container_swap;
+ using is_equal_type = typename node_allocator_traits::is_always_equal;
+ swap_allocators(this->my_allocator, table.my_allocator);
+ internal_swap(table, tbb::detail::disjunction<pocs_type, is_equal_type>());
+ }
+
+ //------------------------------------------------------------------------
+ // concurrent map operations
+ //------------------------------------------------------------------------
+
+ // Return count of items (0 or 1)
+ size_type count( const Key &key ) const {
+ return const_cast<concurrent_hash_map*>(this)->lookup(/*insert*/false, key, nullptr, nullptr, /*write=*/false, &do_not_allocate_node );
+ }
+
+ // Find item and acquire a read lock on the item.
+ /** Return true if item is found, false otherwise. */
+ bool find( const_accessor &result, const Key &key ) const {
+ result.release();
+ return const_cast<concurrent_hash_map*>(this)->lookup(/*insert*/false, key, nullptr, &result, /*write=*/false, &do_not_allocate_node );
+ }
+
+ // Find item and acquire a write lock on the item.
+ /** Return true if item is found, false otherwise. */
+ bool find( accessor &result, const Key &key ) {
+ result.release();
+ return lookup(/*insert*/false, key, nullptr, &result, /*write=*/true, &do_not_allocate_node );
+ }
+
+ // Insert item (if not already present) and acquire a read lock on the item.
+ /** Returns true if item is new. */
+ bool insert( const_accessor &result, const Key &key ) {
+ result.release();
+ return lookup(/*insert*/true, key, nullptr, &result, /*write=*/false, &allocate_node_default_construct );
+ }
+
+ // Insert item (if not already present) and acquire a write lock on the item.
+ /** Returns true if item is new. */
+ bool insert( accessor &result, const Key &key ) {
+ result.release();
+ return lookup(/*insert*/true, key, nullptr, &result, /*write=*/true, &allocate_node_default_construct );
+ }
+
+ // Insert item by copying if there is no such key present already and acquire a read lock on the item.
+ /** Returns true if item is new. */
+ bool insert( const_accessor &result, const value_type &value ) {
+ result.release();
+ return lookup(/*insert*/true, value.first, &value.second, &result, /*write=*/false, &allocate_node_copy_construct );
+ }
+
+ // Insert item by copying if there is no such key present already and acquire a write lock on the item.
+ /** Returns true if item is new. */
+ bool insert( accessor &result, const value_type &value ) {
+ result.release();
+ return lookup(/*insert*/true, value.first, &value.second, &result, /*write=*/true, &allocate_node_copy_construct );
+ }
+
+ // Insert item by copying if there is no such key present already
+ /** Returns true if item is inserted. */
+ bool insert( const value_type &value ) {
+ return lookup(/*insert*/true, value.first, &value.second, nullptr, /*write=*/false, &allocate_node_copy_construct );
+ }
+
+    // Insert item by moving if there is no such key present already and acquire a read lock on the item.
+ /** Returns true if item is new. */
+ bool insert( const_accessor &result, value_type && value ) {
+ return generic_move_insert(result, std::move(value));
+ }
+
+    // Insert item by moving if there is no such key present already and acquire a write lock on the item.
+ /** Returns true if item is new. */
+ bool insert( accessor &result, value_type && value ) {
+ return generic_move_insert(result, std::move(value));
+ }
+
+    // Insert item by moving if there is no such key present already
+ /** Returns true if item is inserted. */
+ bool insert( value_type && value ) {
+ return generic_move_insert(accessor_not_used(), std::move(value));
+ }
+
+    // Construct item in place (emplace) if there is no such key present already and acquire a read lock on the item.
+ /** Returns true if item is new. */
+ template <typename... Args>
+ bool emplace( const_accessor &result, Args&&... args ) {
+ return generic_emplace(result, std::forward<Args>(args)...);
+ }
+
+    // Construct item in place (emplace) if there is no such key present already and acquire a write lock on the item.
+ /** Returns true if item is new. */
+ template <typename... Args>
+ bool emplace( accessor &result, Args&&... args ) {
+ return generic_emplace(result, std::forward<Args>(args)...);
+ }
+
+    // Construct item in place (emplace) if there is no such key present already
+ /** Returns true if item is inserted. */
+ template <typename... Args>
+ bool emplace( Args&&... args ) {
+ return generic_emplace(accessor_not_used(), std::forward<Args>(args)...);
+ }
+
+ // Insert range [first, last)
+ template <typename I>
+ void insert( I first, I last ) {
+ for ( ; first != last; ++first )
+ insert( *first );
+ }
+
+ // Insert initializer list
+ void insert( std::initializer_list<value_type> il ) {
+ insert( il.begin(), il.end() );
+ }
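+    // Illustrative sketch (not part of the original header) of the insertion forms
+    // that do not keep a lock on the element, for a std::string -> int table:
+    //
+    //     table.insert({"alpha", 1});                  // copy-insert a value_type
+    //     table.emplace("beta", 2);                    // construct the pair in place
+    //     table.insert({{"gamma", 3}, {"delta", 4}});  // initializer-list insert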
+
+ // Erase item.
+    /** Return true if the item was erased by this particular call. */
+ bool erase( const Key &key ) {
+ node_base *erase_node;
+ hashcode_type const hash = my_hash_compare.hash(key);
+ hashcode_type mask = this->my_mask.load(std::memory_order_acquire);
+ restart:
+ {//lock scope
+ // get bucket
+ bucket_accessor b( this, hash & mask );
+ search:
+ node_base* prev = nullptr;
+ erase_node = b()->node_list.load(std::memory_order_relaxed);
+ while (this->is_valid(erase_node) && !my_hash_compare.equal(key, static_cast<node*>(erase_node)->value().first ) ) {
+ prev = erase_node;
+ erase_node = erase_node->next;
+ }
+
+ if (erase_node == nullptr) { // not found, but mask could be changed
+ if (this->check_mask_race(hash, mask))
+ goto restart;
+ return false;
+ } else if (!b.is_writer() && !b.upgrade_to_writer()) {
+ if (this->check_mask_race(hash, mask)) // contended upgrade, check mask
+ goto restart;
+ goto search;
+ }
+
+ // remove from container
+ if (prev == nullptr) {
+ b()->node_list.store(erase_node->next, std::memory_order_relaxed);
+ } else {
+ prev->next = erase_node->next;
+ }
+ this->my_size--;
+ }
+ {
+ typename node::scoped_type item_locker( erase_node->mutex, /*write=*/true );
+ }
+        // note: no other thread should attempt to acquire this mutex again, so do not try to upgrade the const_accessor!
+ delete_node(erase_node); // Only one thread can delete it due to write lock on the bucket
+ return true;
+ }
+
+ // Erase item by const_accessor.
+    /** Return true if the item was erased by this particular call. */
+ bool erase( const_accessor& item_accessor ) {
+ return exclude( item_accessor );
+ }
+
+ // Erase item by accessor.
+    /** Return true if the item was erased by this particular call. */
+ bool erase( accessor& item_accessor ) {
+ return exclude( item_accessor );
+ }
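+    // Illustrative sketch (not part of the original header): erasure either by key or
+    // through an accessor that already holds the element (the accessor is released):
+    //
+    //     table.erase("alpha");                        // returns false if the key is absent
+    //     decltype(table)::accessor acc;
+    //     if (table.find(acc, "beta"))
+    //         table.erase(acc);                        // erases the locked element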
+
+protected:
+ // Insert or find item and optionally acquire a lock on the item.
+ bool lookup( bool op_insert, const Key &key, const T *t, const_accessor *result, bool write, node* (*allocate_node)(bucket_allocator_type&,
+ const Key&, const T*), node *tmp_n = 0)
+ {
+ __TBB_ASSERT( !result || !result->my_node, nullptr );
+ bool return_value;
+ hashcode_type const h = my_hash_compare.hash( key );
+ hashcode_type m = this->my_mask.load(std::memory_order_acquire);
+ segment_index_type grow_segment = 0;
+ node *n;
+ restart:
+ {//lock scope
+ __TBB_ASSERT((m&(m+1))==0, "data structure is invalid");
+ return_value = false;
+ // get bucket
+ bucket_accessor b( this, h & m );
+ // find a node
+ n = search_bucket( key, b() );
+ if( op_insert ) {
+ // [opt] insert a key
+ if( !n ) {
+ if( !tmp_n ) {
+ tmp_n = allocate_node(base_type::get_allocator(), key, t);
+ }
+ if( !b.is_writer() && !b.upgrade_to_writer() ) { // TODO: improved insertion
+                        // Rerun search_bucket, in case another thread inserted the item during the upgrade.
+ n = search_bucket( key, b() );
+ if( this->is_valid(n) ) { // unfortunately, it did
+ b.downgrade_to_reader();
+ goto exists;
+ }
+ }
+ if( this->check_mask_race(h, m) )
+ goto restart; // b.release() is done in ~b().
+ // insert and set flag to grow the container
+ grow_segment = this->insert_new_node( b(), n = tmp_n, m );
+ tmp_n = 0;
+ return_value = true;
+ }
+ } else { // find or count
+ if( !n ) {
+ if( this->check_mask_race( h, m ) )
+ goto restart; // b.release() is done in ~b(). TODO: replace by continue
+ return false;
+ }
+ return_value = true;
+ }
+ exists:
+ if( !result ) goto check_growth;
+ // TODO: the following seems as generic/regular operation
+ // acquire the item
+ if( !result->try_acquire( n->mutex, write ) ) {
+ for( tbb::detail::atomic_backoff backoff(true);; ) {
+ if( result->try_acquire( n->mutex, write ) ) break;
+ if( !backoff.bounded_pause() ) {
+ // the wait takes really long, restart the operation
+ b.release();
+ __TBB_ASSERT( !op_insert || !return_value, "Can't acquire new item in locked bucket?" );
+ yield();
+ m = this->my_mask.load(std::memory_order_acquire);
+ goto restart;
+ }
+ }
+ }
+ }//lock scope
+ result->my_node = n;
+ result->my_hash = h;
+ check_growth:
+ // [opt] grow the container
+ if( grow_segment ) {
+ this->enable_segment( grow_segment );
+ }
+ if( tmp_n ) // if op_insert only
+ delete_node( tmp_n );
+ return return_value;
+ }
+
+ struct accessor_not_used { void release(){}};
+ friend const_accessor* accessor_location( accessor_not_used const& ){ return nullptr;}
+ friend const_accessor* accessor_location( const_accessor & a ) { return &a;}
+
+ friend bool is_write_access_needed( accessor const& ) { return true;}
+ friend bool is_write_access_needed( const_accessor const& ) { return false;}
+ friend bool is_write_access_needed( accessor_not_used const& ) { return false;}
+
+ template <typename Accessor>
+ bool generic_move_insert( Accessor && result, value_type && value ) {
+ result.release();
+ return lookup(/*insert*/true, value.first, &value.second, accessor_location(result), is_write_access_needed(result), &allocate_node_move_construct );
+ }
+
+ template <typename Accessor, typename... Args>
+ bool generic_emplace( Accessor && result, Args &&... args ) {
+ result.release();
+ node * node_ptr = create_node(base_type::get_allocator(), std::forward<Args>(args)...);
+ return lookup(/*insert*/true, node_ptr->value().first, nullptr, accessor_location(result), is_write_access_needed(result), &do_not_allocate_node, node_ptr );
+ }
+
+ // delete item by accessor
+ bool exclude( const_accessor &item_accessor ) {
+ __TBB_ASSERT( item_accessor.my_node, nullptr );
+ node_base *const exclude_node = item_accessor.my_node;
+ hashcode_type const hash = item_accessor.my_hash;
+ hashcode_type mask = this->my_mask.load(std::memory_order_acquire);
+ do {
+ // get bucket
+ bucket_accessor b( this, hash & mask, /*writer=*/true );
+ node_base* prev = nullptr;
+ node_base* curr = b()->node_list.load(std::memory_order_relaxed);
+
+ while (curr && curr != exclude_node) {
+ prev = curr;
+ curr = curr->next;
+ }
+
+ if (curr == nullptr) { // someone else was first
+ if (this->check_mask_race(hash, mask))
+ continue;
+ item_accessor.release();
+ return false;
+ }
+ __TBB_ASSERT( curr == exclude_node, nullptr );
+ // remove from container
+ if (prev == nullptr) {
+ b()->node_list.store(curr->next, std::memory_order_relaxed);
+ } else {
+ prev->next = curr->next;
+ }
+
+ this->my_size--;
+ break;
+ } while(true);
+ if (!item_accessor.is_writer()) { // need to get exclusive lock
+ item_accessor.upgrade_to_writer(); // return value means nothing here
+ }
+
+ item_accessor.release();
+ delete_node(exclude_node); // Only one thread can delete it
+ return true;
+ }
+
+    // Returns a pair of iterators delimiting the item with the given key, or (end_, end_) if no such item exists
+ template <typename I>
+ std::pair<I, I> internal_equal_range( const Key& key, I end_ ) const {
+ hashcode_type h = my_hash_compare.hash( key );
+ hashcode_type m = this->my_mask.load(std::memory_order_relaxed);
+ __TBB_ASSERT((m&(m+1))==0, "data structure is invalid");
+ h &= m;
+ bucket *b = this->get_bucket( h );
+ while ( b->node_list.load(std::memory_order_relaxed) == rehash_req ) {
+ m = ( 1u<<tbb::detail::log2( h ) ) - 1; // get parent mask from the topmost bit
+ b = this->get_bucket( h &= m );
+ }
+ node *n = search_bucket( key, b );
+ if( !n )
+ return std::make_pair(end_, end_);
+ iterator lower(*this, h, b, n), upper(lower);
+ return std::make_pair(lower, ++upper);
+ }
+
+ // Copy "source" to *this, where *this must start out empty.
+ void internal_copy( const concurrent_hash_map& source ) {
+ hashcode_type mask = source.my_mask.load(std::memory_order_relaxed);
+ if( this->my_mask.load(std::memory_order_relaxed) == mask ) { // optimized version
+ this->reserve(source.my_size.load(std::memory_order_relaxed)); // TODO: load_factor?
+ bucket *dst = 0, *src = 0;
+ bool rehash_required = false;
+ for( hashcode_type k = 0; k <= mask; k++ ) {
+ if( k & (k-2) ) ++dst,src++; // not the beginning of a segment
+ else { dst = this->get_bucket( k ); src = source.get_bucket( k ); }
+ __TBB_ASSERT( dst->node_list.load(std::memory_order_relaxed) != rehash_req, "Invalid bucket in destination table");
+ node *n = static_cast<node*>( src->node_list.load(std::memory_order_relaxed) );
+ if( n == rehash_req ) { // source is not rehashed, items are in previous buckets
+ rehash_required = true;
+ dst->node_list.store(rehash_req, std::memory_order_relaxed);
+ } else for(; n; n = static_cast<node*>( n->next ) ) {
+ node* node_ptr = create_node(base_type::get_allocator(), n->value().first, n->value().second);
+ this->add_to_bucket( dst, node_ptr);
+ this->my_size.fetch_add(1, std::memory_order_relaxed);
+ }
+ }
+ if( rehash_required ) rehash();
+ } else internal_copy(source.begin(), source.end(), source.my_size.load(std::memory_order_relaxed));
+ }
+
+ template <typename I>
+ void internal_copy( I first, I last, size_type reserve_size ) {
+ this->reserve(reserve_size); // TODO: load_factor?
+ hashcode_type m = this->my_mask.load(std::memory_order_relaxed);
+ for(; first != last; ++first) {
+ hashcode_type h = my_hash_compare.hash( (*first).first );
+ bucket *b = this->get_bucket( h & m );
+ __TBB_ASSERT( b->node_list.load(std::memory_order_relaxed) != rehash_req, "Invalid bucket in destination table");
+ node* node_ptr = create_node(base_type::get_allocator(), (*first).first, (*first).second);
+ this->add_to_bucket( b, node_ptr );
+ ++this->my_size; // TODO: replace by non-atomic op
+ }
+ }
+
+ void internal_move_construct_with_allocator( concurrent_hash_map&& other, const allocator_type&,
+ /*is_always_equal=*/std::true_type )
+ {
+ this->internal_move(std::move(other));
+ }
+
+ void internal_move_construct_with_allocator( concurrent_hash_map&& other, const allocator_type& a,
+ /*is_always_equal=*/std::false_type )
+ {
+ if (a == other.get_allocator()){
+ this->internal_move(std::move(other));
+ } else {
+ try_call( [&] {
+ internal_copy(std::make_move_iterator(other.begin()), std::make_move_iterator(other.end()),
+ other.size());
+ }).on_exception( [&] {
+ this->clear();
+ });
+ }
+ }
+
+ void internal_move_assign( concurrent_hash_map&& other,
+ /*is_always_equal || POCMA = */std::true_type)
+ {
+ this->internal_move(std::move(other));
+ }
+
+ void internal_move_assign(concurrent_hash_map&& other, /*is_always_equal=*/ std::false_type) {
+ if (this->my_allocator == other.my_allocator) {
+ this->internal_move(std::move(other));
+ } else {
+ //do per element move
+ internal_copy(std::make_move_iterator(other.begin()), std::make_move_iterator(other.end()),
+ other.size());
+ }
+ }
+
+ void internal_swap(concurrent_hash_map& other, /*is_always_equal || POCS = */ std::true_type) {
+ this->internal_swap_content(other);
+ }
+
+ void internal_swap(concurrent_hash_map& other, /*is_always_equal || POCS = */ std::false_type) {
+ __TBB_ASSERT(this->my_allocator == other.my_allocator, nullptr);
+ this->internal_swap_content(other);
+ }
+
+ // Fast find when no concurrent erasure is used. For internal use inside TBB only!
+ /** Return pointer to item with given key, or nullptr if no such item exists.
+ Must not be called concurrently with erasure operations. */
+ const_pointer internal_fast_find( const Key& key ) const {
+ hashcode_type h = my_hash_compare.hash( key );
+ hashcode_type m = this->my_mask.load(std::memory_order_acquire);
+ node *n;
+ restart:
+ __TBB_ASSERT((m&(m+1))==0, "data structure is invalid");
+ bucket *b = this->get_bucket( h & m );
+ // TODO: actually, notification is unnecessary here, just hiding double-check
+ if( b->node_list.load(std::memory_order_acquire) == rehash_req )
+ {
+ typename bucket::scoped_type lock;
+ if( lock.try_acquire( b->mutex, /*write=*/true ) ) {
+ if( b->node_list.load(std::memory_order_relaxed) == rehash_req)
+ const_cast<concurrent_hash_map*>(this)->rehash_bucket( b, h & m ); //recursive rehashing
+ }
+ else lock.acquire( b->mutex, /*write=*/false );
+ __TBB_ASSERT(b->node_list.load(std::memory_order_relaxed) != rehash_req,nullptr);
+ }
+ n = search_bucket( key, b );
+ if( n )
+ return n->storage();
+ else if( this->check_mask_race( h, m ) )
+ goto restart;
+ return 0;
+ }
+};
+
+#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
+template <typename It,
+ typename HashCompare = tbb_hash_compare<iterator_key_t<It>>,
+ typename Alloc = tbb_allocator<iterator_alloc_pair_t<It>>,
+ typename = std::enable_if_t<is_input_iterator_v<It>>,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>,
+ typename = std::enable_if_t<!is_allocator_v<HashCompare>>>
+concurrent_hash_map( It, It, HashCompare = HashCompare(), Alloc = Alloc() )
+-> concurrent_hash_map<iterator_key_t<It>, iterator_mapped_t<It>, HashCompare, Alloc>;
+
+template <typename It, typename Alloc,
+ typename = std::enable_if_t<is_input_iterator_v<It>>,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>>
+concurrent_hash_map( It, It, Alloc )
+-> concurrent_hash_map<iterator_key_t<It>, iterator_mapped_t<It>, tbb_hash_compare<iterator_key_t<It>>, Alloc>;
+
+template <typename Key, typename T,
+ typename HashCompare = tbb_hash_compare<std::remove_const_t<Key>>,
+ typename Alloc = tbb_allocator<std::pair<const Key, T>>,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>,
+ typename = std::enable_if_t<!is_allocator_v<HashCompare>>>
+concurrent_hash_map( std::initializer_list<std::pair<Key, T>>, HashCompare = HashCompare(), Alloc = Alloc() )
+-> concurrent_hash_map<std::remove_const_t<Key>, T, HashCompare, Alloc>;
+
+template <typename Key, typename T, typename Alloc,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>>
+concurrent_hash_map( std::initializer_list<std::pair<Key, T>>, Alloc )
+-> concurrent_hash_map<std::remove_const_t<Key>, T, tbb_hash_compare<std::remove_const_t<Key>>, Alloc>;
+
+#endif /* __TBB_CPP17_DEDUCTION_GUIDES_PRESENT */
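+// Illustrative sketch (not part of the original header): with the guides above a
+// C++17 compiler can deduce the key and mapped types from an iterator range, e.g.
+//
+//     std::vector<std::pair<int, std::string>> src{{1, "one"}, {2, "two"}};
+//     tbb::concurrent_hash_map table(src.begin(), src.end());
+//     // deduces tbb::concurrent_hash_map<int, std::string>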
+
+template <typename Key, typename T, typename HashCompare, typename A1, typename A2>
+inline bool operator==(const concurrent_hash_map<Key, T, HashCompare, A1> &a, const concurrent_hash_map<Key, T, HashCompare, A2> &b) {
+ if(a.size() != b.size()) return false;
+ typename concurrent_hash_map<Key, T, HashCompare, A1>::const_iterator i(a.begin()), i_end(a.end());
+ typename concurrent_hash_map<Key, T, HashCompare, A2>::const_iterator j, j_end(b.end());
+ for(; i != i_end; ++i) {
+ j = b.equal_range(i->first).first;
+ if( j == j_end || !(i->second == j->second) ) return false;
+ }
+ return true;
+}
+
+#if !__TBB_CPP20_COMPARISONS_PRESENT
+template <typename Key, typename T, typename HashCompare, typename A1, typename A2>
+inline bool operator!=(const concurrent_hash_map<Key, T, HashCompare, A1> &a, const concurrent_hash_map<Key, T, HashCompare, A2> &b)
+{ return !(a == b); }
+#endif // !__TBB_CPP20_COMPARISONS_PRESENT
+
+template <typename Key, typename T, typename HashCompare, typename A>
+inline void swap(concurrent_hash_map<Key, T, HashCompare, A> &a, concurrent_hash_map<Key, T, HashCompare, A> &b)
+{ a.swap( b ); }
+
+} // namespace d1
+} // namespace detail
+
+inline namespace v1 {
+ using detail::split;
+ using detail::d1::concurrent_hash_map;
+ using detail::d1::tbb_hash_compare;
+} // namespace v1
+
+} // namespace tbb
+
+#endif /* __TBB_concurrent_hash_map_H */
diff --git a/contrib/libs/tbb/include/oneapi/tbb/concurrent_lru_cache.h b/contrib/libs/tbb/include/oneapi/tbb/concurrent_lru_cache.h
new file mode 100644
index 0000000000..b83dd5f8c1
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/concurrent_lru_cache.h
@@ -0,0 +1,364 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_concurrent_lru_cache_H
+#define __TBB_concurrent_lru_cache_H
+
+#if ! TBB_PREVIEW_CONCURRENT_LRU_CACHE
+ #error Set TBB_PREVIEW_CONCURRENT_LRU_CACHE to include concurrent_lru_cache.h
+#endif
+
+#include "detail/_assert.h"
+#include "detail/_aggregator.h"
+
+#include <map> // for std::map
+#include <list> // for std::list
+#include <utility> // for std::make_pair
+#include <algorithm> // for std::find
+#include <atomic> // for std::atomic<bool>
+
+namespace tbb {
+
+namespace detail {
+namespace d1 {
+
+//-----------------------------------------------------------------------------
+// Concurrent LRU cache
+//-----------------------------------------------------------------------------
+
+template<typename KeyT, typename ValT, typename KeyToValFunctorT = ValT (*) (KeyT)>
+class concurrent_lru_cache : no_assign {
+// encapsulated helper classes
+private:
+ struct handle_object;
+ struct storage_map_value_type;
+
+ struct aggregator_operation;
+ struct retrieve_aggregator_operation;
+ struct signal_end_of_usage_aggregator_operation;
+
+// typedefs
+public:
+ using key_type = KeyT;
+ using value_type = ValT;
+ using pointer = ValT*;
+ using reference = ValT&;
+ using const_pointer = const ValT*;
+ using const_reference = const ValT&;
+
+ using value_function_type = KeyToValFunctorT;
+ using handle = handle_object;
+private:
+ using lru_cache_type = concurrent_lru_cache<KeyT, ValT, KeyToValFunctorT>;
+
+ using storage_map_type = std::map<key_type, storage_map_value_type>;
+ using storage_map_iterator_type = typename storage_map_type::iterator;
+ using storage_map_pointer_type = typename storage_map_type::pointer;
+ using storage_map_reference_type = typename storage_map_type::reference;
+
+ using history_list_type = std::list<storage_map_iterator_type>;
+ using history_list_iterator_type = typename history_list_type::iterator;
+
+ using aggregator_operation_type = aggregator_operation;
+ using aggregator_function_type = aggregating_functor<lru_cache_type, aggregator_operation_type>;
+ using aggregator_type = aggregator<aggregator_function_type, aggregator_operation_type>;
+
+ friend class aggregating_functor<lru_cache_type,aggregator_operation_type>;
+
+// fields
+private:
+ value_function_type my_value_function;
+ aggregator_type my_aggregator;
+
+ storage_map_type my_storage_map; // storage map for used objects
+ history_list_type my_history_list; // history list for unused objects
+ const std::size_t my_history_list_capacity; // history list's allowed capacity
+
+// interface
+public:
+
+ concurrent_lru_cache(value_function_type value_function, std::size_t cache_capacity)
+ : my_value_function(value_function), my_history_list_capacity(cache_capacity) {
+ my_aggregator.initialize_handler(aggregator_function_type(this));
+ }
+
+ handle operator[](key_type key) {
+ retrieve_aggregator_operation op(key);
+ my_aggregator.execute(&op);
+
+ if (op.is_new_value_needed()) {
+ op.result().second.my_value = my_value_function(key);
+ op.result().second.my_is_ready.store(true, std::memory_order_release);
+ } else {
+ spin_wait_while_eq(op.result().second.my_is_ready, false);
+ }
+
+ return handle(*this, op.result());
+ }
+
+private:
+
+ void handle_operations(aggregator_operation* op_list) {
+ while (op_list) {
+ op_list->cast_and_handle(*this);
+ aggregator_operation* prev_op = op_list;
+ op_list = op_list->next;
+
+ (prev_op->status).store(1, std::memory_order_release);
+ }
+ }
+
+ void signal_end_of_usage(storage_map_reference_type map_record_ref) {
+ signal_end_of_usage_aggregator_operation op(map_record_ref);
+ my_aggregator.execute(&op);
+ }
+
+ void signal_end_of_usage_serial(storage_map_reference_type map_record_ref) {
+ storage_map_iterator_type map_it = my_storage_map.find(map_record_ref.first);
+
+        __TBB_ASSERT(map_it != my_storage_map.end(),
+            "the cache should not return past-the-end iterators to the outside world");
+        __TBB_ASSERT(&(*map_it) == &map_record_ref,
+            "a dangling reference has been returned to the outside world: data race?");
+        __TBB_ASSERT(std::find(my_history_list.begin(), my_history_list.end(), map_it) == my_history_list.end(),
+            "an object in use should not be in the list of unused objects");
+
+        // if this was the last reference, put the item into the LRU history
+ if (! --(map_it->second.my_ref_counter)) {
+ // if the LRU history is full, evict the oldest items to get space
+ if (my_history_list.size() >= my_history_list_capacity) {
+ std::size_t number_of_elements_to_evict = 1 + my_history_list.size() - my_history_list_capacity;
+
+ for (std::size_t i = 0; i < number_of_elements_to_evict; ++i) {
+ storage_map_iterator_type map_it_to_evict = my_history_list.back();
+
+                    __TBB_ASSERT(map_it_to_evict->second.my_ref_counter == 0,
+                        "an item to be evicted should not have live references");
+
+ // TODO: can we use forward_list instead of list? pop_front / insert_after last
+ my_history_list.pop_back();
+ my_storage_map.erase(map_it_to_evict);
+ }
+ }
+
+ // TODO: can we use forward_list instead of list? pop_front / insert_after last
+ my_history_list.push_front(map_it);
+ map_it->second.my_history_list_iterator = my_history_list.begin();
+ }
+ }
+
+ storage_map_reference_type retrieve_serial(key_type key, bool& is_new_value_needed) {
+ storage_map_iterator_type map_it = my_storage_map.find(key);
+
+ if (map_it == my_storage_map.end()) {
+ map_it = my_storage_map.emplace_hint(
+ map_it, std::piecewise_construct, std::make_tuple(key), std::make_tuple(value_type(), 0, my_history_list.end(), false));
+ is_new_value_needed = true;
+ } else {
+ history_list_iterator_type list_it = map_it->second.my_history_list_iterator;
+ if (list_it != my_history_list.end()) {
+                __TBB_ASSERT(map_it->second.my_ref_counter == 0,
+                    "an item to be evicted should not have live references");
+
+                // The item is going to be used, so it is no longer a candidate for eviction;
+                // remove it from the LRU history.
+ my_history_list.erase(list_it);
+ map_it->second.my_history_list_iterator = my_history_list.end();
+ }
+ }
+
+ ++(map_it->second.my_ref_counter);
+ return *map_it;
+ }
+};
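+// Illustrative usage sketch (not part of the original header). It assumes that
+// TBB_PREVIEW_CONCURRENT_LRU_CACHE is defined before including this header and that
+// load_resource() and use() are user-provided functions:
+//
+//     std::string load_resource(int id);                // expensive to compute
+//
+//     tbb::concurrent_lru_cache<int, std::string> cache(&load_resource, /*capacity*/ 8);
+//     {
+//         auto h = cache[42];                           // computed on first use, cached afterwards
+//         use(h.value());                               // the item is pinned while h is alive
+//     }                                                 // h released: the item becomes evictable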
+
+//-----------------------------------------------------------------------------
+// Value type for storage map in concurrent LRU cache
+//-----------------------------------------------------------------------------
+
+template<typename KeyT, typename ValT, typename KeyToValFunctorT>
+struct concurrent_lru_cache<KeyT, ValT, KeyToValFunctorT>::storage_map_value_type {
+//typedefs
+public:
+ using ref_counter_type = std::size_t;
+
+// fields
+public:
+ value_type my_value;
+ ref_counter_type my_ref_counter;
+ history_list_iterator_type my_history_list_iterator;
+ std::atomic<bool> my_is_ready;
+
+// interface
+public:
+ storage_map_value_type(
+ value_type const& value, ref_counter_type ref_counter,
+ history_list_iterator_type history_list_iterator, bool is_ready)
+ : my_value(value), my_ref_counter(ref_counter),
+ my_history_list_iterator(history_list_iterator), my_is_ready(is_ready) {}
+};
+
+//-----------------------------------------------------------------------------
+// Handle object for operator[] in concurrent LRU cache
+//-----------------------------------------------------------------------------
+
+template<typename KeyT, typename ValT, typename KeyToValFunctorT>
+struct concurrent_lru_cache<KeyT, ValT, KeyToValFunctorT>::handle_object {
+// fields
+private:
+ lru_cache_type* my_lru_cache_ptr;
+ storage_map_pointer_type my_map_record_ptr;
+
+// interface
+public:
+ handle_object()
+ : my_lru_cache_ptr(nullptr), my_map_record_ptr(nullptr) {}
+ handle_object(lru_cache_type& lru_cache_ref, storage_map_reference_type map_record_ref)
+ : my_lru_cache_ptr(&lru_cache_ref), my_map_record_ptr(&map_record_ref) {}
+
+ handle_object(handle_object&) = delete;
+ void operator=(handle_object&) = delete;
+
+ handle_object(handle_object&& other)
+ : my_lru_cache_ptr(other.my_lru_cache_ptr), my_map_record_ptr(other.my_map_record_ptr) {
+
+        __TBB_ASSERT(
+            bool(other.my_lru_cache_ptr) == bool(other.my_map_record_ptr),
+            "invalid state of the object being moved from?");
+
+ other.my_lru_cache_ptr = nullptr;
+ other.my_map_record_ptr = nullptr;
+ }
+
+ handle_object& operator=(handle_object&& other) {
+        __TBB_ASSERT(
+            bool(other.my_lru_cache_ptr) == bool(other.my_map_record_ptr),
+            "invalid state of the object being moved from?");
+
+ if (my_lru_cache_ptr)
+ my_lru_cache_ptr->signal_end_of_usage(*my_map_record_ptr);
+
+ my_lru_cache_ptr = other.my_lru_cache_ptr;
+ my_map_record_ptr = other.my_map_record_ptr;
+ other.my_lru_cache_ptr = nullptr;
+ other.my_map_record_ptr = nullptr;
+
+ return *this;
+ }
+
+ ~handle_object() {
+ if (my_lru_cache_ptr)
+ my_lru_cache_ptr->signal_end_of_usage(*my_map_record_ptr);
+ }
+
+ operator bool() const {
+ return (my_lru_cache_ptr && my_map_record_ptr);
+ }
+
+ value_type& value() {
+ __TBB_ASSERT(my_lru_cache_ptr, "get value from already moved object?");
+ __TBB_ASSERT(my_map_record_ptr, "get value from an invalid or already moved object?");
+
+ return my_map_record_ptr->second.my_value;
+ }
+};
+
+//-----------------------------------------------------------------------------
+// Aggregator operation for aggregator type in concurrent LRU cache
+//-----------------------------------------------------------------------------
+
+template<typename KeyT, typename ValT, typename KeyToValFunctorT>
+struct concurrent_lru_cache<KeyT, ValT, KeyToValFunctorT>::aggregator_operation
+ : aggregated_operation<aggregator_operation> {
+// encapsulated helper classes
+public:
+ enum class op_type { retrieve, signal_end_of_usage };
+
+// fields
+private:
+ op_type my_op;
+
+// interface
+public:
+ aggregator_operation(op_type op) : my_op(op) {}
+
+ // TODO: aggregator_operation can be implemented
+ // - as a statically typed variant type or CRTP? (static, dependent on the use case)
+ // - or use pointer to function and apply_visitor (dynamic)
+ // - or use virtual functions (dynamic)
+ void cast_and_handle(lru_cache_type& lru_cache_ref) {
+ if (my_op == op_type::retrieve)
+ static_cast<retrieve_aggregator_operation*>(this)->handle(lru_cache_ref);
+ else
+ static_cast<signal_end_of_usage_aggregator_operation*>(this)->handle(lru_cache_ref);
+ }
+};
+
+template<typename KeyT, typename ValT, typename KeyToValFunctorT>
+struct concurrent_lru_cache<KeyT, ValT, KeyToValFunctorT>::retrieve_aggregator_operation
+ : aggregator_operation, private no_assign {
+public:
+ key_type my_key;
+ storage_map_pointer_type my_map_record_ptr;
+ bool my_is_new_value_needed;
+
+public:
+ retrieve_aggregator_operation(key_type key)
+ : aggregator_operation(aggregator_operation::op_type::retrieve),
+ my_key(key), my_is_new_value_needed(false) {}
+
+ void handle(lru_cache_type& lru_cache_ref) {
+ my_map_record_ptr = &lru_cache_ref.retrieve_serial(my_key, my_is_new_value_needed);
+ }
+
+ storage_map_reference_type result() { return *my_map_record_ptr; }
+
+ bool is_new_value_needed() { return my_is_new_value_needed; }
+};
+
+template<typename KeyT, typename ValT, typename KeyToValFunctorT>
+struct concurrent_lru_cache<KeyT, ValT, KeyToValFunctorT>::signal_end_of_usage_aggregator_operation
+ : aggregator_operation, private no_assign {
+
+private:
+ storage_map_reference_type my_map_record_ref;
+
+public:
+ signal_end_of_usage_aggregator_operation(storage_map_reference_type map_record_ref)
+ : aggregator_operation(aggregator_operation::op_type::signal_end_of_usage),
+ my_map_record_ref(map_record_ref) {}
+
+ void handle(lru_cache_type& lru_cache_ref) {
+ lru_cache_ref.signal_end_of_usage_serial(my_map_record_ref);
+ }
+};
+
+// TODO: if we have guarantees that KeyToValFunctorT always has
+// ValT as a return type and KeyT as an argument type
+// we can deduce template parameters of concurrent_lru_cache
+// by pattern matching on KeyToValFunctorT
+
+} // namespace d1
+} // namespace detail
+
+inline namespace v1 {
+
+using detail::d1::concurrent_lru_cache;
+
+} // inline namespace v1
+} // namespace tbb
+
+#endif // __TBB_concurrent_lru_cache_H
diff --git a/contrib/libs/tbb/include/oneapi/tbb/concurrent_map.h b/contrib/libs/tbb/include/oneapi/tbb/concurrent_map.h
new file mode 100644
index 0000000000..ae389d4f42
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/concurrent_map.h
@@ -0,0 +1,342 @@
+/*
+ Copyright (c) 2019-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_concurrent_map_H
+#define __TBB_concurrent_map_H
+
+#include "detail/_namespace_injection.h"
+#include "detail/_concurrent_skip_list.h"
+#include "tbb_allocator.h"
+#include <functional>
+#include <tuple>
+#include <utility>
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+template<typename Key, typename Value, typename KeyCompare, typename RandomGenerator,
+ typename Allocator, bool AllowMultimapping>
+struct map_traits {
+ static constexpr std::size_t max_level = RandomGenerator::max_level;
+ using random_level_generator_type = RandomGenerator;
+ using key_type = Key;
+ using mapped_type = Value;
+ using compare_type = KeyCompare;
+ using value_type = std::pair<const key_type, mapped_type>;
+ using reference = value_type&;
+ using const_reference = const value_type&;
+ using allocator_type = Allocator;
+
+ static constexpr bool allow_multimapping = AllowMultimapping;
+
+ class value_compare {
+ public:
+ bool operator()(const value_type& lhs, const value_type& rhs) const {
+ return comp(lhs.first, rhs.first);
+ }
+
+ protected:
+ value_compare(compare_type c) : comp(c) {}
+
+ friend struct map_traits;
+
+ compare_type comp;
+ };
+
+ static value_compare value_comp(compare_type comp) { return value_compare(comp); }
+
+ static const key_type& get_key(const_reference val) {
+ return val.first;
+ }
+}; // struct map_traits
+
+template <typename Key, typename Value, typename Compare, typename Allocator>
+class concurrent_multimap;
+
+template <typename Key, typename Value, typename Compare = std::less<Key>, typename Allocator = tbb::tbb_allocator<std::pair<const Key, Value>>>
+class concurrent_map : public concurrent_skip_list<map_traits<Key, Value, Compare, concurrent_geometric_level_generator<32>, Allocator, false>> {
+ using base_type = concurrent_skip_list<map_traits<Key, Value, Compare, concurrent_geometric_level_generator<32>, Allocator, false>>;
+public:
+ using key_type = Key;
+ using mapped_type = Value;
+ using value_type = typename base_type::value_type;
+ using size_type = typename base_type::size_type;
+ using difference_type = typename base_type::difference_type;
+ using key_compare = Compare;
+ using value_compare = typename base_type::value_compare;
+ using allocator_type = Allocator;
+
+ using reference = typename base_type::reference;
+ using const_reference = typename base_type::const_reference;
+ using pointer = typename base_type::pointer;
+ using const_pointer = typename base_type::const_pointer;
+
+ using iterator = typename base_type::iterator;
+ using const_iterator = typename base_type::const_iterator;
+
+ using node_type = typename base_type::node_type;
+
+ // Include constructors of base type
+ using base_type::base_type;
+ using base_type::operator=;
+
+ // Required for implicit deduction guides
+ concurrent_map() = default;
+ concurrent_map( const concurrent_map& ) = default;
+ concurrent_map( const concurrent_map& other, const allocator_type& alloc ) : base_type(other, alloc) {}
+ concurrent_map( concurrent_map&& ) = default;
+ concurrent_map( concurrent_map&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {}
+ // Required to respect the rule of 5
+ concurrent_map& operator=( const concurrent_map& ) = default;
+ concurrent_map& operator=( concurrent_map&& ) = default;
+
+ // Observers
+ mapped_type& at(const key_type& key) {
+ iterator it = this->find(key);
+
+ if (it == this->end()) {
+ throw_exception(exception_id::invalid_key);
+ }
+ return it->second;
+ }
+
+ const mapped_type& at(const key_type& key) const {
+ return const_cast<concurrent_map*>(this)->at(key);
+ }
+
+ mapped_type& operator[](const key_type& key) {
+ iterator it = this->find(key);
+
+ if (it == this->end()) {
+ it = this->emplace(std::piecewise_construct, std::forward_as_tuple(key), std::tuple<>()).first;
+ }
+ return it->second;
+ }
+
+ mapped_type& operator[](key_type&& key) {
+ iterator it = this->find(key);
+
+ if (it == this->end()) {
+ it = this->emplace(std::piecewise_construct, std::forward_as_tuple(std::move(key)), std::tuple<>()).first;
+ }
+ return it->second;
+ }
+
+ using base_type::insert;
+
+ template <typename P>
+ typename std::enable_if<std::is_constructible<value_type, P&&>::value,
+ std::pair<iterator, bool>>::type insert( P&& value )
+ {
+ return this->emplace(std::forward<P>(value));
+ }
+
+ template <typename P>
+ typename std::enable_if<std::is_constructible<value_type, P&&>::value,
+ iterator>::type insert( const_iterator hint, P&& value )
+ {
+ return this->emplace_hint(hint, std::forward<P>(value));
+ }
+
+ template<typename OtherCompare>
+ void merge(concurrent_map<key_type, mapped_type, OtherCompare, Allocator>& source) {
+ this->internal_merge(source);
+ }
+
+ template<typename OtherCompare>
+ void merge(concurrent_map<key_type, mapped_type, OtherCompare, Allocator>&& source) {
+ this->internal_merge(std::move(source));
+ }
+
+ template<typename OtherCompare>
+ void merge(concurrent_multimap<key_type, mapped_type, OtherCompare, Allocator>& source) {
+ this->internal_merge(source);
+ }
+
+ template<typename OtherCompare>
+ void merge(concurrent_multimap<key_type, mapped_type, OtherCompare, Allocator>&& source) {
+ this->internal_merge(std::move(source));
+ }
+}; // class concurrent_map
+
+#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
+
+template <typename It,
+ typename Comp = std::less<iterator_key_t<It>>,
+ typename Alloc = tbb::tbb_allocator<iterator_alloc_pair_t<It>>,
+ typename = std::enable_if_t<is_input_iterator_v<It>>,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>,
+ typename = std::enable_if_t<!is_allocator_v<Comp>>>
+concurrent_map( It, It, Comp = Comp(), Alloc = Alloc() )
+-> concurrent_map<iterator_key_t<It>, iterator_mapped_t<It>, Comp, Alloc>;
+
+template <typename Key, typename T,
+ typename Comp = std::less<std::remove_const_t<Key>>,
+ typename Alloc = tbb::tbb_allocator<std::pair<const Key, T>>,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>,
+ typename = std::enable_if_t<!is_allocator_v<Comp>>>
+concurrent_map( std::initializer_list<std::pair<Key, T>>, Comp = Comp(), Alloc = Alloc() )
+-> concurrent_map<std::remove_const_t<Key>, T, Comp, Alloc>;
+
+template <typename It, typename Alloc,
+ typename = std::enable_if_t<is_input_iterator_v<It>>,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>>
+concurrent_map( It, It, Alloc )
+-> concurrent_map<iterator_key_t<It>, iterator_mapped_t<It>,
+ std::less<iterator_key_t<It>>, Alloc>;
+
+template <typename Key, typename T, typename Alloc,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>>
+concurrent_map( std::initializer_list<std::pair<Key, T>>, Alloc )
+-> concurrent_map<std::remove_const_t<Key>, T, std::less<std::remove_const_t<Key>>, Alloc>;
+
+#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
+
+template <typename Key, typename Value, typename Compare, typename Allocator>
+void swap( concurrent_map<Key, Value, Compare, Allocator>& lhs,
+ concurrent_map<Key, Value, Compare, Allocator>& rhs )
+{
+ lhs.swap(rhs);
+}
+
+template <typename Key, typename Value, typename Compare = std::less<Key>, typename Allocator = tbb::tbb_allocator<std::pair<const Key, Value>>>
+class concurrent_multimap : public concurrent_skip_list<map_traits<Key, Value, Compare, concurrent_geometric_level_generator<32>, Allocator, true>> {
+ using base_type = concurrent_skip_list<map_traits<Key, Value, Compare, concurrent_geometric_level_generator<32>, Allocator, true>>;
+public:
+ using key_type = Key;
+ using mapped_type = Value;
+ using value_type = typename base_type::value_type;
+ using size_type = typename base_type::size_type;
+ using difference_type = typename base_type::difference_type;
+ using key_compare = Compare;
+ using value_compare = typename base_type::value_compare;
+ using allocator_type = Allocator;
+
+ using reference = typename base_type::reference;
+ using const_reference = typename base_type::const_reference;
+ using pointer = typename base_type::pointer;
+ using const_pointer = typename base_type::const_pointer;
+
+ using iterator = typename base_type::iterator;
+ using const_iterator = typename base_type::const_iterator;
+
+ using node_type = typename base_type::node_type;
+
+ // Include constructors of base_type
+ using base_type::base_type;
+ using base_type::insert;
+ using base_type::operator=;
+
+ // Required for implicit deduction guides
+ concurrent_multimap() = default;
+ concurrent_multimap( const concurrent_multimap& ) = default;
+ concurrent_multimap( const concurrent_multimap& other, const allocator_type& alloc ) : base_type(other, alloc) {}
+ concurrent_multimap( concurrent_multimap&& ) = default;
+ concurrent_multimap( concurrent_multimap&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {}
+ // Required to respect the rule of 5
+ concurrent_multimap& operator=( const concurrent_multimap& ) = default;
+ concurrent_multimap& operator=( concurrent_multimap&& ) = default;
+
+ template <typename P>
+ typename std::enable_if<std::is_constructible<value_type, P&&>::value,
+ std::pair<iterator, bool>>::type insert( P&& value )
+ {
+ return this->emplace(std::forward<P>(value));
+ }
+
+ template <typename P>
+ typename std::enable_if<std::is_constructible<value_type, P&&>::value,
+ iterator>::type insert( const_iterator hint, P&& value )
+ {
+ return this->emplace_hint(hint, std::forward<P>(value));
+ }
+
+ template<typename OtherCompare>
+ void merge(concurrent_multimap<key_type, mapped_type, OtherCompare, Allocator>& source) {
+ this->internal_merge(source);
+ }
+
+ template<typename OtherCompare>
+ void merge(concurrent_multimap<key_type, mapped_type, OtherCompare, Allocator>&& source) {
+ this->internal_merge(std::move(source));
+ }
+
+ template<typename OtherCompare>
+ void merge(concurrent_map<key_type, mapped_type, OtherCompare, Allocator>& source) {
+ this->internal_merge(source);
+ }
+
+ template<typename OtherCompare>
+ void merge(concurrent_map<key_type, mapped_type, OtherCompare, Allocator>&& source) {
+ this->internal_merge(std::move(source));
+ }
+}; // class concurrent_multimap
+
+#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
+
+template <typename It,
+ typename Comp = std::less<iterator_key_t<It>>,
+ typename Alloc = tbb::tbb_allocator<iterator_alloc_pair_t<It>>,
+ typename = std::enable_if_t<is_input_iterator_v<It>>,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>,
+ typename = std::enable_if_t<!is_allocator_v<Comp>>>
+concurrent_multimap( It, It, Comp = Comp(), Alloc = Alloc() )
+-> concurrent_multimap<iterator_key_t<It>, iterator_mapped_t<It>, Comp, Alloc>;
+
+template <typename Key, typename T,
+ typename Comp = std::less<std::remove_const_t<Key>>,
+ typename Alloc = tbb::tbb_allocator<std::pair<const Key, T>>,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>,
+ typename = std::enable_if_t<!is_allocator_v<Comp>>>
+concurrent_multimap( std::initializer_list<std::pair<Key, T>>, Comp = Comp(), Alloc = Alloc() )
+-> concurrent_multimap<std::remove_const_t<Key>, T, Comp, Alloc>;
+
+template <typename It, typename Alloc,
+ typename = std::enable_if_t<is_input_iterator_v<It>>,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>>
+concurrent_multimap( It, It, Alloc )
+-> concurrent_multimap<iterator_key_t<It>, iterator_mapped_t<It>,
+ std::less<iterator_key_t<It>>, Alloc>;
+
+template <typename Key, typename T, typename Alloc,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>>
+concurrent_multimap( std::initializer_list<std::pair<Key, T>>, Alloc )
+-> concurrent_multimap<std::remove_const_t<Key>, T, std::less<std::remove_const_t<Key>>, Alloc>;
+
+
+#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
+
+template <typename Key, typename Value, typename Compare, typename Allocator>
+void swap( concurrent_multimap<Key, Value, Compare, Allocator>& lhs,
+ concurrent_multimap<Key, Value, Compare, Allocator>& rhs )
+{
+ lhs.swap(rhs);
+}
+
+} // namespace d1
+} // namespace detail
+
+inline namespace v1 {
+
+using detail::d1::concurrent_map;
+using detail::d1::concurrent_multimap;
+using detail::split;
+
+} // inline namespace v1
+} // namespace tbb
+
+#endif // __TBB_concurrent_map_H
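A hedged usage sketch of the map interface defined above, assuming the containers behave like their std counterparts for the operations shown: operator[] emplaces a default-constructed value for a missing key, at() reports a missing key through throw_exception(exception_id::invalid_key), and merge() transfers only nodes whose keys are not already present in the target.

    #include <oneapi/tbb/concurrent_map.h>
    #include <string>

    int main() {
        tbb::concurrent_map<int, std::string> m;
        m[1] = "one";                        // emplaces a default-constructed string, then assigns
        m.insert({2, "two"});                // insert of a value_type

        tbb::concurrent_multimap<int, std::string> mm{{3, "three"}, {3, "tres"}};
        m.merge(mm);                         // one node with key 3 moves into m; the duplicate stays in mm

        return m.at(1) == "one" ? 0 : 1;     // at() throws for keys that are absent
    }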
diff --git a/contrib/libs/tbb/include/oneapi/tbb/concurrent_priority_queue.h b/contrib/libs/tbb/include/oneapi/tbb/concurrent_priority_queue.h
new file mode 100644
index 0000000000..a281740ad8
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/concurrent_priority_queue.h
@@ -0,0 +1,490 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_concurrent_priority_queue_H
+#define __TBB_concurrent_priority_queue_H
+
+#include "detail/_namespace_injection.h"
+#include "detail/_aggregator.h"
+#include "detail/_template_helpers.h"
+#include "detail/_allocator_traits.h"
+#include "detail/_range_common.h"
+#include "detail/_exception.h"
+#include "detail/_utils.h"
+#include "detail/_containers_helpers.h"
+#include "cache_aligned_allocator.h"
+#include <vector>
+#include <iterator>
+#include <functional>
+#include <utility>
+#include <initializer_list>
+#include <type_traits>
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+template <typename T, typename Compare = std::less<T>, typename Allocator = cache_aligned_allocator<T>>
+class concurrent_priority_queue {
+public:
+ using value_type = T;
+ using reference = T&;
+ using const_reference = const T&;
+
+ using size_type = std::size_t;
+ using difference_type = std::ptrdiff_t;
+
+ using allocator_type = Allocator;
+
+ concurrent_priority_queue() : concurrent_priority_queue(allocator_type{}) {}
+
+ explicit concurrent_priority_queue( const allocator_type& alloc )
+ : mark(0), my_size(0), my_compare(), data(alloc)
+ {
+ my_aggregator.initialize_handler(functor{this});
+ }
+
+ explicit concurrent_priority_queue( const Compare& compare, const allocator_type& alloc = allocator_type() )
+ : mark(0), my_size(0), my_compare(compare), data(alloc)
+ {
+ my_aggregator.initialize_handler(functor{this});
+ }
+
+ explicit concurrent_priority_queue( size_type init_capacity, const allocator_type& alloc = allocator_type() )
+ : mark(0), my_size(0), my_compare(), data(alloc)
+ {
+ data.reserve(init_capacity);
+ my_aggregator.initialize_handler(functor{this});
+ }
+
+ explicit concurrent_priority_queue( size_type init_capacity, const Compare& compare, const allocator_type& alloc = allocator_type() )
+ : mark(0), my_size(0), my_compare(compare), data(alloc)
+ {
+ data.reserve(init_capacity);
+ my_aggregator.initialize_handler(functor{this});
+ }
+
+ template <typename InputIterator>
+ concurrent_priority_queue( InputIterator begin, InputIterator end, const Compare& compare, const allocator_type& alloc = allocator_type() )
+ : mark(0), my_compare(compare), data(begin, end, alloc)
+ {
+ my_aggregator.initialize_handler(functor{this});
+ heapify();
+ my_size.store(data.size(), std::memory_order_relaxed);
+ }
+
+ template <typename InputIterator>
+ concurrent_priority_queue( InputIterator begin, InputIterator end, const allocator_type& alloc = allocator_type() )
+ : concurrent_priority_queue(begin, end, Compare(), alloc) {}
+
+ concurrent_priority_queue( std::initializer_list<value_type> init, const Compare& compare, const allocator_type& alloc = allocator_type() )
+ : concurrent_priority_queue(init.begin(), init.end(), compare, alloc) {}
+
+ concurrent_priority_queue( std::initializer_list<value_type> init, const allocator_type& alloc = allocator_type() )
+ : concurrent_priority_queue(init, Compare(), alloc) {}
+
+ concurrent_priority_queue( const concurrent_priority_queue& other )
+ : mark(other.mark), my_size(other.my_size.load(std::memory_order_relaxed)), my_compare(other.my_compare),
+ data(other.data)
+ {
+ my_aggregator.initialize_handler(functor{this});
+ }
+
+ concurrent_priority_queue( const concurrent_priority_queue& other, const allocator_type& alloc )
+ : mark(other.mark), my_size(other.my_size.load(std::memory_order_relaxed)), my_compare(other.my_compare),
+ data(other.data, alloc)
+ {
+ my_aggregator.initialize_handler(functor{this});
+ }
+
+ concurrent_priority_queue( concurrent_priority_queue&& other )
+ : mark(other.mark), my_size(other.my_size.load(std::memory_order_relaxed)), my_compare(other.my_compare),
+ data(std::move(other.data))
+ {
+ my_aggregator.initialize_handler(functor{this});
+ }
+
+ concurrent_priority_queue( concurrent_priority_queue&& other, const allocator_type& alloc )
+ : mark(other.mark), my_size(other.my_size.load(std::memory_order_relaxed)), my_compare(other.my_compare),
+ data(std::move(other.data), alloc)
+ {
+ my_aggregator.initialize_handler(functor{this});
+ }
+
+ concurrent_priority_queue& operator=( const concurrent_priority_queue& other ) {
+ if (this != &other) {
+ data = other.data;
+ mark = other.mark;
+ my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed);
+ }
+ return *this;
+ }
+
+ concurrent_priority_queue& operator=( concurrent_priority_queue&& other ) {
+ if (this != &other) {
+ // TODO: check if exceptions from std::vector::operator=(vector&&) should be handled separately
+ data = std::move(other.data);
+ mark = other.mark;
+ my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed);
+ }
+ return *this;
+ }
+
+ concurrent_priority_queue& operator=( std::initializer_list<value_type> init ) {
+ assign(init.begin(), init.end());
+ return *this;
+ }
+
+ template <typename InputIterator>
+ void assign( InputIterator begin, InputIterator end ) {
+ data.assign(begin, end);
+ mark = 0;
+ my_size.store(data.size(), std::memory_order_relaxed);
+ heapify();
+ }
+
+ void assign( std::initializer_list<value_type> init ) {
+ assign(init.begin(), init.end());
+ }
+
+    /* Returned value may not reflect results of pending operations.
+       This operation reads shared data without synchronization and may race with concurrent modifications. */
+    __TBB_nodiscard bool empty() const { return size() == 0; }
+
+    // Returns the current number of elements in the queue
+    /* Returned value may not reflect results of pending operations.
+       This operation reads shared data without synchronization and may race with concurrent modifications. */
+    size_type size() const { return my_size.load(std::memory_order_relaxed); }
+
+ /* This operation can be safely used concurrently with other push, try_pop or emplace operations. */
+ void push( const value_type& value ) {
+ cpq_operation op_data(value, PUSH_OP);
+ my_aggregator.execute(&op_data);
+ if (op_data.status == FAILED)
+ throw_exception(exception_id::bad_alloc);
+ }
+
+ /* This operation can be safely used concurrently with other push, try_pop or emplace operations. */
+ void push( value_type&& value ) {
+ cpq_operation op_data(value, PUSH_RVALUE_OP);
+ my_aggregator.execute(&op_data);
+ if (op_data.status == FAILED)
+ throw_exception(exception_id::bad_alloc);
+ }
+
+ /* This operation can be safely used concurrently with other push, try_pop or emplace operations. */
+ template <typename... Args>
+ void emplace( Args&&... args ) {
+        // TODO: support uses-allocator construction here
+ push(value_type(std::forward<Args>(args)...));
+ }
+
+    // Removes the highest priority element and moves it into value
+    /* If a highest priority element was found, sets value and returns true,
+       otherwise returns false.
+       This operation can be safely used concurrently with other push, try_pop or emplace operations. */
+ bool try_pop( value_type& value ) {
+ cpq_operation op_data(value, POP_OP);
+ my_aggregator.execute(&op_data);
+ return op_data.status == SUCCEEDED;
+ }
+
+ // This operation affects the whole container => it is not thread-safe
+ void clear() {
+ data.clear();
+ mark = 0;
+ my_size.store(0, std::memory_order_relaxed);
+ }
+
+ // This operation affects the whole container => it is not thread-safe
+ void swap( concurrent_priority_queue& other ) {
+ if (this != &other) {
+ using std::swap;
+ swap(data, other.data);
+ swap(mark, other.mark);
+
+ size_type sz = my_size.load(std::memory_order_relaxed);
+ my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed);
+ other.my_size.store(sz, std::memory_order_relaxed);
+ }
+ }
+
+ allocator_type get_allocator() const { return data.get_allocator(); }
+private:
+ enum operation_type {INVALID_OP, PUSH_OP, POP_OP, PUSH_RVALUE_OP};
+ enum operation_status {WAIT = 0, SUCCEEDED, FAILED};
+
+ class cpq_operation : public aggregated_operation<cpq_operation> {
+ public:
+ operation_type type;
+ union {
+ value_type* elem;
+ size_type sz;
+ };
+ cpq_operation( const value_type& value, operation_type t )
+ : type(t), elem(const_cast<value_type*>(&value)) {}
+ }; // class cpq_operation
+
+ class functor {
+ concurrent_priority_queue* my_cpq;
+ public:
+ functor() : my_cpq(nullptr) {}
+ functor( concurrent_priority_queue* cpq ) : my_cpq(cpq) {}
+
+ void operator()(cpq_operation* op_list) {
+ __TBB_ASSERT(my_cpq != nullptr, "Invalid functor");
+ my_cpq->handle_operations(op_list);
+ }
+ }; // class functor
+
+ void handle_operations( cpq_operation* op_list ) {
+ call_itt_notify(acquired, this);
+ cpq_operation* tmp, *pop_list = nullptr;
+ __TBB_ASSERT(mark == data.size(), NULL);
+
+        // First pass processes all constant-time (amortized; reallocation may happen) pushes and pops.
+ while(op_list) {
+ // ITT note: &(op_list->status) tag is used to cover accesses to op_list
+ // node. This thread is going to handle the operation, and so will acquire it
+ // and perform the associated operation w/o triggering a race condition; the
+ // thread that created the operation is waiting on the status field, so when
+ // this thread is done with the operation, it will perform a
+ // store_with_release to give control back to the waiting thread in
+ // aggregator::insert_operation.
+ // TODO: enable
+ call_itt_notify(acquired, &(op_list->status));
+ __TBB_ASSERT(op_list->type != INVALID_OP, NULL);
+
+ tmp = op_list;
+ op_list = op_list->next.load(std::memory_order_relaxed);
+ if (tmp->type == POP_OP) {
+ if (mark < data.size() &&
+ my_compare(data[0], data.back()))
+ {
+ // there are newly pushed elems and the last one is higher than top
+ *(tmp->elem) = std::move(data.back());
+ my_size.store(my_size.load(std::memory_order_relaxed) - 1, std::memory_order_relaxed);
+ tmp->status.store(uintptr_t(SUCCEEDED), std::memory_order_release);
+
+ data.pop_back();
+ __TBB_ASSERT(mark <= data.size(), NULL);
+ } else { // no convenient item to pop; postpone
+ tmp->next.store(pop_list, std::memory_order_relaxed);
+ pop_list = tmp;
+ }
+ } else { // PUSH_OP or PUSH_RVALUE_OP
+ __TBB_ASSERT(tmp->type == PUSH_OP || tmp->type == PUSH_RVALUE_OP, "Unknown operation");
+#if TBB_USE_EXCEPTIONS
+ try
+#endif
+ {
+ if (tmp->type == PUSH_OP) {
+ push_back_helper(*(tmp->elem));
+ } else {
+ data.push_back(std::move(*(tmp->elem)));
+ }
+ my_size.store(my_size.load(std::memory_order_relaxed) + 1, std::memory_order_relaxed);
+ tmp->status.store(uintptr_t(SUCCEEDED), std::memory_order_release);
+ }
+#if TBB_USE_EXCEPTIONS
+ catch(...) {
+ tmp->status.store(uintptr_t(FAILED), std::memory_order_release);
+ }
+#endif
+ }
+ }
+
+ // Second pass processes pop operations
+ while(pop_list) {
+ tmp = pop_list;
+ pop_list = pop_list->next.load(std::memory_order_relaxed);
+ __TBB_ASSERT(tmp->type == POP_OP, NULL);
+ if (data.empty()) {
+ tmp->status.store(uintptr_t(FAILED), std::memory_order_release);
+ } else {
+ __TBB_ASSERT(mark <= data.size(), NULL);
+ if (mark < data.size() &&
+ my_compare(data[0], data.back()))
+ {
+ // there are newly pushed elems and the last one is higher than top
+ *(tmp->elem) = std::move(data.back());
+ my_size.store(my_size.load(std::memory_order_relaxed) - 1, std::memory_order_relaxed);
+ tmp->status.store(uintptr_t(SUCCEEDED), std::memory_order_release);
+ data.pop_back();
+ } else { // extract top and push last element down heap
+ *(tmp->elem) = std::move(data[0]);
+ my_size.store(my_size.load(std::memory_order_relaxed) - 1, std::memory_order_relaxed);
+ tmp->status.store(uintptr_t(SUCCEEDED), std::memory_order_release);
+ reheap();
+ }
+ }
+ }
+
+ // heapify any leftover pushed elements before doing the next
+ // batch of operations
+ if (mark < data.size()) heapify();
+ __TBB_ASSERT(mark == data.size(), NULL);
+ call_itt_notify(releasing, this);
+ }
+
+ // Merge unsorted elements into heap
+ void heapify() {
+ if (!mark && data.size() > 0) mark = 1;
+ for (; mark < data.size(); ++mark) {
+ // for each unheapified element under size
+ size_type cur_pos = mark;
+ value_type to_place = std::move(data[mark]);
+ do { // push to_place up the heap
+ size_type parent = (cur_pos - 1) >> 1;
+ if (!my_compare(data[parent], to_place))
+ break;
+ data[cur_pos] = std::move(data[parent]);
+ cur_pos = parent;
+ } while(cur_pos);
+ data[cur_pos] = std::move(to_place);
+ }
+ }
+
+    // Re-heapify after an extraction by pushing the last element down the heap from the root.
+ void reheap() {
+ size_type cur_pos = 0, child = 1;
+
+ while(child < mark) {
+ size_type target = child;
+ if (child + 1 < mark && my_compare(data[child], data[child + 1]))
+ ++target;
+ // target now has the higher priority child
+ if (my_compare(data[target], data.back()))
+ break;
+ data[cur_pos] = std::move(data[target]);
+ cur_pos = target;
+ child = (cur_pos << 1) + 1;
+ }
+ if (cur_pos != data.size() - 1)
+ data[cur_pos] = std::move(data.back());
+ data.pop_back();
+ if (mark > data.size()) mark = data.size();
+ }
+
+ void push_back_helper( const T& value ) {
+ push_back_helper_impl(value, std::is_copy_constructible<T>{});
+ }
+
+ void push_back_helper_impl( const T& value, /*is_copy_constructible = */std::true_type ) {
+ data.push_back(value);
+ }
+
+ void push_back_helper_impl( const T&, /*is_copy_constructible = */std::false_type ) {
+ __TBB_ASSERT(false, "error: calling tbb::concurrent_priority_queue.push(const value_type&) for move-only type");
+ }
+
+ using aggregator_type = aggregator<functor, cpq_operation>;
+
+ aggregator_type my_aggregator;
+ // Padding added to avoid false sharing
+ char padding1[max_nfs_size - sizeof(aggregator_type)];
+ // The point at which unsorted elements begin
+ size_type mark;
+ std::atomic<size_type> my_size;
+ Compare my_compare;
+
+ // Padding added to avoid false sharing
+ char padding2[max_nfs_size - (2*sizeof(size_type)) - sizeof(Compare)];
+ //! Storage for the heap of elements in queue, plus unheapified elements
+ /** data has the following structure:
+
+ binary unheapified
+ heap elements
+ ____|_______|____
+ | | |
+ v v v
+ [_|...|_|_|...|_| |...| ]
+ 0 ^ ^ ^
+ | | |__capacity
+ | |__my_size
+ |__mark
+
+ Thus, data stores the binary heap starting at position 0 through
+ mark-1 (it may be empty). Then there are 0 or more elements
+ that have not yet been inserted into the heap, in positions
+ mark through my_size-1. */
+
+ using vector_type = std::vector<value_type, allocator_type>;
+ vector_type data;
+
+ friend bool operator==( const concurrent_priority_queue& lhs,
+ const concurrent_priority_queue& rhs )
+ {
+ return lhs.data == rhs.data;
+ }
+
+#if !__TBB_CPP20_COMPARISONS_PRESENT
+ friend bool operator!=( const concurrent_priority_queue& lhs,
+ const concurrent_priority_queue& rhs )
+ {
+ return !(lhs == rhs);
+ }
+#endif
+}; // class concurrent_priority_queue
+
+#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
+template <typename It,
+ typename Comp = std::less<iterator_value_t<It>>,
+ typename Alloc = tbb::cache_aligned_allocator<iterator_value_t<It>>,
+ typename = std::enable_if_t<is_input_iterator_v<It>>,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>,
+ typename = std::enable_if_t<!is_allocator_v<Comp>>>
+concurrent_priority_queue( It, It, Comp = Comp(), Alloc = Alloc() )
+-> concurrent_priority_queue<iterator_value_t<It>, Comp, Alloc>;
+
+template <typename It, typename Alloc,
+ typename = std::enable_if_t<is_input_iterator_v<It>>,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>>
+concurrent_priority_queue( It, It, Alloc )
+-> concurrent_priority_queue<iterator_value_t<It>, std::less<iterator_value_t<It>>, Alloc>;
+
+template <typename T,
+ typename Comp = std::less<T>,
+ typename Alloc = tbb::cache_aligned_allocator<T>,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>,
+ typename = std::enable_if_t<!is_allocator_v<Comp>>>
+concurrent_priority_queue( std::initializer_list<T>, Comp = Comp(), Alloc = Alloc() )
+-> concurrent_priority_queue<T, Comp, Alloc>;
+
+template <typename T, typename Alloc,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>>
+concurrent_priority_queue( std::initializer_list<T>, Alloc )
+-> concurrent_priority_queue<T, std::less<T>, Alloc>;
+
+#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
+
+template <typename T, typename Compare, typename Allocator>
+void swap( concurrent_priority_queue<T, Compare, Allocator>& lhs,
+ concurrent_priority_queue<T, Compare, Allocator>& rhs )
+{
+ lhs.swap(rhs);
+}
+
+} // namespace d1
+} // namespace detail
+inline namespace v1 {
+using detail::d1::concurrent_priority_queue;
+
+} // inline namespace v1
+} // namespace tbb
+
+#endif // __TBB_concurrent_priority_queue_H
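A short usage sketch of the queue above, illustrating the concurrency contract spelled out in its comments: push, try_pop and emplace may run concurrently, while size() and empty() are only snapshots. tbb::parallel_for is used here merely as a convenient way to generate concurrent pushes.

    #include <oneapi/tbb/concurrent_priority_queue.h>
    #include <oneapi/tbb/parallel_for.h>

    int main() {
        tbb::concurrent_priority_queue<int> q;                   // std::less<int>: larger values have higher priority
        tbb::parallel_for(0, 100, [&](int i) { q.push(i); });    // concurrent pushes are safe

        int top = -1;
        bool popped = q.try_pop(top);                            // all pushes are done, so this yields 99
        return (popped && top == 99) ? 0 : 1;
    }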
diff --git a/contrib/libs/tbb/include/oneapi/tbb/concurrent_queue.h b/contrib/libs/tbb/include/oneapi/tbb/concurrent_queue.h
new file mode 100644
index 0000000000..c8ae7afff7
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/concurrent_queue.h
@@ -0,0 +1,592 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_concurrent_queue_H
+#define __TBB_concurrent_queue_H
+
+#include "detail/_namespace_injection.h"
+#include "detail/_concurrent_queue_base.h"
+#include "detail/_allocator_traits.h"
+#include "detail/_exception.h"
+#include "detail/_containers_helpers.h"
+#include "cache_aligned_allocator.h"
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+// A high-performance, thread-safe, non-blocking concurrent queue.
+// Multiple threads may each push and pop concurrently.
+// Assignment is not allowed.
+template <typename T, typename Allocator = tbb::cache_aligned_allocator<T>>
+class concurrent_queue {
+ using allocator_traits_type = tbb::detail::allocator_traits<Allocator>;
+ using queue_representation_type = concurrent_queue_rep<T, Allocator>;
+ using queue_allocator_type = typename allocator_traits_type::template rebind_alloc<queue_representation_type>;
+ using queue_allocator_traits = tbb::detail::allocator_traits<queue_allocator_type>;
+public:
+ using size_type = std::size_t;
+ using value_type = T;
+ using reference = T&;
+ using const_reference = const T&;
+ using difference_type = std::ptrdiff_t;
+
+ using allocator_type = Allocator;
+ using pointer = typename allocator_traits_type::pointer;
+ using const_pointer = typename allocator_traits_type::const_pointer;
+
+ using iterator = concurrent_queue_iterator<concurrent_queue, T, Allocator>;
+ using const_iterator = concurrent_queue_iterator<concurrent_queue, const T, Allocator>;
+
+ concurrent_queue() : concurrent_queue(allocator_type()) {}
+
+ explicit concurrent_queue(const allocator_type& a) :
+ my_allocator(a), my_queue_representation(nullptr)
+ {
+ my_queue_representation = static_cast<queue_representation_type*>(r1::cache_aligned_allocate(sizeof(queue_representation_type)));
+ queue_allocator_traits::construct(my_allocator, my_queue_representation, my_allocator);
+
+ __TBB_ASSERT(is_aligned(my_queue_representation, max_nfs_size), "alignment error" );
+ __TBB_ASSERT(is_aligned(&my_queue_representation->head_counter, max_nfs_size), "alignment error" );
+ __TBB_ASSERT(is_aligned(&my_queue_representation->tail_counter, max_nfs_size), "alignment error" );
+ __TBB_ASSERT(is_aligned(&my_queue_representation->array, max_nfs_size), "alignment error" );
+ }
+
+ template <typename InputIterator>
+ concurrent_queue(InputIterator begin, InputIterator end, const allocator_type& a = allocator_type()) :
+ concurrent_queue(a)
+ {
+ for (; begin != end; ++begin)
+ push(*begin);
+ }
+
+ concurrent_queue(const concurrent_queue& src, const allocator_type& a) :
+ concurrent_queue(a)
+ {
+ my_queue_representation->assign(*src.my_queue_representation, copy_construct_item);
+ }
+
+ concurrent_queue(const concurrent_queue& src) :
+ concurrent_queue(queue_allocator_traits::select_on_container_copy_construction(src.get_allocator()))
+ {
+ my_queue_representation->assign(*src.my_queue_representation, copy_construct_item);
+ }
+
+ // Move constructors
+ concurrent_queue(concurrent_queue&& src) :
+ concurrent_queue(std::move(src.my_allocator))
+ {
+ internal_swap(src);
+ }
+
+ concurrent_queue(concurrent_queue&& src, const allocator_type& a) :
+ concurrent_queue(a)
+ {
+        // check whether memory allocated by one allocator instance can be deallocated
+        // with the other
+ if (my_allocator == src.my_allocator) {
+ internal_swap(src);
+ } else {
+ // allocators are different => performing per-element move
+ my_queue_representation->assign(*src.my_queue_representation, move_construct_item);
+ src.clear();
+ }
+ }
+
+ // Destroy queue
+ ~concurrent_queue() {
+ clear();
+ my_queue_representation->clear();
+ queue_allocator_traits::destroy(my_allocator, my_queue_representation);
+ r1::cache_aligned_deallocate(my_queue_representation);
+ }
+
+ // Enqueue an item at tail of queue.
+ void push(const T& value) {
+ internal_push(value);
+ }
+
+ void push(T&& value) {
+ internal_push(std::move(value));
+ }
+
+ template <typename... Args>
+ void emplace( Args&&... args ) {
+ internal_push(std::forward<Args>(args)...);
+ }
+
+ // Attempt to dequeue an item from head of queue.
+ /** Does not wait for item to become available.
+ Returns true if successful; false otherwise. */
+ bool try_pop( T& result ) {
+ return internal_try_pop(&result);
+ }
+
+ // Return the number of items in the queue; thread unsafe
+ size_type unsafe_size() const {
+ std::ptrdiff_t size = my_queue_representation->size();
+ return size < 0 ? 0 : size_type(size);
+ }
+
+ // Equivalent to size()==0.
+ __TBB_nodiscard bool empty() const {
+ return my_queue_representation->empty();
+ }
+
+    // Clear the queue; not thread-safe.
+ void clear() {
+ while (!empty()) {
+ T value;
+ try_pop(value);
+ }
+ }
+
+ // Return allocator object
+ allocator_type get_allocator() const { return my_allocator; }
+
+ //------------------------------------------------------------------------
+ // The iterators are intended only for debugging. They are slow and not thread safe.
+ //------------------------------------------------------------------------
+
+ iterator unsafe_begin() { return concurrent_queue_iterator_provider::get<iterator>(*this); }
+ iterator unsafe_end() { return iterator(); }
+ const_iterator unsafe_begin() const { return concurrent_queue_iterator_provider::get<const_iterator>(*this); }
+ const_iterator unsafe_end() const { return const_iterator(); }
+ const_iterator unsafe_cbegin() const { return concurrent_queue_iterator_provider::get<const_iterator>(*this); }
+ const_iterator unsafe_cend() const { return const_iterator(); }
+
+private:
+ void internal_swap(concurrent_queue& src) {
+ using std::swap;
+ swap(my_queue_representation, src.my_queue_representation);
+ }
+
+ template <typename... Args>
+ void internal_push( Args&&... args ) {
+ ticket_type k = my_queue_representation->tail_counter++;
+ my_queue_representation->choose(k).push(k, *my_queue_representation, std::forward<Args>(args)...);
+ }
+
+ bool internal_try_pop( void* dst ) {
+ ticket_type k;
+ do {
+ k = my_queue_representation->head_counter.load(std::memory_order_relaxed);
+ do {
+ if (static_cast<std::ptrdiff_t>(my_queue_representation->tail_counter.load(std::memory_order_relaxed) - k) <= 0) {
+ // Queue is empty
+ return false;
+ }
+
+                // Queue had an item with ticket k when we looked. Attempt to get that item;
+                // if another thread snatched it, the compare-exchange fails and we retry.
+ } while (!my_queue_representation->head_counter.compare_exchange_strong(k, k + 1));
+ } while (!my_queue_representation->choose(k).pop(dst, k, *my_queue_representation));
+ return true;
+ }
+
+ template <typename Container, typename Value, typename A>
+ friend class concurrent_queue_iterator;
+
+ static void copy_construct_item(T* location, const void* src) {
+ // TODO: use allocator_traits for copy construction
+ new (location) value_type(*static_cast<const value_type*>(src));
+ // queue_allocator_traits::construct(my_allocator, location, *static_cast<const T*>(src));
+ }
+
+ static void move_construct_item(T* location, const void* src) {
+ // TODO: use allocator_traits for move construction
+ new (location) value_type(std::move(*static_cast<value_type*>(const_cast<void*>(src))));
+ }
+
+ queue_allocator_type my_allocator;
+ queue_representation_type* my_queue_representation;
+}; // class concurrent_queue
+
+#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
+// Deduction guide for the constructor from two iterators
+template <typename It, typename Alloc = tbb::cache_aligned_allocator<iterator_value_t<It>>,
+ typename = std::enable_if_t<is_input_iterator_v<It>>,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>>
+concurrent_queue( It, It, Alloc = Alloc() )
+-> concurrent_queue<iterator_value_t<It>, Alloc>;
+
+#endif /* __TBB_CPP17_DEDUCTION_GUIDES_PRESENT */
+
+class concurrent_monitor;
+
+template <typename FuncType>
+class delegated_function : public delegate_base {
+public:
+ delegated_function(FuncType& f) : my_func(f) {}
+
+ bool operator()() const override {
+ return my_func();
+ }
+
+private:
+ FuncType &my_func;
+}; // class delegated_function
+
+// The concurrent monitor tags for concurrent_bounded_queue.
+static constexpr std::size_t cbq_slots_avail_tag = 0;
+static constexpr std::size_t cbq_items_avail_tag = 1;
+} // namespace d1
+
+
+namespace r1 {
+ class concurrent_monitor;
+
+ std::uint8_t* __TBB_EXPORTED_FUNC allocate_bounded_queue_rep( std::size_t queue_rep_size );
+ void __TBB_EXPORTED_FUNC deallocate_bounded_queue_rep( std::uint8_t* mem, std::size_t queue_rep_size );
+ void __TBB_EXPORTED_FUNC abort_bounded_queue_monitors( concurrent_monitor* monitors );
+ void __TBB_EXPORTED_FUNC notify_bounded_queue_monitor( concurrent_monitor* monitors, std::size_t monitor_tag
+ , std::size_t ticket );
+ void __TBB_EXPORTED_FUNC wait_bounded_queue_monitor( concurrent_monitor* monitors, std::size_t monitor_tag,
+ std::ptrdiff_t target, d1::delegate_base& predicate );
+} // namespace r1
+
+
+namespace d1 {
+// A high-performance, thread-safe, blocking concurrent bounded queue.
+// Supports boundedness and blocking semantics.
+// Multiple threads may each push and pop concurrently.
+// Assignment is not allowed.
+template <typename T, typename Allocator = tbb::cache_aligned_allocator<T>>
+class concurrent_bounded_queue {
+ using allocator_traits_type = tbb::detail::allocator_traits<Allocator>;
+ using queue_representation_type = concurrent_queue_rep<T, Allocator>;
+ using queue_allocator_type = typename allocator_traits_type::template rebind_alloc<queue_representation_type>;
+ using queue_allocator_traits = tbb::detail::allocator_traits<queue_allocator_type>;
+
+ template <typename FuncType>
+ void internal_wait(r1::concurrent_monitor* monitors, std::size_t monitor_tag, std::ptrdiff_t target, FuncType pred) {
+ delegated_function<FuncType> func(pred);
+ r1::wait_bounded_queue_monitor(monitors, monitor_tag, target, func);
+ }
+public:
+ using size_type = std::ptrdiff_t;
+ using value_type = T;
+ using reference = T&;
+ using const_reference = const T&;
+ using difference_type = std::ptrdiff_t;
+
+ using allocator_type = Allocator;
+ using pointer = typename allocator_traits_type::pointer;
+ using const_pointer = typename allocator_traits_type::const_pointer;
+
+ using iterator = concurrent_queue_iterator<concurrent_bounded_queue, T, Allocator>;
+    using const_iterator = concurrent_queue_iterator<concurrent_bounded_queue, const T, Allocator>;
+
+ concurrent_bounded_queue() : concurrent_bounded_queue(allocator_type()) {}
+
+ explicit concurrent_bounded_queue( const allocator_type& a ) :
+ my_allocator(a), my_capacity(0), my_abort_counter(0), my_queue_representation(nullptr)
+ {
+ my_queue_representation = reinterpret_cast<queue_representation_type*>(
+ r1::allocate_bounded_queue_rep(sizeof(queue_representation_type)));
+ my_monitors = reinterpret_cast<r1::concurrent_monitor*>(my_queue_representation + 1);
+ queue_allocator_traits::construct(my_allocator, my_queue_representation, my_allocator);
+ my_capacity = std::size_t(-1) / (queue_representation_type::item_size > 1 ? queue_representation_type::item_size : 2);
+
+ __TBB_ASSERT(is_aligned(my_queue_representation, max_nfs_size), "alignment error" );
+ __TBB_ASSERT(is_aligned(&my_queue_representation->head_counter, max_nfs_size), "alignment error" );
+ __TBB_ASSERT(is_aligned(&my_queue_representation->tail_counter, max_nfs_size), "alignment error" );
+ __TBB_ASSERT(is_aligned(&my_queue_representation->array, max_nfs_size), "alignment error" );
+ }
+
+ template <typename InputIterator>
+ concurrent_bounded_queue( InputIterator begin, InputIterator end, const allocator_type& a = allocator_type() ) :
+ concurrent_bounded_queue(a)
+ {
+ for (; begin != end; ++begin)
+ push(*begin);
+ }
+
+ concurrent_bounded_queue( const concurrent_bounded_queue& src, const allocator_type& a ) :
+ concurrent_bounded_queue(a)
+ {
+ my_queue_representation->assign(*src.my_queue_representation, copy_construct_item);
+ }
+
+ concurrent_bounded_queue( const concurrent_bounded_queue& src ) :
+ concurrent_bounded_queue(queue_allocator_traits::select_on_container_copy_construction(src.get_allocator()))
+ {
+ my_queue_representation->assign(*src.my_queue_representation, copy_construct_item);
+ }
+
+ // Move constructors
+ concurrent_bounded_queue( concurrent_bounded_queue&& src ) :
+ concurrent_bounded_queue(std::move(src.my_allocator))
+ {
+ internal_swap(src);
+ }
+
+ concurrent_bounded_queue( concurrent_bounded_queue&& src, const allocator_type& a ) :
+ concurrent_bounded_queue(a)
+ {
+        // check whether memory allocated by one allocator instance can be deallocated
+        // with the other
+ if (my_allocator == src.my_allocator) {
+ internal_swap(src);
+ } else {
+ // allocators are different => performing per-element move
+ my_queue_representation->assign(*src.my_queue_representation, move_construct_item);
+ src.clear();
+ }
+ }
+
+ // Destroy queue
+ ~concurrent_bounded_queue() {
+ clear();
+ my_queue_representation->clear();
+ queue_allocator_traits::destroy(my_allocator, my_queue_representation);
+ r1::deallocate_bounded_queue_rep(reinterpret_cast<std::uint8_t*>(my_queue_representation),
+ sizeof(queue_representation_type));
+ }
+
+ // Enqueue an item at tail of queue.
+ void push( const T& value ) {
+ internal_push(value);
+ }
+
+ void push( T&& value ) {
+ internal_push(std::move(value));
+ }
+
+ // Enqueue an item at tail of queue if queue is not already full.
+ // Does not wait for queue to become not full.
+ // Returns true if item is pushed; false if queue was already full.
+ bool try_push( const T& value ) {
+ return internal_push_if_not_full(value);
+ }
+
+ bool try_push( T&& value ) {
+ return internal_push_if_not_full(std::move(value));
+ }
+
+ template <typename... Args>
+ void emplace( Args&&... args ) {
+ internal_push(std::forward<Args>(args)...);
+ }
+
+ template <typename... Args>
+ bool try_emplace( Args&&... args ) {
+ return internal_push_if_not_full(std::forward<Args>(args)...);
+ }
+
+ // Attempt to dequeue an item from head of queue.
+ /** Does not wait for item to become available.
+ Returns true if successful; false otherwise. */
+ bool pop( T& result ) {
+ return internal_pop(&result);
+ }
+
+ bool try_pop( T& result ) {
+ return internal_pop_if_present(&result);
+ }
+
+ void abort() {
+ internal_abort();
+ }
+
+ // Return the number of items in the queue; thread unsafe
+ std::ptrdiff_t size() const {
+ return my_queue_representation->size();
+ }
+
+ void set_capacity( size_type new_capacity ) {
+ std::ptrdiff_t c = new_capacity < 0 ? infinite_capacity : new_capacity;
+ my_capacity = c;
+ }
+
+ size_type capacity() const {
+ return my_capacity;
+ }
+
+ // Equivalent to size()==0.
+ __TBB_nodiscard bool empty() const {
+ return my_queue_representation->empty();
+ }
+
+    // Clear the queue; not thread-safe.
+ void clear() {
+ while (!empty()) {
+ T value;
+ try_pop(value);
+ }
+ }
+
+ // Return allocator object
+ allocator_type get_allocator() const { return my_allocator; }
+
+ //------------------------------------------------------------------------
+ // The iterators are intended only for debugging. They are slow and not thread safe.
+ //------------------------------------------------------------------------
+
+ iterator unsafe_begin() { return concurrent_queue_iterator_provider::get<iterator>(*this); }
+ iterator unsafe_end() { return iterator(); }
+ const_iterator unsafe_begin() const { return concurrent_queue_iterator_provider::get<const_iterator>(*this); }
+ const_iterator unsafe_end() const { return const_iterator(); }
+ const_iterator unsafe_cbegin() const { return concurrent_queue_iterator_provider::get<const_iterator>(*this); }
+ const_iterator unsafe_cend() const { return const_iterator(); }
+
+private:
+ void internal_swap( concurrent_bounded_queue& src ) {
+ std::swap(my_queue_representation, src.my_queue_representation);
+ std::swap(my_monitors, src.my_monitors);
+ }
+
+ static constexpr std::ptrdiff_t infinite_capacity = std::ptrdiff_t(~size_type(0) / 2);
+
+ template <typename... Args>
+ void internal_push( Args&&... args ) {
+ unsigned old_abort_counter = my_abort_counter.load(std::memory_order_relaxed);
+ ticket_type ticket = my_queue_representation->tail_counter++;
+ std::ptrdiff_t target = ticket - my_capacity;
+
+ if (static_cast<std::ptrdiff_t>(my_queue_representation->head_counter.load(std::memory_order_relaxed)) <= target) { // queue is full
+ auto pred = [&] {
+ if (my_abort_counter.load(std::memory_order_relaxed) != old_abort_counter) {
+ throw_exception(exception_id::user_abort);
+ }
+
+ return static_cast<std::ptrdiff_t>(my_queue_representation->head_counter.load(std::memory_order_relaxed)) <= target;
+ };
+
+ try_call( [&] {
+ internal_wait(my_monitors, cbq_slots_avail_tag, target, pred);
+ }).on_exception( [&] {
+ my_queue_representation->choose(ticket).abort_push(ticket, *my_queue_representation);
+ });
+
+ }
+ __TBB_ASSERT((static_cast<std::ptrdiff_t>(my_queue_representation->head_counter.load(std::memory_order_relaxed)) > target), nullptr);
+ my_queue_representation->choose(ticket).push(ticket, *my_queue_representation, std::forward<Args>(args)...);
+ r1::notify_bounded_queue_monitor(my_monitors, cbq_items_avail_tag, ticket);
+ }
+
+ template <typename... Args>
+ bool internal_push_if_not_full( Args&&... args ) {
+ ticket_type ticket = my_queue_representation->tail_counter.load(std::memory_order_relaxed);
+ do {
+ if (static_cast<std::ptrdiff_t>(ticket - my_queue_representation->head_counter.load(std::memory_order_relaxed)) >= my_capacity) {
+ // Queue is full
+ return false;
+ }
+            // Queue had an empty slot for this ticket when we looked. Attempt to claim that slot;
+            // if another thread claimed it first, the compare-exchange fails and we retry.
+ } while (!my_queue_representation->tail_counter.compare_exchange_strong(ticket, ticket + 1));
+
+ my_queue_representation->choose(ticket).push(ticket, *my_queue_representation, std::forward<Args>(args)...);
+ r1::notify_bounded_queue_monitor(my_monitors, cbq_items_avail_tag, ticket);
+ return true;
+ }
+
+ bool internal_pop( void* dst ) {
+ std::ptrdiff_t target;
+ // This loop is a single pop operation; abort_counter should not be re-read inside
+ unsigned old_abort_counter = my_abort_counter.load(std::memory_order_relaxed);
+
+ do {
+ target = my_queue_representation->head_counter++;
+ if (static_cast<std::ptrdiff_t>(my_queue_representation->tail_counter.load(std::memory_order_relaxed)) <= target) {
+ auto pred = [&] {
+ if (my_abort_counter.load(std::memory_order_relaxed) != old_abort_counter) {
+ throw_exception(exception_id::user_abort);
+ }
+
+ return static_cast<std::ptrdiff_t>(my_queue_representation->tail_counter.load(std::memory_order_relaxed)) <= target;
+ };
+
+ try_call( [&] {
+ internal_wait(my_monitors, cbq_items_avail_tag, target, pred);
+ }).on_exception( [&] {
+ my_queue_representation->head_counter--;
+ });
+ }
+ __TBB_ASSERT(static_cast<std::ptrdiff_t>(my_queue_representation->tail_counter.load(std::memory_order_relaxed)) > target, nullptr);
+ } while (!my_queue_representation->choose(target).pop(dst, target, *my_queue_representation));
+
+ r1::notify_bounded_queue_monitor(my_monitors, cbq_slots_avail_tag, target);
+ return true;
+ }
+
+ bool internal_pop_if_present( void* dst ) {
+ ticket_type ticket;
+ do {
+ ticket = my_queue_representation->head_counter.load(std::memory_order_relaxed);
+ do {
+                if (static_cast<std::ptrdiff_t>(my_queue_representation->tail_counter.load(std::memory_order_relaxed) - ticket) <= 0) {
+                    // Queue is empty
+                    return false;
+                }
+                // Queue had an item for this ticket when we looked. Attempt to get that item;
+                // if another thread snatched it, the compare-exchange fails and we retry.
+ } while (!my_queue_representation->head_counter.compare_exchange_strong(ticket, ticket + 1));
+ } while (!my_queue_representation->choose(ticket).pop(dst, ticket, *my_queue_representation));
+
+ r1::notify_bounded_queue_monitor(my_monitors, cbq_slots_avail_tag, ticket);
+ return true;
+ }
+
+ void internal_abort() {
+ ++my_abort_counter;
+ r1::abort_bounded_queue_monitors(my_monitors);
+ }
+
+ static void copy_construct_item(T* location, const void* src) {
+ // TODO: use allocator_traits for copy construction
+ new (location) value_type(*static_cast<const value_type*>(src));
+ }
+
+ static void move_construct_item(T* location, const void* src) {
+ // TODO: use allocator_traits for move construction
+ new (location) value_type(std::move(*static_cast<value_type*>(const_cast<void*>(src))));
+ }
+
+ template <typename Container, typename Value, typename A>
+ friend class concurrent_queue_iterator;
+
+ queue_allocator_type my_allocator;
+ std::ptrdiff_t my_capacity;
+ std::atomic<unsigned> my_abort_counter;
+ queue_representation_type* my_queue_representation;
+
+ r1::concurrent_monitor* my_monitors;
+}; // class concurrent_bounded_queue
+
+#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
+// Deduction guide for the constructor from two iterators
+template <typename It, typename Alloc = tbb::cache_aligned_allocator<iterator_value_t<It>>>
+concurrent_bounded_queue( It, It, Alloc = Alloc() )
+-> concurrent_bounded_queue<iterator_value_t<It>, Alloc>;
+
+#endif /* __TBB_CPP17_DEDUCTION_GUIDES_PRESENT */
+
+} // namespace d1
+} // namespace detail
+
+inline namespace v1 {
+
+using detail::d1::concurrent_queue;
+using detail::d1::concurrent_bounded_queue;
+using detail::r1::user_abort;
+using detail::r1::bad_last_alloc;
+
+} // inline namespace v1
+} // namespace tbb
+
+#endif // __TBB_concurrent_queue_H
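A brief sketch of how the two queues above differ in practice: concurrent_queue::try_pop never blocks, while concurrent_bounded_queue::push blocks once capacity is reached and pop blocks until an item arrives (or abort() makes the waiters throw user_abort).

    #include <oneapi/tbb/concurrent_queue.h>
    #include <thread>

    int main() {
        tbb::concurrent_queue<int> q;                // unbounded, non-blocking interface
        q.push(1);
        int x = 0;
        bool got = q.try_pop(x);                     // returns false instead of waiting when empty

        tbb::concurrent_bounded_queue<int> bq;
        bq.set_capacity(4);                          // push() now blocks while 4 items are unconsumed
        std::thread producer([&] { for (int i = 0; i < 8; ++i) bq.push(i); });
        int y = 0;
        for (int i = 0; i < 8; ++i) bq.pop(y);       // blocking pops drain the producer
        producer.join();
        return (got && x == 1 && y == 7) ? 0 : 1;    // single producer/consumer preserves FIFO order
    }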
diff --git a/contrib/libs/tbb/include/oneapi/tbb/concurrent_set.h b/contrib/libs/tbb/include/oneapi/tbb/concurrent_set.h
new file mode 100644
index 0000000000..c68fa6c362
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/concurrent_set.h
@@ -0,0 +1,259 @@
+/*
+ Copyright (c) 2019-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_concurrent_set_H
+#define __TBB_concurrent_set_H
+
+#include "detail/_namespace_injection.h"
+#include "detail/_concurrent_skip_list.h"
+#include "tbb_allocator.h"
+#include <functional>
+#include <utility>
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+template<typename Key, typename KeyCompare, typename RandomGenerator, typename Allocator, bool AllowMultimapping>
+struct set_traits {
+ static constexpr std::size_t max_level = RandomGenerator::max_level;
+ using random_level_generator_type = RandomGenerator;
+ using key_type = Key;
+ using value_type = key_type;
+ using compare_type = KeyCompare;
+ using value_compare = compare_type;
+ using reference = value_type&;
+ using const_reference = const value_type&;
+ using allocator_type = Allocator;
+
+ static constexpr bool allow_multimapping = AllowMultimapping;
+
+ static const key_type& get_key(const_reference val) {
+ return val;
+ }
+
+ static value_compare value_comp(compare_type comp) { return comp; }
+}; // struct set_traits
+
+template <typename Key, typename Compare, typename Allocator>
+class concurrent_multiset;
+
+template <typename Key, typename Compare = std::less<Key>, typename Allocator = tbb::tbb_allocator<Key>>
+class concurrent_set : public concurrent_skip_list<set_traits<Key, Compare, concurrent_geometric_level_generator<32>, Allocator, false>> {
+ using base_type = concurrent_skip_list<set_traits<Key, Compare, concurrent_geometric_level_generator<32>, Allocator, false>>;
+public:
+ using key_type = Key;
+ using value_type = typename base_type::value_type;
+ using size_type = typename base_type::size_type;
+ using difference_type = typename base_type::difference_type;
+ using key_compare = Compare;
+ using value_compare = typename base_type::value_compare;
+ using allocator_type = Allocator;
+
+ using reference = typename base_type::reference;
+ using const_reference = typename base_type::const_reference;
+ using pointer = typename base_type::pointer;
+ using const_pointer = typename base_type::const_pointer;
+
+ using iterator = typename base_type::iterator;
+ using const_iterator = typename base_type::const_iterator;
+
+ using node_type = typename base_type::node_type;
+
+ // Include constructors of base_type
+ using base_type::base_type;
+ using base_type::operator=;
+
+ // Required for implicit deduction guides
+ concurrent_set() = default;
+ concurrent_set( const concurrent_set& ) = default;
+ concurrent_set( const concurrent_set& other, const allocator_type& alloc ) : base_type(other, alloc) {}
+ concurrent_set( concurrent_set&& ) = default;
+ concurrent_set( concurrent_set&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {}
+ // Required to respect the rule of 5
+ concurrent_set& operator=( const concurrent_set& ) = default;
+ concurrent_set& operator=( concurrent_set&& ) = default;
+
+ template<typename OtherCompare>
+ void merge(concurrent_set<key_type, OtherCompare, Allocator>& source) {
+ this->internal_merge(source);
+ }
+
+ template<typename OtherCompare>
+ void merge(concurrent_set<key_type, OtherCompare, Allocator>&& source) {
+ this->internal_merge(std::move(source));
+ }
+
+ template<typename OtherCompare>
+ void merge(concurrent_multiset<key_type, OtherCompare, Allocator>& source) {
+ this->internal_merge(source);
+ }
+
+ template<typename OtherCompare>
+ void merge(concurrent_multiset<key_type, OtherCompare, Allocator>&& source) {
+ this->internal_merge(std::move(source));
+ }
+}; // class concurrent_set
+
+#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
+
+template <typename It,
+ typename Comp = std::less<iterator_value_t<It>>,
+ typename Alloc = tbb::tbb_allocator<iterator_value_t<It>>,
+ typename = std::enable_if_t<is_input_iterator_v<It>>,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>,
+ typename = std::enable_if_t<!is_allocator_v<Comp>>>
+concurrent_set( It, It, Comp = Comp(), Alloc = Alloc() )
+-> concurrent_set<iterator_value_t<It>, Comp, Alloc>;
+
+template <typename Key,
+ typename Comp = std::less<Key>,
+ typename Alloc = tbb::tbb_allocator<Key>,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>,
+ typename = std::enable_if_t<!is_allocator_v<Comp>>>
+concurrent_set( std::initializer_list<Key>, Comp = Comp(), Alloc = Alloc() )
+-> concurrent_set<Key, Comp, Alloc>;
+
+template <typename It, typename Alloc,
+ typename = std::enable_if_t<is_input_iterator_v<It>>,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>>
+concurrent_set( It, It, Alloc )
+-> concurrent_set<iterator_value_t<It>,
+ std::less<iterator_value_t<It>>, Alloc>;
+
+template <typename Key, typename Alloc,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>>
+concurrent_set( std::initializer_list<Key>, Alloc )
+-> concurrent_set<Key, std::less<Key>, Alloc>;
+
+#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
+
+template <typename Key, typename Compare, typename Allocator>
+void swap( concurrent_set<Key, Compare, Allocator>& lhs,
+ concurrent_set<Key, Compare, Allocator>& rhs )
+{
+ lhs.swap(rhs);
+}
+
+template <typename Key, typename Compare = std::less<Key>, typename Allocator = tbb::tbb_allocator<Key>>
+class concurrent_multiset : public concurrent_skip_list<set_traits<Key, Compare, concurrent_geometric_level_generator<32>, Allocator, true>> {
+ using base_type = concurrent_skip_list<set_traits<Key, Compare, concurrent_geometric_level_generator<32>, Allocator, true>>;
+public:
+ using key_type = Key;
+ using value_type = typename base_type::value_type;
+ using size_type = typename base_type::size_type;
+ using difference_type = typename base_type::difference_type;
+ using key_compare = Compare;
+ using value_compare = typename base_type::value_compare;
+ using allocator_type = Allocator;
+
+ using reference = typename base_type::reference;
+ using const_reference = typename base_type::const_reference;
+ using pointer = typename base_type::pointer;
+ using const_pointer = typename base_type::const_pointer;
+
+ using iterator = typename base_type::iterator;
+ using const_iterator = typename base_type::const_iterator;
+
+ using node_type = typename base_type::node_type;
+
+    // Include constructors of base_type
+ using base_type::base_type;
+ using base_type::operator=;
+
+ // Required for implicit deduction guides
+ concurrent_multiset() = default;
+ concurrent_multiset( const concurrent_multiset& ) = default;
+ concurrent_multiset( const concurrent_multiset& other, const allocator_type& alloc ) : base_type(other, alloc) {}
+ concurrent_multiset( concurrent_multiset&& ) = default;
+ concurrent_multiset( concurrent_multiset&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {}
+ // Required to respect the rule of 5
+ concurrent_multiset& operator=( const concurrent_multiset& ) = default;
+ concurrent_multiset& operator=( concurrent_multiset&& ) = default;
+
+ template<typename OtherCompare>
+ void merge(concurrent_set<key_type, OtherCompare, Allocator>& source) {
+ this->internal_merge(source);
+ }
+
+ template<typename OtherCompare>
+ void merge(concurrent_set<key_type, OtherCompare, Allocator>&& source) {
+ this->internal_merge(std::move(source));
+ }
+
+ template<typename OtherCompare>
+ void merge(concurrent_multiset<key_type, OtherCompare, Allocator>& source) {
+ this->internal_merge(source);
+ }
+
+ template<typename OtherCompare>
+ void merge(concurrent_multiset<key_type, OtherCompare, Allocator>&& source) {
+ this->internal_merge(std::move(source));
+ }
+}; // class concurrent_multiset
+
+#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
+
+template <typename It,
+ typename Comp = std::less<iterator_value_t<It>>,
+ typename Alloc = tbb::tbb_allocator<iterator_value_t<It>>,
+ typename = std::enable_if_t<is_input_iterator_v<It>>,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>,
+ typename = std::enable_if_t<!is_allocator_v<Comp>>>
+concurrent_multiset( It, It, Comp = Comp(), Alloc = Alloc() )
+-> concurrent_multiset<iterator_value_t<It>, Comp, Alloc>;
+
+template <typename Key,
+ typename Comp = std::less<Key>,
+ typename Alloc = tbb::tbb_allocator<Key>,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>,
+ typename = std::enable_if_t<!is_allocator_v<Comp>>>
+concurrent_multiset( std::initializer_list<Key>, Comp = Comp(), Alloc = Alloc() )
+-> concurrent_multiset<Key, Comp, Alloc>;
+
+template <typename It, typename Alloc,
+ typename = std::enable_if_t<is_input_iterator_v<It>>,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>>
+concurrent_multiset( It, It, Alloc )
+-> concurrent_multiset<iterator_value_t<It>, std::less<iterator_value_t<It>>, Alloc>;
+
+template <typename Key, typename Alloc,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>>
+concurrent_multiset( std::initializer_list<Key>, Alloc )
+-> concurrent_multiset<Key, std::less<Key>, Alloc>;
+
+#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
+
+template <typename Key, typename Compare, typename Allocator>
+void swap( concurrent_multiset<Key, Compare, Allocator>& lhs,
+ concurrent_multiset<Key, Compare, Allocator>& rhs )
+{
+ lhs.swap(rhs);
+}
+
+} // namespace d1
+} // namespace detail
+
+inline namespace v1 {
+
+using detail::d1::concurrent_set;
+using detail::d1::concurrent_multiset;
+using detail::split;
+
+} // inline namespace v1
+} // namespace tbb
+
+#endif // __TBB_concurrent_set_H
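For context, a minimal usage sketch of the concurrent_set / concurrent_multiset interface declared above (deduction guides, merge, ordered iteration); it assumes a C++17 build against a stock oneTBB installation:

    #include <oneapi/tbb/concurrent_set.h>
    #include <iostream>

    int main() {
        tbb::concurrent_set cs{3, 1, 2};            // CTAD via the guides above: concurrent_set<int>

        tbb::concurrent_multiset<int> cms;
        cms.insert(2);                              // duplicates are allowed in the multiset
        cms.insert(2);
        cms.insert(4);

        cs.merge(cms);                              // moves nodes whose keys are not yet in cs

        for (int v : cs) std::cout << v << ' ';     // ordered output: 1 2 3 4
        std::cout << '\n';
    }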
diff --git a/contrib/libs/tbb/include/oneapi/tbb/concurrent_unordered_map.h b/contrib/libs/tbb/include/oneapi/tbb/concurrent_unordered_map.h
new file mode 100644
index 0000000000..0c9c2cd79c
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/concurrent_unordered_map.h
@@ -0,0 +1,387 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_concurrent_unordered_map_H
+#define __TBB_concurrent_unordered_map_H
+
+#include "detail/_namespace_injection.h"
+#include "detail/_concurrent_unordered_base.h"
+#include "tbb_allocator.h"
+#include <functional>
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+template <typename Key, typename T, typename Hash, typename KeyEqual, typename Allocator, bool AllowMultimapping>
+struct concurrent_unordered_map_traits {
+ using value_type = std::pair<const Key, T>;
+ using key_type = Key;
+ using allocator_type = Allocator;
+ using hash_compare_type = hash_compare<Key, Hash, KeyEqual>;
+ static constexpr bool allow_multimapping = AllowMultimapping;
+
+ static constexpr const key_type& get_key( const value_type& value ) {
+ return value.first;
+ }
+}; // struct concurrent_unordered_map_traits
+
+template <typename Key, typename T, typename Hash, typename KeyEqual, typename Allocator>
+class concurrent_unordered_multimap;
+
+template <typename Key, typename T, typename Hash = std::hash<Key>, typename KeyEqual = std::equal_to<Key>,
+ typename Allocator = tbb::tbb_allocator<std::pair<const Key, T>> >
+class concurrent_unordered_map
+ : public concurrent_unordered_base<concurrent_unordered_map_traits<Key, T, Hash, KeyEqual, Allocator, false>>
+{
+ using traits_type = concurrent_unordered_map_traits<Key, T, Hash, KeyEqual, Allocator, false>;
+ using base_type = concurrent_unordered_base<traits_type>;
+public:
+ using key_type = typename base_type::key_type;
+ using mapped_type = T;
+ using value_type = typename base_type::value_type;
+ using size_type = typename base_type::size_type;
+ using difference_type = typename base_type::difference_type;
+ using hasher = typename base_type::hasher;
+ using key_equal = typename base_type::key_equal;
+ using allocator_type = typename base_type::allocator_type;
+ using reference = typename base_type::reference;
+ using const_reference = typename base_type::const_reference;
+ using pointer = typename base_type::pointer;
+ using const_pointer = typename base_type::const_pointer;
+ using iterator = typename base_type::iterator;
+ using const_iterator = typename base_type::const_iterator;
+ using local_iterator = typename base_type::local_iterator;
+ using const_local_iterator = typename base_type::const_local_iterator;
+ using node_type = typename base_type::node_type;
+
+ // Include constructors of base type
+ using base_type::base_type;
+ using base_type::operator=;
+
+ // Required for implicit deduction guides
+ concurrent_unordered_map() = default;
+ concurrent_unordered_map( const concurrent_unordered_map& ) = default;
+ concurrent_unordered_map( const concurrent_unordered_map& other, const allocator_type& alloc ) : base_type(other, alloc) {}
+ concurrent_unordered_map( concurrent_unordered_map&& ) = default;
+ concurrent_unordered_map( concurrent_unordered_map&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {}
+ // Required to respect the rule of 5
+ concurrent_unordered_map& operator=( const concurrent_unordered_map& ) = default;
+ concurrent_unordered_map& operator=( concurrent_unordered_map&& ) = default;
+
+    // Element access
+ mapped_type& operator[]( const key_type& key ) {
+ iterator where = this->find(key);
+
+ if (where == this->end()) {
+ where = this->emplace(std::piecewise_construct, std::forward_as_tuple(key), std::tuple<>()).first;
+ }
+ return where->second;
+ }
+
+ mapped_type& operator[]( key_type&& key ) {
+ iterator where = this->find(key);
+
+ if (where == this->end()) {
+ where = this->emplace(std::piecewise_construct, std::forward_as_tuple(std::move(key)), std::tuple<>()).first;
+ }
+ return where->second;
+ }
+
+ mapped_type& at( const key_type& key ) {
+ iterator where = this->find(key);
+
+ if (where == this->end()) {
+ throw_exception(exception_id::invalid_key);
+ }
+ return where->second;
+ }
+
+ const mapped_type& at( const key_type& key ) const {
+ const_iterator where = this->find(key);
+
+ if (where == this->end()) {
+ throw_exception(exception_id::out_of_range);
+ }
+ return where->second;
+ }
+
+ using base_type::insert;
+
+ template<typename P>
+ typename std::enable_if<std::is_constructible<value_type, P&&>::value,
+ std::pair<iterator, bool>>::type insert( P&& value ) {
+ return this->emplace(std::forward<P>(value));
+ }
+
+ template<typename P>
+ typename std::enable_if<std::is_constructible<value_type, P&&>::value,
+ iterator>::type insert( const_iterator hint, P&& value ) {
+ return this->emplace_hint(hint, std::forward<P>(value));
+ }
+
+ template <typename OtherHash, typename OtherKeyEqual>
+ void merge( concurrent_unordered_map<key_type, mapped_type, OtherHash, OtherKeyEqual, allocator_type>& source ) {
+ this->internal_merge(source);
+ }
+
+ template <typename OtherHash, typename OtherKeyEqual>
+ void merge( concurrent_unordered_map<key_type, mapped_type, OtherHash, OtherKeyEqual, allocator_type>&& source ) {
+ this->internal_merge(std::move(source));
+ }
+
+ template <typename OtherHash, typename OtherKeyEqual>
+ void merge( concurrent_unordered_multimap<key_type, mapped_type, OtherHash, OtherKeyEqual, allocator_type>& source ) {
+ this->internal_merge(source);
+ }
+
+ template <typename OtherHash, typename OtherKeyEqual>
+ void merge( concurrent_unordered_multimap<key_type, mapped_type, OtherHash, OtherKeyEqual, allocator_type>&& source ) {
+ this->internal_merge(std::move(source));
+ }
+}; // class concurrent_unordered_map
+
+#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
+template <typename It,
+ typename Hash = std::hash<iterator_key_t<It>>,
+ typename KeyEq = std::equal_to<iterator_key_t<It>>,
+ typename Alloc = tbb::tbb_allocator<iterator_alloc_pair_t<It>>,
+ typename = std::enable_if_t<is_input_iterator_v<It>>,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>,
+ typename = std::enable_if_t<!is_allocator_v<Hash>>,
+ typename = std::enable_if_t<!is_allocator_v<KeyEq>>,
+ typename = std::enable_if_t<!std::is_integral_v<Hash>>>
+concurrent_unordered_map( It, It, std::size_t = {},
+ Hash = Hash(), KeyEq = KeyEq(), Alloc = Alloc() )
+-> concurrent_unordered_map<iterator_key_t<It>, iterator_mapped_t<It>, Hash, KeyEq, Alloc>;
+
+template <typename Key, typename T,
+ typename Hash = std::hash<std::remove_const_t<Key>>,
+ typename KeyEq = std::equal_to<std::remove_const_t<Key>>,
+ typename Alloc = tbb::tbb_allocator<std::pair<const Key, T>>,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>,
+ typename = std::enable_if_t<!is_allocator_v<Hash>>,
+ typename = std::enable_if_t<!is_allocator_v<KeyEq>>,
+ typename = std::enable_if_t<!std::is_integral_v<Hash>>>
+concurrent_unordered_map( std::initializer_list<std::pair<Key, T>>, std::size_t = {},
+ Hash = Hash(), KeyEq = KeyEq(), Alloc = Alloc() )
+-> concurrent_unordered_map<std::remove_const_t<Key>, T, Hash, KeyEq, Alloc>;
+
+template <typename It, typename Alloc,
+ typename = std::enable_if_t<is_input_iterator_v<It>>,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>>
+concurrent_unordered_map( It, It, std::size_t, Alloc )
+-> concurrent_unordered_map<iterator_key_t<It>, iterator_mapped_t<It>,
+ std::hash<iterator_key_t<It>>,
+ std::equal_to<iterator_key_t<It>>, Alloc>;
+
+// TODO: investigate if a deduction guide for concurrent_unordered_map(It, It, Alloc) is needed
+
+template <typename It, typename Hash, typename Alloc,
+ typename = std::enable_if_t<is_input_iterator_v<It>>,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>,
+ typename = std::enable_if_t<!is_allocator_v<Hash>>,
+ typename = std::enable_if_t<!std::is_integral_v<Hash>>>
+concurrent_unordered_map( It, It, std::size_t, Hash, Alloc )
+-> concurrent_unordered_map<iterator_key_t<It>, iterator_mapped_t<It>,
+ Hash, std::equal_to<iterator_key_t<It>>, Alloc>;
+
+template <typename Key, typename T, typename Alloc,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>>
+concurrent_unordered_map( std::initializer_list<std::pair<Key, T>>, std::size_t, Alloc )
+-> concurrent_unordered_map<std::remove_const_t<Key>, T, std::hash<std::remove_const_t<Key>>,
+ std::equal_to<std::remove_const_t<Key>>, Alloc>;
+
+template <typename Key, typename T, typename Alloc,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>>
+concurrent_unordered_map( std::initializer_list<std::pair<Key, T>>, Alloc )
+-> concurrent_unordered_map<std::remove_const_t<Key>, T, std::hash<std::remove_const_t<Key>>,
+ std::equal_to<std::remove_const_t<Key>>, Alloc>;
+
+template <typename Key, typename T, typename Hash, typename Alloc,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>,
+ typename = std::enable_if_t<!is_allocator_v<Hash>>,
+ typename = std::enable_if_t<!std::is_integral_v<Hash>>>
+concurrent_unordered_map( std::initializer_list<std::pair<Key, T>>, std::size_t, Hash, Alloc )
+-> concurrent_unordered_map<std::remove_const_t<Key>, T, Hash,
+ std::equal_to<std::remove_const_t<Key>>, Alloc>;
+
+#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
+
+template <typename Key, typename T, typename Hash, typename KeyEqual, typename Allocator>
+void swap( concurrent_unordered_map<Key, T, Hash, KeyEqual, Allocator>& lhs,
+ concurrent_unordered_map<Key, T, Hash, KeyEqual, Allocator>& rhs ) {
+ lhs.swap(rhs);
+}
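As a quick illustration of the map interface defined above (operator[], at and the insert/emplace overloads), a minimal single-threaded sketch, assuming a stock oneTBB installation:

    #include <oneapi/tbb/concurrent_unordered_map.h>
    #include <iostream>
    #include <string>

    int main() {
        tbb::concurrent_unordered_map<std::string, int> counts;

        counts["apple"] = 3;                     // default-constructs the mapped value, then assigns
        counts.insert({"pear", 5});              // insert from a value convertible to value_type
        counts.emplace("plum", 7);

        std::cout << counts.at("pear") << '\n';  // prints 5; at() throws for a missing key
    }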
+
+template <typename Key, typename T, typename Hash = std::hash<Key>, typename KeyEqual = std::equal_to<Key>,
+ typename Allocator = tbb::tbb_allocator<std::pair<const Key, T>> >
+class concurrent_unordered_multimap
+ : public concurrent_unordered_base<concurrent_unordered_map_traits<Key, T, Hash, KeyEqual, Allocator, true>>
+{
+ using traits_type = concurrent_unordered_map_traits<Key, T, Hash, KeyEqual, Allocator, true>;
+ using base_type = concurrent_unordered_base<traits_type>;
+public:
+ using key_type = typename base_type::key_type;
+ using mapped_type = T;
+ using value_type = typename base_type::value_type;
+ using size_type = typename base_type::size_type;
+ using difference_type = typename base_type::difference_type;
+ using hasher = typename base_type::hasher;
+ using key_equal = typename base_type::key_equal;
+ using allocator_type = typename base_type::allocator_type;
+ using reference = typename base_type::reference;
+ using const_reference = typename base_type::const_reference;
+ using pointer = typename base_type::pointer;
+ using const_pointer = typename base_type::const_pointer;
+ using iterator = typename base_type::iterator;
+ using const_iterator = typename base_type::const_iterator;
+ using local_iterator = typename base_type::local_iterator;
+ using const_local_iterator = typename base_type::const_local_iterator;
+ using node_type = typename base_type::node_type;
+
+ // Include constructors of base type
+ using base_type::base_type;
+ using base_type::operator=;
+ using base_type::insert;
+
+ // Required for implicit deduction guides
+ concurrent_unordered_multimap() = default;
+ concurrent_unordered_multimap( const concurrent_unordered_multimap& ) = default;
+ concurrent_unordered_multimap( const concurrent_unordered_multimap& other, const allocator_type& alloc ) : base_type(other, alloc) {}
+ concurrent_unordered_multimap( concurrent_unordered_multimap&& ) = default;
+ concurrent_unordered_multimap( concurrent_unordered_multimap&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {}
+ // Required to respect the rule of 5
+ concurrent_unordered_multimap& operator=( const concurrent_unordered_multimap& ) = default;
+ concurrent_unordered_multimap& operator=( concurrent_unordered_multimap&& ) = default;
+
+ template <typename P>
+ typename std::enable_if<std::is_constructible<value_type, P&&>::value,
+ std::pair<iterator, bool>>::type insert( P&& value ) {
+ return this->emplace(std::forward<P>(value));
+ }
+
+ template<typename P>
+ typename std::enable_if<std::is_constructible<value_type, P&&>::value,
+ iterator>::type insert( const_iterator hint, P&& value ) {
+        return this->emplace_hint(hint, std::forward<P>(value));
+ }
+
+ template <typename OtherHash, typename OtherKeyEqual>
+ void merge( concurrent_unordered_map<key_type, mapped_type, OtherHash, OtherKeyEqual, allocator_type>& source ) {
+ this->internal_merge(source);
+ }
+
+ template <typename OtherHash, typename OtherKeyEqual>
+ void merge( concurrent_unordered_map<key_type, mapped_type, OtherHash, OtherKeyEqual, allocator_type>&& source ) {
+ this->internal_merge(std::move(source));
+ }
+
+ template <typename OtherHash, typename OtherKeyEqual>
+ void merge( concurrent_unordered_multimap<key_type, mapped_type, OtherHash, OtherKeyEqual, allocator_type>& source ) {
+ this->internal_merge(source);
+ }
+
+ template <typename OtherHash, typename OtherKeyEqual>
+ void merge( concurrent_unordered_multimap<key_type, mapped_type, OtherHash, OtherKeyEqual, allocator_type>&& source ) {
+ this->internal_merge(std::move(source));
+ }
+}; // class concurrent_unordered_multimap
+
+#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
+
+template <typename It,
+ typename Hash = std::hash<iterator_key_t<It>>,
+ typename KeyEq = std::equal_to<iterator_key_t<It>>,
+ typename Alloc = tbb::tbb_allocator<iterator_alloc_pair_t<It>>,
+ typename = std::enable_if_t<is_input_iterator_v<It>>,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>,
+ typename = std::enable_if_t<!is_allocator_v<Hash>>,
+ typename = std::enable_if_t<!is_allocator_v<KeyEq>>,
+ typename = std::enable_if_t<!std::is_integral_v<Hash>>>
+concurrent_unordered_multimap( It, It, std::size_t = {}, Hash = Hash(), KeyEq = KeyEq(), Alloc = Alloc() )
+-> concurrent_unordered_multimap<iterator_key_t<It>, iterator_mapped_t<It>, Hash, KeyEq, Alloc>;
+
+template <typename Key, typename T,
+ typename Hash = std::hash<std::remove_const_t<Key>>,
+ typename KeyEq = std::equal_to<std::remove_const_t<Key>>,
+ typename Alloc = tbb::tbb_allocator<std::pair<const Key, T>>,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>,
+ typename = std::enable_if_t<!is_allocator_v<Hash>>,
+ typename = std::enable_if_t<!is_allocator_v<KeyEq>>,
+ typename = std::enable_if_t<!std::is_integral_v<Hash>>>
+concurrent_unordered_multimap( std::initializer_list<std::pair<Key, T>>, std::size_t = {},
+ Hash = Hash(), KeyEq = KeyEq(), Alloc = Alloc() )
+-> concurrent_unordered_multimap<std::remove_const_t<Key>, T, Hash, KeyEq, Alloc>;
+
+template <typename It, typename Alloc,
+ typename = std::enable_if_t<is_input_iterator_v<It>>,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>>
+concurrent_unordered_multimap( It, It, std::size_t, Alloc )
+-> concurrent_unordered_multimap<iterator_key_t<It>, iterator_mapped_t<It>,
+ std::hash<iterator_key_t<It>>,
+ std::equal_to<iterator_key_t<It>>, Alloc>;
+
+template <typename It, typename Hash, typename Alloc,
+ typename = std::enable_if_t<is_input_iterator_v<It>>,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>,
+ typename = std::enable_if_t<!is_allocator_v<Hash>>,
+ typename = std::enable_if_t<!std::is_integral_v<Hash>>>
+concurrent_unordered_multimap( It, It, std::size_t, Hash, Alloc )
+-> concurrent_unordered_multimap<iterator_key_t<It>, iterator_mapped_t<It>, Hash,
+ std::equal_to<iterator_key_t<It>>, Alloc>;
+
+template <typename Key, typename T, typename Alloc,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>>
+concurrent_unordered_multimap( std::initializer_list<std::pair<Key, T>>, std::size_t, Alloc )
+-> concurrent_unordered_multimap<std::remove_const_t<Key>, T, std::hash<std::remove_const_t<Key>>,
+ std::equal_to<std::remove_const_t<Key>>, Alloc>;
+
+template <typename Key, typename T, typename Alloc,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>>
+concurrent_unordered_multimap( std::initializer_list<std::pair<Key, T>>, Alloc )
+-> concurrent_unordered_multimap<std::remove_const_t<Key>, T, std::hash<std::remove_const_t<Key>>,
+ std::equal_to<std::remove_const_t<Key>>, Alloc>;
+
+template <typename Key, typename T, typename Hash, typename Alloc,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>,
+ typename = std::enable_if_t<!is_allocator_v<Hash>>,
+ typename = std::enable_if_t<!std::is_integral_v<Hash>>>
+concurrent_unordered_multimap( std::initializer_list<std::pair<Key, T>>, std::size_t, Hash, Alloc )
+-> concurrent_unordered_multimap<std::remove_const_t<Key>, T, Hash,
+ std::equal_to<std::remove_const_t<Key>>, Alloc>;
+
+#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
+
+template <typename Key, typename T, typename Hash, typename KeyEqual, typename Allocator>
+void swap( concurrent_unordered_multimap<Key, T, Hash, KeyEqual, Allocator>& lhs,
+ concurrent_unordered_multimap<Key, T, Hash, KeyEqual, Allocator>& rhs ) {
+ lhs.swap(rhs);
+}
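A short sketch of how the multimap above differs from the map (duplicate keys are kept) and of the node-transfer semantics of merge, again assuming a stock oneTBB installation:

    #include <oneapi/tbb/concurrent_unordered_map.h>
    #include <iostream>

    int main() {
        tbb::concurrent_unordered_multimap<int, char> mm;
        mm.insert({1, 'a'});
        mm.insert({1, 'b'});                     // duplicate keys are allowed here
        mm.insert({2, 'c'});

        tbb::concurrent_unordered_map<int, char> m;
        m.merge(mm);                             // moves one node per distinct key; the rest stay in mm

        std::cout << m.size() << ' ' << mm.size() << '\n';   // prints: 2 1
    }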
+
+} // namespace d1
+} // namespace detail
+
+inline namespace v1 {
+
+using detail::d1::concurrent_unordered_map;
+using detail::d1::concurrent_unordered_multimap;
+using detail::split;
+
+} // inline namespace v1
+} // namespace tbb
+
+#endif // __TBB_concurrent_unordered_map_H
diff --git a/contrib/libs/tbb/include/oneapi/tbb/concurrent_unordered_set.h b/contrib/libs/tbb/include/oneapi/tbb/concurrent_unordered_set.h
new file mode 100644
index 0000000000..ce6175294d
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/concurrent_unordered_set.h
@@ -0,0 +1,306 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_concurrent_unordered_set_H
+#define __TBB_concurrent_unordered_set_H
+
+#include "detail/_namespace_injection.h"
+#include "detail/_concurrent_unordered_base.h"
+#include "tbb_allocator.h"
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+template <typename Key, typename Hash, typename KeyEqual, typename Allocator, bool AllowMultimapping>
+struct concurrent_unordered_set_traits {
+ using key_type = Key;
+ using value_type = key_type;
+ using allocator_type = Allocator;
+ using hash_compare_type = hash_compare<key_type, Hash, KeyEqual>;
+ static constexpr bool allow_multimapping = AllowMultimapping;
+
+ static constexpr const key_type& get_key( const value_type& value ) {
+ return value;
+ }
+}; // struct concurrent_unordered_set_traits
+
+template <typename Key, typename Hash, typename KeyEqual, typename Allocator>
+class concurrent_unordered_multiset;
+
+template <typename Key, typename Hash = std::hash<Key>, typename KeyEqual = std::equal_to<Key>,
+ typename Allocator = tbb::tbb_allocator<Key>>
+class concurrent_unordered_set
+ : public concurrent_unordered_base<concurrent_unordered_set_traits<Key, Hash, KeyEqual, Allocator, false>>
+{
+ using traits_type = concurrent_unordered_set_traits<Key, Hash, KeyEqual, Allocator, false>;
+ using base_type = concurrent_unordered_base<traits_type>;
+public:
+ using key_type = typename base_type::key_type;
+ using value_type = typename base_type::value_type;
+ using size_type = typename base_type::size_type;
+ using difference_type = typename base_type::difference_type;
+ using hasher = typename base_type::hasher;
+ using key_equal = typename base_type::key_equal;
+ using allocator_type = typename base_type::allocator_type;
+ using reference = typename base_type::reference;
+ using const_reference = typename base_type::const_reference;
+ using pointer = typename base_type::pointer;
+ using const_pointer = typename base_type::const_pointer;
+ using iterator = typename base_type::iterator;
+ using const_iterator = typename base_type::const_iterator;
+ using local_iterator = typename base_type::local_iterator;
+ using const_local_iterator = typename base_type::const_local_iterator;
+ using node_type = typename base_type::node_type;
+
+    // Include constructors of base_type
+ using base_type::base_type;
+ using base_type::operator=;
+ // Required for implicit deduction guides
+ concurrent_unordered_set() = default;
+ concurrent_unordered_set( const concurrent_unordered_set& ) = default;
+ concurrent_unordered_set( const concurrent_unordered_set& other, const allocator_type& alloc ) : base_type(other, alloc) {}
+ concurrent_unordered_set( concurrent_unordered_set&& ) = default;
+ concurrent_unordered_set( concurrent_unordered_set&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {}
+ // Required to respect the rule of 5
+ concurrent_unordered_set& operator=( const concurrent_unordered_set& ) = default;
+ concurrent_unordered_set& operator=( concurrent_unordered_set&& ) = default;
+
+ template <typename OtherHash, typename OtherKeyEqual>
+ void merge( concurrent_unordered_set<key_type, OtherHash, OtherKeyEqual, allocator_type>& source ) {
+ this->internal_merge(source);
+ }
+
+ template <typename OtherHash, typename OtherKeyEqual>
+ void merge( concurrent_unordered_set<key_type, OtherHash, OtherKeyEqual, allocator_type>&& source ) {
+ this->internal_merge(std::move(source));
+ }
+
+ template <typename OtherHash, typename OtherKeyEqual>
+ void merge( concurrent_unordered_multiset<key_type, OtherHash, OtherKeyEqual, allocator_type>& source ) {
+ this->internal_merge(source);
+ }
+
+ template <typename OtherHash, typename OtherKeyEqual>
+ void merge( concurrent_unordered_multiset<key_type, OtherHash, OtherKeyEqual, allocator_type>&& source ) {
+ this->internal_merge(std::move(source));
+ }
+}; // class concurrent_unordered_set
+
+#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
+
+template <typename It,
+ typename Hash = std::hash<iterator_value_t<It>>,
+ typename KeyEq = std::equal_to<iterator_value_t<It>>,
+ typename Alloc = tbb::tbb_allocator<iterator_value_t<It>>,
+ typename = std::enable_if_t<is_input_iterator_v<It>>,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>,
+ typename = std::enable_if_t<!is_allocator_v<Hash>>,
+ typename = std::enable_if_t<!is_allocator_v<KeyEq>>,
+ typename = std::enable_if_t<!std::is_integral_v<Hash>>>
+concurrent_unordered_set( It, It, std::size_t = {}, Hash = Hash(), KeyEq = KeyEq(), Alloc = Alloc() )
+-> concurrent_unordered_set<iterator_value_t<It>, Hash, KeyEq, Alloc>;
+
+template <typename T,
+ typename Hash = std::hash<T>,
+ typename KeyEq = std::equal_to<T>,
+ typename Alloc = tbb::tbb_allocator<T>,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>,
+ typename = std::enable_if_t<!is_allocator_v<Hash>>,
+ typename = std::enable_if_t<!is_allocator_v<KeyEq>>,
+ typename = std::enable_if_t<!std::is_integral_v<Hash>>>
+concurrent_unordered_set( std::initializer_list<T>, std::size_t = {},
+ Hash = Hash(), KeyEq = KeyEq(), Alloc = Alloc() )
+-> concurrent_unordered_set<T, Hash, KeyEq, Alloc>;
+
+template <typename It, typename Alloc,
+ typename = std::enable_if_t<is_input_iterator_v<It>>,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>>
+concurrent_unordered_set( It, It, std::size_t, Alloc )
+-> concurrent_unordered_set<iterator_value_t<It>, std::hash<iterator_value_t<It>>,
+ std::equal_to<iterator_value_t<It>>, Alloc>;
+
+template <typename It, typename Hash, typename Alloc,
+ typename = std::enable_if_t<is_input_iterator_v<It>>,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>,
+ typename = std::enable_if_t<!is_allocator_v<Hash>>,
+ typename = std::enable_if_t<!std::is_integral_v<Hash>>>
+concurrent_unordered_set( It, It, std::size_t, Hash, Alloc )
+-> concurrent_unordered_set<iterator_value_t<It>, Hash, std::equal_to<iterator_value_t<It>>, Alloc>;
+
+template <typename T, typename Alloc,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>>
+concurrent_unordered_set( std::initializer_list<T>, std::size_t, Alloc )
+-> concurrent_unordered_set<T, std::hash<T>, std::equal_to<T>, Alloc>;
+
+template <typename T, typename Alloc,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>>
+concurrent_unordered_set( std::initializer_list<T>, Alloc )
+-> concurrent_unordered_set<T, std::hash<T>, std::equal_to<T>, Alloc>;
+
+template <typename T, typename Hash, typename Alloc,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>,
+ typename = std::enable_if_t<!is_allocator_v<Hash>>,
+ typename = std::enable_if_t<!std::is_integral_v<Hash>>>
+concurrent_unordered_set( std::initializer_list<T>, std::size_t, Hash, Alloc )
+-> concurrent_unordered_set<T, Hash, std::equal_to<T>, Alloc>;
+
+#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
+
+template <typename Key, typename Hash, typename KeyEqual, typename Allocator>
+void swap( concurrent_unordered_set<Key, Hash, KeyEqual, Allocator>& lhs,
+ concurrent_unordered_set<Key, Hash, KeyEqual, Allocator>& rhs ) {
+ lhs.swap(rhs);
+}
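To illustrate the deduction guides above, a minimal sketch that deduces the set's template arguments from an iterator range; it assumes a C++17 build against a stock oneTBB installation:

    #include <oneapi/tbb/concurrent_unordered_set.h>
    #include <string>
    #include <vector>

    int main() {
        std::vector<std::string> words{"alpha", "beta", "gamma", "beta"};

        // Deduces concurrent_unordered_set<std::string> with the default hash,
        // key equality and tbb::tbb_allocator.
        tbb::concurrent_unordered_set s(words.begin(), words.end());

        return s.count("beta") == 1 ? 0 : 1;     // duplicates collapse to a single element
    }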
+
+template <typename Key, typename Hash = std::hash<Key>, typename KeyEqual = std::equal_to<Key>,
+ typename Allocator = tbb::tbb_allocator<Key>>
+class concurrent_unordered_multiset
+ : public concurrent_unordered_base<concurrent_unordered_set_traits<Key, Hash, KeyEqual, Allocator, true>>
+{
+ using traits_type = concurrent_unordered_set_traits<Key, Hash, KeyEqual, Allocator, true>;
+ using base_type = concurrent_unordered_base<traits_type>;
+public:
+ using key_type = typename base_type::key_type;
+ using value_type = typename base_type::value_type;
+ using size_type = typename base_type::size_type;
+ using difference_type = typename base_type::difference_type;
+ using hasher = typename base_type::hasher;
+ using key_equal = typename base_type::key_equal;
+ using allocator_type = typename base_type::allocator_type;
+ using reference = typename base_type::reference;
+ using const_reference = typename base_type::const_reference;
+ using pointer = typename base_type::pointer;
+ using const_pointer = typename base_type::const_pointer;
+ using iterator = typename base_type::iterator;
+ using const_iterator = typename base_type::const_iterator;
+ using local_iterator = typename base_type::local_iterator;
+ using const_local_iterator = typename base_type::const_local_iterator;
+ using node_type = typename base_type::node_type;
+
+    // Include constructors of base_type
+ using base_type::base_type;
+ using base_type::operator=;
+
+ // Required for implicit deduction guides
+ concurrent_unordered_multiset() = default;
+ concurrent_unordered_multiset( const concurrent_unordered_multiset& ) = default;
+ concurrent_unordered_multiset( const concurrent_unordered_multiset& other, const allocator_type& alloc ) : base_type(other, alloc) {}
+ concurrent_unordered_multiset( concurrent_unordered_multiset&& ) = default;
+ concurrent_unordered_multiset( concurrent_unordered_multiset&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {}
+ // Required to respect the rule of 5
+ concurrent_unordered_multiset& operator=( const concurrent_unordered_multiset& ) = default;
+ concurrent_unordered_multiset& operator=( concurrent_unordered_multiset&& ) = default;
+
+ template <typename OtherHash, typename OtherKeyEqual>
+ void merge( concurrent_unordered_set<key_type, OtherHash, OtherKeyEqual, allocator_type>& source ) {
+ this->internal_merge(source);
+ }
+
+ template <typename OtherHash, typename OtherKeyEqual>
+ void merge( concurrent_unordered_set<key_type, OtherHash, OtherKeyEqual, allocator_type>&& source ) {
+ this->internal_merge(std::move(source));
+ }
+
+ template <typename OtherHash, typename OtherKeyEqual>
+ void merge( concurrent_unordered_multiset<key_type, OtherHash, OtherKeyEqual, allocator_type>& source ) {
+ this->internal_merge(source);
+ }
+
+ template <typename OtherHash, typename OtherKeyEqual>
+ void merge( concurrent_unordered_multiset<key_type, OtherHash, OtherKeyEqual, allocator_type>&& source ) {
+ this->internal_merge(std::move(source));
+ }
+}; // class concurrent_unordered_multiset
+
+#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
+template <typename It,
+ typename Hash = std::hash<iterator_value_t<It>>,
+ typename KeyEq = std::equal_to<iterator_value_t<It>>,
+ typename Alloc = tbb::tbb_allocator<iterator_value_t<It>>,
+ typename = std::enable_if_t<is_input_iterator_v<It>>,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>,
+ typename = std::enable_if_t<!is_allocator_v<Hash>>,
+ typename = std::enable_if_t<!is_allocator_v<KeyEq>>,
+ typename = std::enable_if_t<!std::is_integral_v<Hash>>>
+concurrent_unordered_multiset( It, It, std::size_t = {}, Hash = Hash(), KeyEq = KeyEq(), Alloc = Alloc() )
+-> concurrent_unordered_multiset<iterator_value_t<It>, Hash, KeyEq, Alloc>;
+
+template <typename T,
+ typename Hash = std::hash<T>,
+ typename KeyEq = std::equal_to<T>,
+ typename Alloc = tbb::tbb_allocator<T>,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>,
+ typename = std::enable_if_t<!is_allocator_v<Hash>>,
+ typename = std::enable_if_t<!is_allocator_v<KeyEq>>,
+ typename = std::enable_if_t<!std::is_integral_v<Hash>>>
+concurrent_unordered_multiset( std::initializer_list<T>, std::size_t = {},
+ Hash = Hash(), KeyEq = KeyEq(), Alloc = Alloc() )
+-> concurrent_unordered_multiset<T, Hash, KeyEq, Alloc>;
+
+template <typename It, typename Alloc,
+ typename = std::enable_if_t<is_input_iterator_v<It>>,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>>
+concurrent_unordered_multiset( It, It, std::size_t, Alloc )
+-> concurrent_unordered_multiset<iterator_value_t<It>, std::hash<iterator_value_t<It>>,
+ std::equal_to<iterator_value_t<It>>, Alloc>;
+
+template <typename It, typename Hash, typename Alloc,
+ typename = std::enable_if_t<is_input_iterator_v<It>>,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>,
+ typename = std::enable_if_t<!is_allocator_v<Hash>>,
+ typename = std::enable_if_t<!std::is_integral_v<Hash>>>
+concurrent_unordered_multiset( It, It, std::size_t, Hash, Alloc )
+-> concurrent_unordered_multiset<iterator_value_t<It>, Hash, std::equal_to<iterator_value_t<It>>, Alloc>;
+
+template <typename T, typename Alloc,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>>
+concurrent_unordered_multiset( std::initializer_list<T>, std::size_t, Alloc )
+-> concurrent_unordered_multiset<T, std::hash<T>, std::equal_to<T>, Alloc>;
+
+template <typename T, typename Alloc,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>>
+concurrent_unordered_multiset( std::initializer_list<T>, Alloc )
+-> concurrent_unordered_multiset<T, std::hash<T>, std::equal_to<T>, Alloc>;
+
+template <typename T, typename Hash, typename Alloc,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>,
+ typename = std::enable_if_t<!is_allocator_v<Hash>>,
+ typename = std::enable_if_t<!std::is_integral_v<Hash>>>
+concurrent_unordered_multiset( std::initializer_list<T>, std::size_t, Hash, Alloc )
+-> concurrent_unordered_multiset<T, Hash, std::equal_to<T>, Alloc>;
+
+#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
+
+template <typename Key, typename Hash, typename KeyEqual, typename Allocator>
+void swap( concurrent_unordered_multiset<Key, Hash, KeyEqual, Allocator>& lhs,
+ concurrent_unordered_multiset<Key, Hash, KeyEqual, Allocator>& rhs ) {
+ lhs.swap(rhs);
+}
+
+} // namespace d1
+} // namespace detail
+
+inline namespace v1 {
+
+using detail::d1::concurrent_unordered_set;
+using detail::d1::concurrent_unordered_multiset;
+using detail::split;
+
+} // inline namespace v1
+} // namespace tbb
+
+#endif // __TBB_concurrent_unordered_set_H
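And a brief sketch of the set/multiset pair declared in this header, showing that merge only transfers keys that are not already present in the target (stock oneTBB installation assumed):

    #include <oneapi/tbb/concurrent_unordered_set.h>
    #include <iostream>

    int main() {
        tbb::concurrent_unordered_multiset<int> ms{1, 1, 2, 3};
        tbb::concurrent_unordered_set<int> s{2};

        s.merge(ms);   // one node per new key moves; the duplicate 1 and the 2 stay in ms

        std::cout << s.size() << ' ' << ms.size() << '\n';   // prints: 3 2
    }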
diff --git a/contrib/libs/tbb/include/oneapi/tbb/concurrent_vector.h b/contrib/libs/tbb/include/oneapi/tbb/concurrent_vector.h
new file mode 100644
index 0000000000..94a22b92c6
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/concurrent_vector.h
@@ -0,0 +1,1114 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_concurrent_vector_H
+#define __TBB_concurrent_vector_H
+
+#include "detail/_namespace_injection.h"
+#include "detail/_utils.h"
+#include "detail/_assert.h"
+#include "detail/_allocator_traits.h"
+#include "detail/_segment_table.h"
+#include "detail/_containers_helpers.h"
+#include "blocked_range.h"
+#include "cache_aligned_allocator.h"
+
+#include <algorithm>
+#include <utility> // std::move_if_noexcept
+#if __TBB_CPP20_COMPARISONS_PRESENT
+#include <compare>
+#endif
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+template <typename Vector, typename Value>
+class vector_iterator {
+ using vector_type = Vector;
+
+public:
+ using value_type = Value;
+ using size_type = typename vector_type::size_type;
+ using difference_type = typename vector_type::difference_type;
+ using pointer = value_type*;
+ using reference = value_type&;
+ using iterator_category = std::random_access_iterator_tag;
+
+ template <typename Vec, typename Val>
+ friend vector_iterator<Vec, Val> operator+( typename vector_iterator<Vec, Val>::difference_type, const vector_iterator<Vec, Val>& );
+
+ template <typename Vec, typename Val1, typename Val2>
+ friend typename vector_iterator<Vec, Val1>::difference_type operator-( const vector_iterator<Vec, Val1>&, const vector_iterator<Vec, Val2>& );
+
+ template <typename Vec, typename Val1, typename Val2>
+ friend bool operator==( const vector_iterator<Vec, Val1>&, const vector_iterator<Vec, Val2>& );
+
+ template <typename Vec, typename Val1, typename Val2>
+ friend bool operator<( const vector_iterator<Vec, Val1>&, const vector_iterator<Vec, Val2>& );
+
+ template <typename Vec, typename Val>
+ friend class vector_iterator;
+
+ template <typename T, typename Allocator>
+ friend class concurrent_vector;
+
+private:
+ vector_iterator( const vector_type& vector, size_type index, value_type* item = nullptr )
+ : my_vector(const_cast<vector_type*>(&vector)), my_index(index), my_item(item)
+ {}
+
+public:
+ vector_iterator() : my_vector(nullptr), my_index(~size_type(0)), my_item(nullptr)
+ {}
+
+ vector_iterator( const vector_iterator<vector_type, typename vector_type::value_type>& other )
+ : my_vector(other.my_vector), my_index(other.my_index), my_item(other.my_item)
+ {}
+
+ vector_iterator& operator=( const vector_iterator<vector_type, typename vector_type::value_type>& other ) {
+ my_vector = other.my_vector;
+ my_index = other.my_index;
+ my_item = other.my_item;
+ return *this;
+ }
+
+ vector_iterator operator+( difference_type offset ) const {
+ return vector_iterator(*my_vector, my_index + offset);
+ }
+
+ vector_iterator& operator+=( difference_type offset ) {
+ my_index += offset;
+ my_item = nullptr;
+ return *this;
+ }
+
+ vector_iterator operator-( difference_type offset ) const {
+ return vector_iterator(*my_vector, my_index - offset);
+ }
+
+ vector_iterator& operator-=( difference_type offset ) {
+ my_index -= offset;
+ my_item = nullptr;
+ return *this;
+ }
+
+ reference operator*() const {
+ value_type *item = my_item;
+ if (item == nullptr) {
+ item = &my_vector->internal_subscript(my_index);
+ } else {
+ __TBB_ASSERT(item == &my_vector->internal_subscript(my_index), "corrupt cache");
+ }
+ return *item;
+ }
+
+ pointer operator->() const { return &(operator*()); }
+
+ reference operator[]( difference_type k ) const {
+ return my_vector->internal_subscript(my_index + k);
+ }
+
+ vector_iterator& operator++() {
+ ++my_index;
+ if (my_item != nullptr) {
+ if (vector_type::is_first_element_in_segment(my_index)) {
+                // If the iterator crosses a segment boundary, the cached pointer becomes invalid,
+                // because the next segment may reside at a different memory location
+ my_item = nullptr;
+ } else {
+ ++my_item;
+ }
+ }
+ return *this;
+ }
+
+ vector_iterator operator++(int) {
+ vector_iterator result = *this;
+ ++(*this);
+ return result;
+ }
+
+ vector_iterator& operator--() {
+ __TBB_ASSERT(my_index > 0, "operator--() applied to iterator already at beginning of concurrent_vector");
+ --my_index;
+ if (my_item != nullptr) {
+ if (vector_type::is_first_element_in_segment(my_index)) {
+                // If the iterator crosses a segment boundary, the cached pointer becomes invalid,
+                // because the next segment may reside at a different memory location
+ my_item = nullptr;
+ } else {
+ --my_item;
+ }
+ }
+ return *this;
+ }
+
+ vector_iterator operator--(int) {
+ vector_iterator result = *this;
+ --(*this);
+ return result;
+ }
+
+private:
+ // concurrent_vector over which we are iterating.
+ vector_type* my_vector;
+
+ // Index into the vector
+ size_type my_index;
+
+    // Caches the address of the element this iterator currently refers to.
+    // If my_item == nullptr, the cached value is unavailable; use internal_subscript(my_index) instead
+ mutable value_type* my_item;
+}; // class vector_iterator
+
+template <typename Vector, typename T>
+vector_iterator<Vector, T> operator+( typename vector_iterator<Vector, T>::difference_type offset,
+ const vector_iterator<Vector, T>& v )
+{
+ return vector_iterator<Vector, T>(*v.my_vector, v.my_index + offset);
+}
+
+template <typename Vector, typename T, typename U>
+typename vector_iterator<Vector, T>::difference_type operator-( const vector_iterator<Vector, T>& i,
+ const vector_iterator<Vector, U>& j )
+{
+ using difference_type = typename vector_iterator<Vector, T>::difference_type;
+ return static_cast<difference_type>(i.my_index) - static_cast<difference_type>(j.my_index);
+}
+
+template <typename Vector, typename T, typename U>
+bool operator==( const vector_iterator<Vector, T>& i, const vector_iterator<Vector, U>& j ) {
+ return i.my_vector == j.my_vector && i.my_index == j.my_index;
+}
+
+template <typename Vector, typename T, typename U>
+bool operator!=( const vector_iterator<Vector, T>& i, const vector_iterator<Vector, U>& j ) {
+ return !(i == j);
+}
+
+template <typename Vector, typename T, typename U>
+bool operator<( const vector_iterator<Vector, T>& i, const vector_iterator<Vector, U>& j ) {
+ return i.my_index < j.my_index;
+}
+
+template <typename Vector, typename T, typename U>
+bool operator>( const vector_iterator<Vector, T>& i, const vector_iterator<Vector, U>& j ) {
+ return j < i;
+}
+
+template <typename Vector, typename T, typename U>
+bool operator>=( const vector_iterator<Vector, T>& i, const vector_iterator<Vector, U>& j ) {
+ return !(i < j);
+}
+
+template <typename Vector, typename T, typename U>
+bool operator<=( const vector_iterator<Vector, T>& i, const vector_iterator<Vector, U>& j ) {
+ return !(j < i);
+}
+
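The free operators above complete vector_iterator's random-access interface; a minimal sketch of what that enables on the container defined below (stock oneTBB installation assumed):

    #include <oneapi/tbb/concurrent_vector.h>
    #include <iostream>

    int main() {
        tbb::concurrent_vector<int> v;
        for (int i = 0; i < 10; ++i) v.push_back(i * i);

        auto it = v.begin() + 3;                    // iterator arithmetic
        std::cout << *it << ' '                     // 9
                  << it[2] << ' '                   // 25
                  << (v.end() - v.begin()) << '\n'; // 10
    }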
+static constexpr std::size_t embedded_table_num_segments = 3;
+
+template <typename T, typename Allocator = tbb::cache_aligned_allocator<T>>
+class concurrent_vector
+ : private segment_table<T, Allocator, concurrent_vector<T, Allocator>, embedded_table_num_segments>
+{
+ using self_type = concurrent_vector<T, Allocator>;
+ using base_type = segment_table<T, Allocator, self_type, embedded_table_num_segments>;
+
+ friend class segment_table<T, Allocator, self_type, embedded_table_num_segments>;
+
+ template <typename Iterator>
+ class generic_range_type : public tbb::blocked_range<Iterator> {
+ using base_type = tbb::blocked_range<Iterator>;
+ public:
+ using value_type = T;
+ using reference = T&;
+ using const_reference = const T&;
+ using iterator = Iterator;
+ using difference_type = std::ptrdiff_t;
+
+ using base_type::base_type;
+
+ template<typename U>
+ generic_range_type( const generic_range_type<U>& r) : blocked_range<Iterator>(r.begin(), r.end(), r.grainsize()) {}
+ generic_range_type( generic_range_type& r, split ) : blocked_range<Iterator>(r, split()) {}
+ }; // class generic_range_type
+
+ static_assert(std::is_same<T, typename Allocator::value_type>::value,
+ "value_type of the container must be the same as its allocator's");
+ using allocator_traits_type = tbb::detail::allocator_traits<Allocator>;
+ // Segment table for concurrent_vector can be extended
+ static constexpr bool allow_table_extending = true;
+ static constexpr bool is_noexcept_assignment = allocator_traits_type::propagate_on_container_move_assignment::value ||
+ allocator_traits_type::is_always_equal::value;
+ static constexpr bool is_noexcept_swap = allocator_traits_type::propagate_on_container_swap::value ||
+ allocator_traits_type::is_always_equal::value;
+
+public:
+ using value_type = T;
+ using allocator_type = Allocator;
+ using size_type = std::size_t;
+ using difference_type = std::ptrdiff_t;
+ using reference = value_type&;
+ using const_reference = const value_type&;
+
+ using pointer = typename allocator_traits_type::pointer;
+ using const_pointer = typename allocator_traits_type::const_pointer;
+
+ using iterator = vector_iterator<concurrent_vector, value_type>;
+ using const_iterator = vector_iterator<concurrent_vector, const value_type>;
+ using reverse_iterator = std::reverse_iterator<iterator>;
+ using const_reverse_iterator = std::reverse_iterator<const_iterator>;
+
+ using range_type = generic_range_type<iterator>;
+ using const_range_type = generic_range_type<const_iterator>;
+
+ concurrent_vector() : concurrent_vector(allocator_type()) {}
+
+ explicit concurrent_vector( const allocator_type& alloc ) noexcept
+ : base_type(alloc)
+ {}
+
+ explicit concurrent_vector( size_type count, const value_type& value,
+ const allocator_type& alloc = allocator_type() )
+ : concurrent_vector(alloc)
+ {
+ try_call( [&] {
+ grow_by(count, value);
+ } ).on_exception( [&] {
+ base_type::clear();
+ });
+ }
+
+ explicit concurrent_vector( size_type count, const allocator_type& alloc = allocator_type() )
+ : concurrent_vector(alloc)
+ {
+ try_call( [&] {
+ grow_by(count);
+ } ).on_exception( [&] {
+ base_type::clear();
+ });
+ }
+
+ template <typename InputIterator>
+ concurrent_vector( InputIterator first, InputIterator last, const allocator_type& alloc = allocator_type() )
+ : concurrent_vector(alloc)
+ {
+ try_call( [&] {
+ grow_by(first, last);
+ } ).on_exception( [&] {
+ base_type::clear();
+ });
+ }
+
+ concurrent_vector( const concurrent_vector& other )
+ : base_type(segment_table_allocator_traits::select_on_container_copy_construction(other.get_allocator()))
+ {
+ try_call( [&] {
+ grow_by(other.begin(), other.end());
+ } ).on_exception( [&] {
+ base_type::clear();
+ });
+ }
+
+ concurrent_vector( const concurrent_vector& other, const allocator_type& alloc )
+ : base_type(other, alloc) {}
+
+ concurrent_vector(concurrent_vector&& other) noexcept
+ : base_type(std::move(other))
+ {}
+
+ concurrent_vector( concurrent_vector&& other, const allocator_type& alloc )
+ : base_type(std::move(other), alloc)
+ {}
+
+ concurrent_vector( std::initializer_list<value_type> init,
+ const allocator_type& alloc = allocator_type() )
+ : concurrent_vector(init.begin(), init.end(), alloc)
+ {}
+
+ ~concurrent_vector() {}
+
+ // Assignment
+ concurrent_vector& operator=( const concurrent_vector& other ) {
+ base_type::operator=(other);
+ return *this;
+ }
+
+ concurrent_vector& operator=( concurrent_vector&& other ) noexcept(is_noexcept_assignment) {
+ base_type::operator=(std::move(other));
+ return *this;
+ }
+
+ concurrent_vector& operator=( std::initializer_list<value_type> init ) {
+ assign(init);
+ return *this;
+ }
+
+ void assign( size_type count, const value_type& value ) {
+ destroy_elements();
+ grow_by(count, value);
+ }
+
+ template <typename InputIterator>
+ typename std::enable_if<is_input_iterator<InputIterator>::value, void>::type
+ assign( InputIterator first, InputIterator last ) {
+ destroy_elements();
+ grow_by(first, last);
+ }
+
+ void assign( std::initializer_list<value_type> init ) {
+ destroy_elements();
+ assign(init.begin(), init.end());
+ }
+
+ // Concurrent growth
+ iterator grow_by( size_type delta ) {
+ return internal_grow_by_delta(delta);
+ }
+
+ iterator grow_by( size_type delta, const value_type& value ) {
+ return internal_grow_by_delta(delta, value);
+ }
+
+ template <typename ForwardIterator>
+ typename std::enable_if<is_input_iterator<ForwardIterator>::value, iterator>::type
+ grow_by( ForwardIterator first, ForwardIterator last ) {
+ auto delta = std::distance(first, last);
+ return internal_grow_by_delta(delta, first, last);
+ }
+
+ iterator grow_by( std::initializer_list<value_type> init ) {
+ return grow_by(init.begin(), init.end());
+ }
+
+ iterator grow_to_at_least( size_type n ) {
+ return internal_grow_to_at_least(n);
+ }
+ iterator grow_to_at_least( size_type n, const value_type& value ) {
+ return internal_grow_to_at_least(n, value);
+ }
+
+ iterator push_back( const value_type& item ) {
+ return internal_emplace_back(item);
+ }
+
+ iterator push_back( value_type&& item ) {
+ return internal_emplace_back(std::move(item));
+ }
+
+ template <typename... Args>
+ iterator emplace_back( Args&&... args ) {
+ return internal_emplace_back(std::forward<Args>(args)...);
+ }
+
+ // Items access
+ reference operator[]( size_type index ) {
+ return internal_subscript(index);
+ }
+ const_reference operator[]( size_type index ) const {
+ return internal_subscript(index);
+ }
+
+ reference at( size_type index ) {
+ return internal_subscript_with_exceptions(index);
+ }
+ const_reference at( size_type index ) const {
+ return internal_subscript_with_exceptions(index);
+ }
+
+ // Get range for iterating with parallel algorithms
+ range_type range( size_t grainsize = 1 ) {
+ return range_type(begin(), end(), grainsize);
+ }
+
+ // Get const range for iterating with parallel algorithms
+ const_range_type range( size_t grainsize = 1 ) const {
+ return const_range_type(begin(), end(), grainsize);
+ }
+
+ reference front() {
+ return internal_subscript(0);
+ }
+
+ const_reference front() const {
+ return internal_subscript(0);
+ }
+
+ reference back() {
+ return internal_subscript(size() - 1);
+ }
+
+ const_reference back() const {
+ return internal_subscript(size() - 1);
+ }
+
+ // Iterators
+ iterator begin() { return iterator(*this, 0); }
+ const_iterator begin() const { return const_iterator(*this, 0); }
+ const_iterator cbegin() const { return const_iterator(*this, 0); }
+
+ iterator end() { return iterator(*this, size()); }
+ const_iterator end() const { return const_iterator(*this, size()); }
+ const_iterator cend() const { return const_iterator(*this, size()); }
+
+ reverse_iterator rbegin() { return reverse_iterator(end()); }
+ const_reverse_iterator rbegin() const { return const_reverse_iterator(end()); }
+ const_reverse_iterator crbegin() const { return const_reverse_iterator(cend()); }
+
+ reverse_iterator rend() { return reverse_iterator(begin()); }
+ const_reverse_iterator rend() const { return const_reverse_iterator(begin()); }
+ const_reverse_iterator crend() const { return const_reverse_iterator(cbegin()); }
+
+ allocator_type get_allocator() const {
+ return base_type::get_allocator();
+ }
+
+ // Storage
+ bool empty() const noexcept {
+ return 0 == size();
+ }
+
+ size_type size() const noexcept {
+ return std::min(this->my_size.load(std::memory_order_acquire), capacity());
+ }
+
+ size_type max_size() const noexcept {
+ return allocator_traits_type::max_size(base_type::get_allocator());
+ }
+
+ size_type capacity() const noexcept {
+ return base_type::capacity();
+ }
+
+ void reserve( size_type n ) {
+ if (n == 0) return;
+
+ if (n > max_size()) {
+ tbb::detail::throw_exception(exception_id::reservation_length_error);
+ }
+
+ this->assign_first_block_if_necessary(this->segment_index_of(n - 1) + 1);
+ base_type::reserve(n);
+ }
+
+ void resize( size_type n ) {
+ internal_resize(n);
+ }
+
+ void resize( size_type n, const value_type& val ) {
+ internal_resize(n, val);
+ }
+
+ void shrink_to_fit() {
+ internal_compact();
+ }
+
+ void swap(concurrent_vector& other) noexcept(is_noexcept_swap) {
+ base_type::swap(other);
+ }
+
+ void clear() {
+ destroy_elements();
+ }
+
+private:
+ using segment_type = typename base_type::segment_type;
+ using segment_table_type = typename base_type::segment_table_type;
+ using segment_table_allocator_traits = typename base_type::segment_table_allocator_traits;
+ using segment_index_type = typename base_type::segment_index_type;
+
+ using segment_element_type = typename base_type::value_type;
+ using segment_element_allocator_type = typename allocator_traits_type::template rebind_alloc<segment_element_type>;
+ using segment_element_allocator_traits = tbb::detail::allocator_traits<segment_element_allocator_type>;
+
+ segment_table_type allocate_long_table( const typename base_type::atomic_segment* embedded_table, size_type start_index ) {
+ __TBB_ASSERT(start_index <= this->embedded_table_size, "Start index out of embedded table");
+
+ // If other threads are trying to set pointers in the short segment, wait for them to finish their
+ // assignments before we copy the short segment to the long segment. Note: grow_to_at_least depends on it
+ for (segment_index_type i = 0; this->segment_base(i) < start_index; ++i) {
+ spin_wait_while_eq(embedded_table[i], segment_type(nullptr));
+ }
+
+    // It is possible that the table was already extended by the thread that allocated first_block, so check for that.
+ if (this->get_table() != embedded_table) {
+ return nullptr;
+ }
+
+ // Allocate long segment table and fill with null pointers
+ segment_table_type new_segment_table = segment_table_allocator_traits::allocate(base_type::get_allocator(), this->pointers_per_long_table);
+ // Copy segment pointers from the embedded table
+ for (size_type segment_index = 0; segment_index < this->pointers_per_embedded_table; ++segment_index) {
+ segment_table_allocator_traits::construct(base_type::get_allocator(), &new_segment_table[segment_index],
+ embedded_table[segment_index].load(std::memory_order_relaxed));
+ }
+ for (size_type segment_index = this->pointers_per_embedded_table; segment_index < this->pointers_per_long_table; ++segment_index) {
+ segment_table_allocator_traits::construct(base_type::get_allocator(), &new_segment_table[segment_index], nullptr);
+ }
+
+ return new_segment_table;
+ }
+
+ // create_segment function is required by the segment_table base class
+ segment_type create_segment( segment_table_type table, segment_index_type seg_index, size_type index ) {
+ size_type first_block = this->my_first_block.load(std::memory_order_relaxed);
+ // First block allocation
+ if (seg_index < first_block) {
+            // If segment 0 is already allocated, it only remains to wait until the requested segment is filled in
+ if (table[0].load(std::memory_order_acquire) != nullptr) {
+ spin_wait_while_eq(table[seg_index], segment_type(nullptr));
+ return nullptr;
+ }
+
+ segment_element_allocator_type segment_allocator(base_type::get_allocator());
+ segment_type new_segment = nullptr;
+ size_type first_block_size = this->segment_size(first_block);
+ try_call( [&] {
+ new_segment = segment_element_allocator_traits::allocate(segment_allocator, first_block_size);
+ } ).on_exception( [&] {
+ segment_type disabled_segment = nullptr;
+ if (table[0].compare_exchange_strong(disabled_segment, this->segment_allocation_failure_tag)) {
+ size_type end_segment = table == this->my_embedded_table ? this->pointers_per_embedded_table : first_block;
+ for (size_type i = 1; i < end_segment; ++i) {
+ table[i].store(this->segment_allocation_failure_tag, std::memory_order_release);
+ }
+ }
+ });
+
+ segment_type disabled_segment = nullptr;
+ if (table[0].compare_exchange_strong(disabled_segment, new_segment)) {
+ this->extend_table_if_necessary(table, 0, first_block_size);
+ for (size_type i = 1; i < first_block; ++i) {
+ table[i].store(new_segment, std::memory_order_release);
+ }
+
+                // Other threads may be waiting on a snapshot of the embedded table, so it needs to be filled as well.
+ for (size_type i = 1; i < first_block && i < this->pointers_per_embedded_table; ++i) {
+ this->my_embedded_table[i].store(new_segment, std::memory_order_release);
+ }
+ } else if (new_segment != this->segment_allocation_failure_tag) {
+ // Deallocate the memory
+ segment_element_allocator_traits::deallocate(segment_allocator, new_segment, first_block_size);
+                // Segment 0 is already allocated, so it only remains to wait until the requested segment is filled in
+ spin_wait_while_eq(table[seg_index], segment_type(nullptr));
+ }
+ } else {
+ size_type offset = this->segment_base(seg_index);
+ if (index == offset) {
+ __TBB_ASSERT(table[seg_index].load(std::memory_order_relaxed) == nullptr, "Only this thread can enable this segment");
+ segment_element_allocator_type segment_allocator(base_type::get_allocator());
+ segment_type new_segment = this->segment_allocation_failure_tag;
+ try_call( [&] {
+ new_segment = segment_element_allocator_traits::allocate(segment_allocator,this->segment_size(seg_index));
+ // Shift base address to simplify access by index
+ new_segment -= this->segment_base(seg_index);
+ } ).on_completion( [&] {
+ table[seg_index].store(new_segment, std::memory_order_release);
+ });
+ } else {
+ spin_wait_while_eq(table[seg_index], segment_type(nullptr));
+ }
+ }
+ return nullptr;
+ }
+
+    // Returns the number of elements in the segment to be destroyed
+ size_type number_of_elements_in_segment( segment_index_type seg_index ) {
+ size_type curr_vector_size = this->my_size.load(std::memory_order_relaxed);
+ size_type curr_segment_base = this->segment_base(seg_index);
+
+ if (seg_index == 0) {
+ return std::min(curr_vector_size, this->segment_size(seg_index));
+ } else {
+            // The segment may be allocated but still contain no elements
+ if (curr_vector_size < curr_segment_base) {
+ return 0;
+ }
+ return curr_segment_base * 2 > curr_vector_size ? curr_vector_size - curr_segment_base : curr_segment_base;
+ }
+ }
+
+ void deallocate_segment( segment_type address, segment_index_type seg_index ) {
+ segment_element_allocator_type segment_allocator(base_type::get_allocator());
+ size_type first_block = this->my_first_block.load(std::memory_order_relaxed);
+ if (seg_index >= first_block) {
+ segment_element_allocator_traits::deallocate(segment_allocator, address, this->segment_size(seg_index));
+ }
+ else if (seg_index == 0) {
+ size_type elements_to_deallocate = first_block > 0 ? this->segment_size(first_block) : this->segment_size(0);
+ segment_element_allocator_traits::deallocate(segment_allocator, address, elements_to_deallocate);
+ }
+ }
+
+ // destroy_segment function is required by the segment_table base class
+ void destroy_segment( segment_type address, segment_index_type seg_index ) {
+ size_type elements_to_destroy = number_of_elements_in_segment(seg_index);
+ segment_element_allocator_type segment_allocator(base_type::get_allocator());
+
+ for (size_type i = 0; i < elements_to_destroy; ++i) {
+ segment_element_allocator_traits::destroy(segment_allocator, address + i);
+ }
+
+ deallocate_segment(address, seg_index);
+ }
+
+ // copy_segment function is required by the segment_table base class
+ void copy_segment( segment_index_type seg_index, segment_type from, segment_type to ) {
+ size_type i = 0;
+ try_call( [&] {
+ for (; i != number_of_elements_in_segment(seg_index); ++i) {
+ segment_table_allocator_traits::construct(base_type::get_allocator(), to + i, from[i]);
+ }
+ } ).on_exception( [&] {
+ // Zero-initialize items left not constructed after the exception
+ zero_unconstructed_elements(this->get_segment(seg_index) + i, this->segment_size(seg_index) - i);
+
+ segment_index_type last_segment = this->segment_index_of(this->my_size.load(std::memory_order_relaxed));
+ auto table = this->get_table();
+ for (segment_index_type j = seg_index + 1; j != last_segment; ++j) {
+ auto curr_segment = table[j].load(std::memory_order_relaxed);
+ if (curr_segment) {
+ zero_unconstructed_elements(curr_segment + this->segment_base(j), this->segment_size(j));
+ }
+ }
+ this->my_size.store(this->segment_size(seg_index) + i, std::memory_order_relaxed);
+ });
+ }
+
+ // move_segment function is required by the segment_table base class
+ void move_segment( segment_index_type seg_index, segment_type from, segment_type to ) {
+ size_type i = 0;
+ try_call( [&] {
+ for (; i != number_of_elements_in_segment(seg_index); ++i) {
+ segment_table_allocator_traits::construct(base_type::get_allocator(), to + i, std::move(from[i]));
+ }
+ } ).on_exception( [&] {
+ // Zero-initialize items left not constructed after the exception
+ zero_unconstructed_elements(this->get_segment(seg_index) + i, this->segment_size(seg_index) - i);
+
+ segment_index_type last_segment = this->segment_index_of(this->my_size.load(std::memory_order_relaxed));
+ auto table = this->get_table();
+ for (segment_index_type j = seg_index + 1; j != last_segment; ++j) {
+ auto curr_segment = table[j].load(std::memory_order_relaxed);
+ if (curr_segment) {
+ zero_unconstructed_elements(curr_segment + this->segment_base(j), this->segment_size(j));
+ }
+ }
+ this->my_size.store(this->segment_size(seg_index) + i, std::memory_order_relaxed);
+ });
+ }
+
+ static constexpr bool is_first_element_in_segment( size_type index ) {
+ // An element is the first in a segment if its index is equal to a power of two
+ return is_power_of_two_at_least(index, 2);
+ }
+
+ const_reference internal_subscript( size_type index ) const {
+ return const_cast<self_type*>(this)->internal_subscript(index);
+ }
+
+ reference internal_subscript( size_type index ) {
+ __TBB_ASSERT(index < this->my_size.load(std::memory_order_relaxed), "Invalid subscript index");
+ return base_type::template internal_subscript</*allow_out_of_range_access=*/false>(index);
+ }
+
+ const_reference internal_subscript_with_exceptions( size_type index ) const {
+ return const_cast<self_type*>(this)->internal_subscript_with_exceptions(index);
+ }
+
+ reference internal_subscript_with_exceptions( size_type index ) {
+ if (index >= this->my_size.load(std::memory_order_acquire)) {
+ tbb::detail::throw_exception(exception_id::out_of_range);
+ }
+
+ segment_table_type table = this->my_segment_table.load(std::memory_order_acquire);
+
+ size_type seg_index = this->segment_index_of(index);
+ if (base_type::number_of_segments(table) < seg_index) {
+ tbb::detail::throw_exception(exception_id::out_of_range);
+ }
+
+ if (table[seg_index] <= this->segment_allocation_failure_tag) {
+ tbb::detail::throw_exception(exception_id::out_of_range);
+ }
+
+ return base_type::template internal_subscript</*allow_out_of_range_access=*/false>(index);
+ }
+
+ static void zero_unconstructed_elements( pointer start, size_type count ) {
+ std::memset(static_cast<void *>(start), 0, count * sizeof(value_type));
+ }
+
+ template <typename... Args>
+ iterator internal_emplace_back( Args&&... args ) {
+ size_type old_size = this->my_size++;
+ this->assign_first_block_if_necessary(default_first_block_size);
+ auto element_address = &base_type::template internal_subscript</*allow_out_of_range_access=*/true>(old_size);
+
+ // try_call API is not convenient here due to broken
+ // variadic capture on GCC 4.8.5
+ auto value_guard = make_raii_guard([&] {
+ zero_unconstructed_elements(element_address, /*count =*/1);
+ });
+
+ segment_table_allocator_traits::construct(base_type::get_allocator(), element_address, std::forward<Args>(args)...);
+ value_guard.dismiss();
+ return iterator(*this, old_size, element_address);
+ }
+
+ template <typename... Args>
+ void internal_loop_construct( segment_table_type table, size_type start_idx, size_type end_idx, const Args&... args ) {
+ static_assert(sizeof...(Args) < 2, "Too many parameters");
+ for (size_type idx = start_idx; idx < end_idx; ++idx) {
+ auto element_address = &base_type::template internal_subscript</*allow_out_of_range_access=*/true>(idx);
+ // try_call API is not convenient here due to broken
+ // variadic capture on GCC 4.8.5
+ auto value_guard = make_raii_guard( [&] {
+ segment_index_type last_allocated_segment = this->find_last_allocated_segment(table);
+ size_type segment_size = this->segment_size(last_allocated_segment);
+ end_idx = end_idx < segment_size ? end_idx : segment_size;
+ for (size_type i = idx; i < end_idx; ++i) {
+ zero_unconstructed_elements(&this->internal_subscript(i), /*count =*/1);
+ }
+ });
+ segment_table_allocator_traits::construct(base_type::get_allocator(), element_address, args...);
+ value_guard.dismiss();
+ }
+ }
+
+ template <typename ForwardIterator>
+ void internal_loop_construct( segment_table_type table, size_type start_idx, size_type end_idx, ForwardIterator first, ForwardIterator ) {
+ for (size_type idx = start_idx; idx < end_idx; ++idx) {
+ auto element_address = &base_type::template internal_subscript</*allow_out_of_range_access=*/true>(idx);
+ try_call( [&] {
+ segment_table_allocator_traits::construct(base_type::get_allocator(), element_address, *first++);
+ } ).on_exception( [&] {
+ segment_index_type last_allocated_segment = this->find_last_allocated_segment(table);
+ size_type segment_size = this->segment_size(last_allocated_segment);
+ end_idx = end_idx < segment_size ? end_idx : segment_size;
+ for (size_type i = idx; i < end_idx; ++i) {
+ zero_unconstructed_elements(&this->internal_subscript(i), /*count =*/1);
+ }
+ });
+ }
+ }
+
+ template <typename... Args>
+ iterator internal_grow( size_type start_idx, size_type end_idx, const Args&... args ) {
+ this->assign_first_block_if_necessary(this->segment_index_of(end_idx - 1) + 1);
+ size_type seg_index = this->segment_index_of(end_idx - 1);
+ segment_table_type table = this->get_table();
+ this->extend_table_if_necessary(table, start_idx, end_idx);
+
+ if (seg_index > this->my_first_block.load(std::memory_order_relaxed)) {
+            // Allocate the last segment of grow_by immediately so that other threads are able to work with it,
+            // provided that the last segment is not smaller than the first block
+ if (table[seg_index].load(std::memory_order_relaxed) == nullptr) {
+ size_type first_element = this->segment_base(seg_index);
+ if (first_element >= start_idx && first_element < end_idx) {
+ segment_type segment = table[seg_index].load(std::memory_order_relaxed);
+ base_type::enable_segment(segment, table, seg_index, first_element);
+ }
+ }
+ }
+
+ internal_loop_construct(table, start_idx, end_idx, args...);
+
+ return iterator(*this, start_idx, &base_type::template internal_subscript</*allow_out_of_range_access=*/false>(start_idx));
+ }
+
+
+ template <typename... Args>
+ iterator internal_grow_by_delta( size_type delta, const Args&... args ) {
+ if (delta == size_type(0)) {
+ return end();
+ }
+ size_type start_idx = this->my_size.fetch_add(delta);
+ size_type end_idx = start_idx + delta;
+
+ return internal_grow(start_idx, end_idx, args...);
+ }
+
+ template <typename... Args>
+ iterator internal_grow_to_at_least( size_type new_size, const Args&... args ) {
+ size_type old_size = this->my_size.load(std::memory_order_relaxed);
+ if (new_size == size_type(0)) return iterator(*this, 0);
+ while (old_size < new_size && !this->my_size.compare_exchange_weak(old_size, new_size))
+ {}
+
+ int delta = static_cast<int>(new_size) - static_cast<int>(old_size);
+ if (delta > 0) {
+ return internal_grow(old_size, new_size, args...);
+ }
+
+ size_type end_segment = this->segment_index_of(new_size - 1);
+
+        // Check for/wait until segment allocation completes
+ if (end_segment >= this->pointers_per_embedded_table &&
+ this->get_table() == this->my_embedded_table)
+ {
+ spin_wait_while_eq(this->my_segment_table, this->my_embedded_table);
+ }
+
+ for (segment_index_type seg_idx = 0; seg_idx <= end_segment; ++seg_idx) {
+ if (this->get_table()[seg_idx].load(std::memory_order_relaxed) == nullptr) {
+ atomic_backoff backoff(true);
+ while (this->get_table()[seg_idx].load(std::memory_order_relaxed) == nullptr) {
+ backoff.pause();
+ }
+ }
+ }
+
+ #if TBB_USE_DEBUG
+ size_type cap = capacity();
+ __TBB_ASSERT( cap >= new_size, NULL);
+ #endif
+ return iterator(*this, size());
+ }
+
+ template <typename... Args>
+ void internal_resize( size_type n, const Args&... args ) {
+ if (n == 0) {
+ clear();
+ return;
+ }
+
+ size_type old_size = this->my_size.load(std::memory_order_acquire);
+ if (n > old_size) {
+ reserve(n);
+ grow_to_at_least(n, args...);
+ } else {
+ if (old_size == n) {
+ return;
+ }
+ size_type last_segment = this->segment_index_of(old_size - 1);
+ // Delete segments
+ for (size_type seg_idx = this->segment_index_of(n - 1) + 1; seg_idx <= last_segment; ++seg_idx) {
+ this->delete_segment(seg_idx);
+ }
+
+            // Segments past the one containing index n - 1 were deleted above together with their elements.
+            // In that remaining segment, destroy only the elements with indices in [n, min(segment end, old size)).
+ size_type n_segment = this->segment_index_of(n - 1);
+ size_type last_index_to_destroy = std::min(this->segment_base(n_segment) + this->segment_size(n_segment), old_size);
+ // Destroy elements in curr segment
+ for (size_type idx = n; idx < last_index_to_destroy; ++idx) {
+ segment_table_allocator_traits::destroy(base_type::get_allocator(), &base_type::template internal_subscript</*allow_out_of_range_access=*/false>(idx));
+ }
+ this->my_size.store(n, std::memory_order_release);
+ }
+ }
+
+ void destroy_elements() {
+ allocator_type alloc(base_type::get_allocator());
+ for (size_type i = 0; i < this->my_size.load(std::memory_order_relaxed); ++i) {
+ allocator_traits_type::destroy(alloc, &base_type::template internal_subscript</*allow_out_of_range_access=*/false>(i));
+ }
+ this->my_size.store(0, std::memory_order_relaxed);
+ }
+
+ static bool incompact_predicate( size_type size ) {
+ // memory page size
+ const size_type page_size = 4096;
+ return size < page_size || ((size - 1) % page_size < page_size / 2 && size < page_size * 128);
+ }
+
+ void internal_compact() {
+ const size_type curr_size = this->my_size.load(std::memory_order_relaxed);
+ segment_table_type table = this->get_table();
+ const segment_index_type k_end = this->find_last_allocated_segment(table); // allocated segments
+ const segment_index_type k_stop = curr_size ? this->segment_index_of(curr_size - 1) + 1 : 0; // number of segments to store existing items: 0=>0; 1,2=>1; 3,4=>2; [5-8]=>3;..
+ const segment_index_type first_block = this->my_first_block; // number of merged segments, getting values from atomics
+
+ segment_index_type k = first_block;
+ if (k_stop < first_block) {
+ k = k_stop;
+ }
+ else {
+ while (k < k_stop && incompact_predicate(this->segment_size(k) * sizeof(value_type))) k++;
+ }
+
+ if (k_stop == k_end && k == first_block) {
+ return;
+ }
+
+ // First segment optimization
+ if (k != first_block && k) {
+ size_type max_block = std::max(first_block, k);
+
+ auto buffer_table = segment_table_allocator_traits::allocate(base_type::get_allocator(), max_block);
+
+ for (size_type seg_idx = 0; seg_idx < max_block; ++seg_idx) {
+ segment_table_allocator_traits::construct(base_type::get_allocator(), &buffer_table[seg_idx],
+ table[seg_idx].load(std::memory_order_relaxed));
+ table[seg_idx].store(nullptr, std::memory_order_relaxed);
+ }
+
+ this->my_first_block.store(k, std::memory_order_relaxed);
+ size_type index = 0;
+ try_call( [&] {
+ for (; index < std::min(this->segment_size(max_block), curr_size); ++index) {
+ auto element_address = &static_cast<base_type*>(this)->operator[](index);
+ segment_index_type seg_idx = this->segment_index_of(index);
+ segment_table_allocator_traits::construct(base_type::get_allocator(), element_address,
+ std::move_if_noexcept(buffer_table[seg_idx].load(std::memory_order_relaxed)[index]));
+ }
+ } ).on_exception( [&] {
+ segment_element_allocator_type allocator(base_type::get_allocator());
+ for (size_type i = 0; i < index; ++i) {
+                    auto element_address = &this->operator[](i);
+                    segment_element_allocator_traits::destroy(allocator, element_address);
+ }
+ segment_element_allocator_traits::deallocate(allocator,
+ table[0].load(std::memory_order_relaxed), this->segment_size(max_block));
+
+ for (size_type seg_idx = 0; seg_idx < max_block; ++seg_idx) {
+ table[seg_idx].store(buffer_table[seg_idx].load(std::memory_order_relaxed),
+ std::memory_order_relaxed);
+ buffer_table[seg_idx].store(nullptr, std::memory_order_relaxed);
+ }
+ segment_table_allocator_traits::deallocate(base_type::get_allocator(),
+ buffer_table, max_block);
+ this->my_first_block.store(first_block, std::memory_order_relaxed);
+ });
+
+            // The old segments now need to be deallocated correctly.
+            // destroy_segment() respects the currently active first_block, so restore the
+            // previous first_block value before destroying the segments that are no longer needed.
+ this->my_first_block.store(first_block, std::memory_order_relaxed);
+ for (size_type seg_idx = max_block; seg_idx > 0 ; --seg_idx) {
+ auto curr_segment = buffer_table[seg_idx - 1].load(std::memory_order_relaxed);
+ if (curr_segment != nullptr) {
+ destroy_segment(buffer_table[seg_idx - 1].load(std::memory_order_relaxed) + this->segment_base(seg_idx - 1),
+ seg_idx - 1);
+ }
+ }
+
+ this->my_first_block.store(k, std::memory_order_relaxed);
+
+ for (size_type seg_idx = 0; seg_idx < max_block; ++seg_idx) {
+ segment_table_allocator_traits::destroy(base_type::get_allocator(), &buffer_table[seg_idx]);
+ }
+
+ segment_table_allocator_traits::deallocate(base_type::get_allocator(), buffer_table, max_block);
+ }
+ // free unnecessary segments allocated by reserve() call
+ if (k_stop < k_end) {
+ for (size_type seg_idx = k_end; seg_idx != k_stop; --seg_idx) {
+ if (table[seg_idx - 1].load(std::memory_order_relaxed) != nullptr) {
+ this->delete_segment(seg_idx - 1);
+ }
+ }
+            if (!k) this->my_first_block.store(0, std::memory_order_relaxed);
+ }
+ }
+
+ // Lever for adjusting the size of first_block at the very first insertion.
+ // TODO: consider >1 value, check performance
+ static constexpr size_type default_first_block_size = 1;
+
+ template <typename Vector, typename Value>
+ friend class vector_iterator;
+}; // class concurrent_vector
+
+#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
+// Deduction guide for the constructor from two iterators
+template <typename It, typename Alloc = tbb::cache_aligned_allocator<iterator_value_t<It>>,
+ typename = std::enable_if_t<is_input_iterator_v<It>>,
+ typename = std::enable_if_t<is_allocator_v<Alloc>>>
+concurrent_vector( It, It, Alloc = Alloc() )
+-> concurrent_vector<iterator_value_t<It>, Alloc>;
+#endif
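+
+// Illustrative usage sketch for the deduction guide above (not part of the library;
+// `data` is a hypothetical std::vector<int>):
+//
+//     std::vector<int> data{1, 2, 3};
+//     tbb::concurrent_vector cv(data.begin(), data.end()); // deduces concurrent_vector<int>
+//                                                          // with the default allocator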
+
+template <typename T, typename Allocator>
+void swap(concurrent_vector<T, Allocator> &lhs,
+ concurrent_vector<T, Allocator> &rhs)
+{
+ lhs.swap(rhs);
+}
+
+template <typename T, typename Allocator>
+bool operator==(const concurrent_vector<T, Allocator> &lhs,
+ const concurrent_vector<T, Allocator> &rhs)
+{
+ return lhs.size() == rhs.size() && std::equal(lhs.begin(), lhs.end(), rhs.begin());
+}
+
+#if !__TBB_CPP20_COMPARISONS_PRESENT
+template <typename T, typename Allocator>
+bool operator!=(const concurrent_vector<T, Allocator> &lhs,
+ const concurrent_vector<T, Allocator> &rhs)
+{
+ return !(lhs == rhs);
+}
+#endif // !__TBB_CPP20_COMPARISONS_PRESENT
+
+#if __TBB_CPP20_COMPARISONS_PRESENT && __TBB_CPP20_CONCEPTS_PRESENT
+template <typename T, typename Allocator>
+tbb::detail::synthesized_three_way_result<typename concurrent_vector<T, Allocator>::value_type>
+operator<=>(const concurrent_vector<T, Allocator> &lhs,
+ const concurrent_vector<T, Allocator> &rhs)
+{
+ return std::lexicographical_compare_three_way(lhs.begin(), lhs.end(),
+ rhs.begin(), rhs.end(),
+ tbb::detail::synthesized_three_way_comparator{});
+}
+
+#else
+
+template <typename T, typename Allocator>
+bool operator<(const concurrent_vector<T, Allocator> &lhs,
+ const concurrent_vector<T, Allocator> &rhs)
+{
+ return std::lexicographical_compare(lhs.begin(), lhs.end(), rhs.begin(), rhs.end());
+}
+
+template <typename T, typename Allocator>
+bool operator<=(const concurrent_vector<T, Allocator> &lhs,
+ const concurrent_vector<T, Allocator> &rhs)
+{
+ return !(rhs < lhs);
+}
+
+template <typename T, typename Allocator>
+bool operator>(const concurrent_vector<T, Allocator> &lhs,
+ const concurrent_vector<T, Allocator> &rhs)
+{
+ return rhs < lhs;
+}
+
+template <typename T, typename Allocator>
+bool operator>=(const concurrent_vector<T, Allocator> &lhs,
+ const concurrent_vector<T, Allocator> &rhs)
+{
+ return !(lhs < rhs);
+}
+#endif // __TBB_CPP20_COMPARISONS_PRESENT && __TBB_CPP20_CONCEPTS_PRESENT
+
+} // namespace d1
+} // namespace detail
+
+inline namespace v1 {
+ using detail::d1::concurrent_vector;
+} // namespace v1
+
+} // namespace tbb
+
+#endif // __TBB_concurrent_vector_H
diff --git a/contrib/libs/tbb/include/oneapi/tbb/detail/_aggregator.h b/contrib/libs/tbb/include/oneapi/tbb/detail/_aggregator.h
new file mode 100644
index 0000000000..40ba64e43d
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/detail/_aggregator.h
@@ -0,0 +1,173 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+
+#ifndef __TBB_detail__aggregator_H
+#define __TBB_detail__aggregator_H
+
+#include "_assert.h"
+#include "_utils.h"
+#include <atomic>
+#if !__TBBMALLOC_BUILD // TODO: check this macro with TBB Malloc
+#include "../profiling.h"
+#endif
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+// Base class for aggregated operation
+template <typename Derived>
+class aggregated_operation {
+public:
+    // Zero value means "wait" status; all other values are user-specified and
+    // are defined within the scope of the class that uses "status"
+ std::atomic<uintptr_t> status;
+
+ std::atomic<Derived*> next;
+ aggregated_operation() : status{}, next(nullptr) {}
+}; // class aggregated_operation
+
+// Aggregator base class
+/* An aggregator for collecting operations coming from multiple sources and executing
+ them serially on a single thread. OperationType must be derived from
+ aggregated_operation. The parameter HandlerType is a functor that will be passed the
+ list of operations and is expected to handle each operation appropriately, setting the
+ status of each operation to non-zero. */
+template <typename OperationType>
+class aggregator_generic {
+public:
+ aggregator_generic() : pending_operations(nullptr), handler_busy(false) {}
+
+ // Execute an operation
+ /* Places an operation into the waitlist (pending_operations), and either handles the list,
+ or waits for the operation to complete, or returns.
+ The long_life_time parameter specifies the life time of the given operation object.
+ Operations with long_life_time == true may be accessed after execution.
+ A "short" life time operation (long_life_time == false) can be destroyed
+ during execution, and so any access to it after it was put into the waitlist,
+ including status check, is invalid. As a consequence, waiting for completion
+       of such an operation causes undefined behavior. */
+ template <typename HandlerType>
+ void execute( OperationType* op, HandlerType& handle_operations, bool long_life_time = true ) {
+ // op->status should be read before inserting the operation into the
+ // aggregator waitlist since it can become invalid after executing a
+ // handler (if the operation has 'short' life time.)
+ const uintptr_t status = op->status.load(std::memory_order_relaxed);
+
+ // ITT note: &(op->status) tag is used to cover accesses to this op node. This
+ // thread has created the operation, and now releases it so that the handler
+ // thread may handle the associated operation w/o triggering a race condition;
+ // thus this tag will be acquired just before the operation is handled in the
+ // handle_operations functor.
+ call_itt_notify(releasing, &(op->status));
+ // insert the operation in the queue.
+ OperationType* res = pending_operations.load(std::memory_order_relaxed);
+ do {
+ op->next.store(res, std::memory_order_relaxed);
+ } while (!pending_operations.compare_exchange_strong(res, op));
+ if (!res) { // first in the list; handle the operations
+ // ITT note: &pending_operations tag covers access to the handler_busy flag,
+ // which this waiting handler thread will try to set before entering
+ // handle_operations.
+ call_itt_notify(acquired, &pending_operations);
+ start_handle_operations(handle_operations);
+ // The operation with 'short' life time can already be destroyed
+ if (long_life_time)
+ __TBB_ASSERT(op->status.load(std::memory_order_relaxed), NULL);
+ }
+ // Not first; wait for op to be ready
+ else if (!status) { // operation is blocking here.
+ __TBB_ASSERT(long_life_time, "Waiting for an operation object that might be destroyed during processing");
+ call_itt_notify(prepare, &(op->status));
+ spin_wait_while_eq(op->status, uintptr_t(0));
+ }
+ }
+
+private:
+ // Trigger the handling of operations when the handler is free
+ template <typename HandlerType>
+ void start_handle_operations( HandlerType& handle_operations ) {
+ OperationType* op_list;
+
+ // ITT note: &handler_busy tag covers access to pending_operations as it is passed
+ // between active and waiting handlers. Below, the waiting handler waits until
+ // the active handler releases, and the waiting handler acquires &handler_busy as
+ // it becomes the active_handler. The release point is at the end of this
+ // function, when all operations in pending_operations have been handled by the
+ // owner of this aggregator.
+ call_itt_notify(prepare, &handler_busy);
+ // get the handler_busy:
+ // only one thread can possibly spin here at a time
+ spin_wait_until_eq(handler_busy, uintptr_t(0));
+ call_itt_notify(acquired, &handler_busy);
+ // acquire fence not necessary here due to causality rule and surrounding atomics
+ handler_busy.store(1, std::memory_order_relaxed);
+
+ // ITT note: &pending_operations tag covers access to the handler_busy flag
+ // itself. Capturing the state of the pending_operations signifies that
+ // handler_busy has been set and a new active handler will now process that list's
+ // operations.
+ call_itt_notify(releasing, &pending_operations);
+ // grab pending_operations
+ op_list = pending_operations.exchange(nullptr);
+
+ // handle all the operations
+ handle_operations(op_list);
+
+ // release the handler
+ handler_busy.store(0, std::memory_order_release);
+ }
+
+ // An atomically updated list (aka mailbox) of pending operations
+ std::atomic<OperationType*> pending_operations;
+    // Controls threads' access to handle_operations
+ std::atomic<uintptr_t> handler_busy;
+}; // class aggregator_generic
+
+template <typename HandlerType, typename OperationType>
+class aggregator : public aggregator_generic<OperationType> {
+ HandlerType handle_operations;
+public:
+ aggregator() = default;
+
+ void initialize_handler( HandlerType h ) { handle_operations = h; }
+
+ void execute(OperationType* op) {
+ aggregator_generic<OperationType>::execute(op, handle_operations);
+ }
+}; // class aggregator
+
+// the most-compatible friend declaration (vs, gcc, icc) is
+// template<class U, class V> friend class aggregating_functor;
+template <typename AggregatingClass, typename OperationList>
+class aggregating_functor {
+ AggregatingClass* my_object;
+public:
+ aggregating_functor() = default;
+ aggregating_functor( AggregatingClass* object ) : my_object(object) {
+ __TBB_ASSERT(my_object, nullptr);
+ }
+
+ void operator()( OperationList* op_list ) { my_object->handle_operations(op_list); }
+}; // class aggregating_functor
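+
+// Illustrative sketch of how these pieces are meant to compose (not part of the
+// library; count_op, my_counter and its members below are hypothetical names):
+//
+//     struct count_op : aggregated_operation<count_op> { int delta; };
+//
+//     class my_counter {
+//         friend class aggregating_functor<my_counter, count_op>;
+//         using handler_type = aggregating_functor<my_counter, count_op>;
+//         aggregator<handler_type, count_op> my_aggregator;
+//         int my_value = 0;
+//         void handle_operations( count_op* op_list ) {
+//             while (op_list) {
+//                 // Read "next" before publishing the status: once status becomes
+//                 // non-zero the waiting thread may return and destroy the operation.
+//                 count_op* next = op_list->next.load(std::memory_order_relaxed);
+//                 my_value += op_list->delta;
+//                 op_list->status.store(1, std::memory_order_release);
+//                 op_list = next;
+//             }
+//         }
+//     public:
+//         my_counter() { my_aggregator.initialize_handler(handler_type(this)); }
+//         void add( int d ) { count_op op; op.delta = d; my_aggregator.execute(&op); }
+//     };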
+
+
+} // namespace d1
+} // namespace detail
+} // namespace tbb
+
+#endif // __TBB_detail__aggregator_H
diff --git a/contrib/libs/tbb/include/oneapi/tbb/detail/_aligned_space.h b/contrib/libs/tbb/include/oneapi/tbb/detail/_aligned_space.h
new file mode 100644
index 0000000000..13857c47cc
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/detail/_aligned_space.h
@@ -0,0 +1,46 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#ifndef __TBB_aligned_space_H
+#define __TBB_aligned_space_H
+
+#include <cstddef>
+
+#include "_template_helpers.h"
+
+namespace tbb {
+namespace detail {
+inline namespace d0 {
+
+//! Block of space aligned sufficiently to construct an array T with N elements.
+/** The elements are not constructed or destroyed by this class.
+ @ingroup memory_allocation */
+template<typename T, std::size_t N = 1>
+class aligned_space {
+ alignas(alignof(T)) std::uint8_t aligned_array[N * sizeof(T)];
+
+public:
+ //! Pointer to beginning of array
+ T* begin() const { return punned_cast<T*>(&aligned_array); }
+
+ //! Pointer to one past last element in array.
+ T* end() const { return begin() + N; }
+};
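+
+// Usage sketch (illustrative only; MyType is a hypothetical type): the storage is raw,
+// so elements must be placement-constructed and destroyed by the caller.
+//
+//     aligned_space<MyType, 4> storage;
+//     MyType* first = new (storage.begin()) MyType();
+//     // ... use *first ...
+//     first->~MyType();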
+
+} // namespace d0
+} // namespace detail
+} // namespace tbb
+
+#endif /* __TBB_aligned_space_H */
diff --git a/contrib/libs/tbb/include/oneapi/tbb/detail/_allocator_traits.h b/contrib/libs/tbb/include/oneapi/tbb/detail/_allocator_traits.h
new file mode 100644
index 0000000000..8c60e25e7e
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/detail/_allocator_traits.h
@@ -0,0 +1,107 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_detail__allocator_traits_H
+#define __TBB_detail__allocator_traits_H
+
+#include "_config.h"
+#include "_template_helpers.h"
+#include <memory>
+#include <type_traits>
+
+namespace tbb {
+namespace detail {
+inline namespace d0 {
+
+#if !__TBB_CPP17_ALLOCATOR_IS_ALWAYS_EQUAL_PRESENT
+// Struct is_always_equal_detector provides the member type "type" which is
+// Allocator::is_always_equal if it is present, std::false_type otherwise
+template <typename Allocator, typename = void>
+struct is_always_equal_detector {
+ using type = std::false_type;
+};
+
+template <typename Allocator>
+struct is_always_equal_detector<Allocator, tbb::detail::void_t<typename Allocator::is_always_equal>>
+{
+ using type = typename Allocator::is_always_equal;
+};
+#endif // !__TBB_CPP17_ALLOCATOR_IS_ALWAYS_EQUAL_PRESENT
+
+template <typename Allocator>
+class allocator_traits : public std::allocator_traits<Allocator>
+{
+ using base_type = std::allocator_traits<Allocator>;
+public:
+#if !__TBB_CPP17_ALLOCATOR_IS_ALWAYS_EQUAL_PRESENT
+ using is_always_equal = typename is_always_equal_detector<Allocator>::type;
+#endif
+
+ template <typename T>
+ using rebind_traits = typename tbb::detail::allocator_traits<typename base_type::template rebind_alloc<T>>;
+}; // struct allocator_traits
+
+template <typename Allocator>
+void copy_assign_allocators_impl( Allocator& lhs, const Allocator& rhs, /*pocca = */std::true_type ) {
+ lhs = rhs;
+}
+
+template <typename Allocator>
+void copy_assign_allocators_impl( Allocator&, const Allocator&, /*pocca = */ std::false_type ) {}
+
+// Copy assigns allocators only if propagate_on_container_copy_assignment is true
+template <typename Allocator>
+void copy_assign_allocators( Allocator& lhs, const Allocator& rhs ) {
+ using pocca_type = typename allocator_traits<Allocator>::propagate_on_container_copy_assignment;
+ copy_assign_allocators_impl(lhs, rhs, pocca_type());
+}
+
+template <typename Allocator>
+void move_assign_allocators_impl( Allocator& lhs, Allocator& rhs, /*pocma = */ std::true_type ) {
+ lhs = std::move(rhs);
+}
+
+template <typename Allocator>
+void move_assign_allocators_impl( Allocator&, Allocator&, /*pocma = */ std::false_type ) {}
+
+// Move assigns allocators only if propagate_on_container_move_assignment is true
+template <typename Allocator>
+void move_assign_allocators( Allocator& lhs, Allocator& rhs ) {
+ using pocma_type = typename allocator_traits<Allocator>::propagate_on_container_move_assignment;
+ move_assign_allocators_impl(lhs, rhs, pocma_type());
+}
+
+template <typename Allocator>
+void swap_allocators_impl( Allocator& lhs, Allocator& rhs, /*pocs = */ std::true_type ) {
+ using std::swap;
+ swap(lhs, rhs);
+}
+
+template <typename Allocator>
+void swap_allocators_impl( Allocator&, Allocator&, /*pocs = */ std::false_type ) {}
+
+// Swaps allocators only if propagate_on_container_swap is true
+template <typename Allocator>
+void swap_allocators( Allocator& lhs, Allocator& rhs ) {
+ using pocs_type = typename allocator_traits<Allocator>::propagate_on_container_swap;
+ swap_allocators_impl(lhs, rhs, pocs_type());
+}
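+
+// Usage sketch (illustrative only; my_container, my_allocator and copy_elements_from
+// are hypothetical): a container's copy assignment delegates allocator handling to the
+// helper above, so the allocator is copied only when POCCA is std::true_type.
+//
+//     my_container& my_container::operator=( const my_container& other ) {
+//         if (this != &other) {
+//             clear();
+//             copy_assign_allocators(my_allocator, other.my_allocator);
+//             copy_elements_from(other);
+//         }
+//         return *this;
+//     }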
+
+} // inline namespace d0
+} // namespace detail
+} // namespace tbb
+
+#endif // __TBB_detail__allocator_traits_H
diff --git a/contrib/libs/tbb/include/oneapi/tbb/detail/_assert.h b/contrib/libs/tbb/include/oneapi/tbb/detail/_assert.h
new file mode 100644
index 0000000000..4116386a92
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/detail/_assert.h
@@ -0,0 +1,52 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_detail__assert_H
+#define __TBB_detail__assert_H
+
+#include "_config.h"
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+//! Process an assertion failure.
+/** Normally called from __TBB_ASSERT macro.
+ If assertion handler is null, print message for assertion failure and abort.
+ Otherwise call the assertion handler. */
+void __TBB_EXPORTED_FUNC assertion_failure(const char* filename, int line, const char* expression, const char* comment);
+} // namespace r1
+} // namespace detail
+} // namespace tbb
+
+//! Release version of assertions
+#define __TBB_ASSERT_RELEASE(predicate,message) ((predicate)?((void)0) : tbb::detail::r1::assertion_failure(__FILE__,__LINE__,#predicate,message))
+
+#if TBB_USE_ASSERT
+ //! Assert that predicate is true.
+ /** If predicate is false, print assertion failure message.
+ If the comment argument is not NULL, it is printed as part of the failure message.
+ The comment argument has no other effect. */
+ #define __TBB_ASSERT(predicate,message) __TBB_ASSERT_RELEASE(predicate,message)
+ //! "Extended" version
+ #define __TBB_ASSERT_EX __TBB_ASSERT
+#else
+ //! No-op version of __TBB_ASSERT.
+ #define __TBB_ASSERT(predicate,comment) ((void)0)
+ //! "Extended" version is useful to suppress warnings if a variable is only used with an assert
+ #define __TBB_ASSERT_EX(predicate,comment) ((void)(1 && (predicate)))
+#endif // TBB_USE_ASSERT
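+
+// Usage sketch (illustrative only; ptr, bytes_written and expected are hypothetical):
+//
+//     __TBB_ASSERT(ptr != nullptr, "unexpected null pointer");
+//     // __TBB_ASSERT_EX keeps the predicate "used" even when assertions are compiled
+//     // out, avoiding unused-variable warnings in release builds.
+//     __TBB_ASSERT_EX(bytes_written == expected, nullptr);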
+
+#endif // __TBB_detail__assert_H
diff --git a/contrib/libs/tbb/include/oneapi/tbb/detail/_concurrent_queue_base.h b/contrib/libs/tbb/include/oneapi/tbb/detail/_concurrent_queue_base.h
new file mode 100644
index 0000000000..6289632601
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/detail/_concurrent_queue_base.h
@@ -0,0 +1,659 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_detail__concurrent_queue_base_H
+#define __TBB_detail__concurrent_queue_base_H
+
+#include "_utils.h"
+#include "_exception.h"
+#include "_machine.h"
+#include "_allocator_traits.h"
+
+#include "../profiling.h"
+#include "../spin_mutex.h"
+#include "../cache_aligned_allocator.h"
+
+#include <atomic>
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+using ticket_type = std::size_t;
+
+template <typename Page>
+inline bool is_valid_page(const Page p) {
+ return reinterpret_cast<std::uintptr_t>(p) > 1;
+}
+
+template <typename T, typename Allocator>
+struct concurrent_queue_rep;
+
+template <typename Container, typename T, typename Allocator>
+class micro_queue_pop_finalizer;
+
+#if _MSC_VER && !defined(__INTEL_COMPILER)
+// unary minus operator applied to unsigned type, result still unsigned
+#pragma warning( push )
+#pragma warning( disable: 4146 )
+#endif
+
+// A queue using simple locking.
+// For efficiency, this class has no constructor.
+// The caller is expected to zero-initialize it.
+template <typename T, typename Allocator>
+class micro_queue {
+private:
+ using queue_rep_type = concurrent_queue_rep<T, Allocator>;
+ using self_type = micro_queue<T, Allocator>;
+public:
+ using size_type = std::size_t;
+ using value_type = T;
+ using reference = value_type&;
+ using const_reference = const value_type&;
+
+ using allocator_type = Allocator;
+ using allocator_traits_type = tbb::detail::allocator_traits<allocator_type>;
+
+ static constexpr size_type item_size = sizeof(T);
+ static constexpr size_type items_per_page = item_size <= 8 ? 32 :
+ item_size <= 16 ? 16 :
+ item_size <= 32 ? 8 :
+ item_size <= 64 ? 4 :
+ item_size <= 128 ? 2 : 1;
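+    // For example, 8-byte items are stored 32 per page, while items larger than
+    // 128 bytes get a page of their own.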
+
+ struct padded_page {
+ padded_page() {}
+ ~padded_page() {}
+
+ reference operator[] (std::size_t index) {
+ __TBB_ASSERT(index < items_per_page, "Index out of range");
+ return items[index];
+ }
+
+ const_reference operator[] (std::size_t index) const {
+ __TBB_ASSERT(index < items_per_page, "Index out of range");
+ return items[index];
+ }
+
+ padded_page* next{ nullptr };
+ std::atomic<std::uintptr_t> mask{};
+
+ union {
+ value_type items[items_per_page];
+ };
+ }; // struct padded_page
+
+ using page_allocator_type = typename allocator_traits_type::template rebind_alloc<padded_page>;
+protected:
+ using page_allocator_traits = tbb::detail::allocator_traits<page_allocator_type>;
+
+public:
+ using item_constructor_type = void (*)(value_type* location, const void* src);
+ micro_queue() = default;
+ micro_queue( const micro_queue& ) = delete;
+ micro_queue& operator=( const micro_queue& ) = delete;
+
+ size_type prepare_page( ticket_type k, queue_rep_type& base, page_allocator_type page_allocator,
+ padded_page*& p ) {
+ __TBB_ASSERT(p == nullptr, "Invalid page argument for prepare_page");
+ k &= -queue_rep_type::n_queue;
+ size_type index = modulo_power_of_two(k / queue_rep_type::n_queue, items_per_page);
+ if (!index) {
+ try_call( [&] {
+ p = page_allocator_traits::allocate(page_allocator, 1);
+ }).on_exception( [&] {
+ ++base.n_invalid_entries;
+ invalidate_page( k );
+ });
+ page_allocator_traits::construct(page_allocator, p);
+ }
+
+ if (tail_counter.load(std::memory_order_relaxed) != k) spin_wait_until_my_turn(tail_counter, k, base);
+ call_itt_notify(acquired, &tail_counter);
+
+ if (p) {
+ spin_mutex::scoped_lock lock( page_mutex );
+ padded_page* q = tail_page.load(std::memory_order_relaxed);
+ if (is_valid_page(q)) {
+ q->next = p;
+ } else {
+ head_page.store(p, std::memory_order_relaxed);
+ }
+            tail_page.store(p, std::memory_order_relaxed);
+ } else {
+ p = tail_page.load(std::memory_order_acquire); // TODO may be relaxed ?
+ }
+ return index;
+ }
+
+ template<typename... Args>
+ void push( ticket_type k, queue_rep_type& base, Args&&... args )
+ {
+ padded_page* p = nullptr;
+ page_allocator_type page_allocator(base.get_allocator());
+ size_type index = prepare_page(k, base, page_allocator, p);
+ __TBB_ASSERT(p != nullptr, "Page was not prepared");
+
+ // try_call API is not convenient here due to broken
+ // variadic capture on GCC 4.8.5
+ auto value_guard = make_raii_guard([&] {
+ ++base.n_invalid_entries;
+ call_itt_notify(releasing, &tail_counter);
+ tail_counter.fetch_add(queue_rep_type::n_queue);
+ });
+
+ page_allocator_traits::construct(page_allocator, &(*p)[index], std::forward<Args>(args)...);
+ // If no exception was thrown, mark item as present.
+ p->mask.store(p->mask.load(std::memory_order_relaxed) | uintptr_t(1) << index, std::memory_order_relaxed);
+ call_itt_notify(releasing, &tail_counter);
+
+ value_guard.dismiss();
+ tail_counter.fetch_add(queue_rep_type::n_queue);
+ }
+
+ void abort_push( ticket_type k, queue_rep_type& base) {
+ padded_page* p = nullptr;
+ prepare_page(k, base, base.get_allocator(), p);
+ ++base.n_invalid_entries;
+ tail_counter.fetch_add(queue_rep_type::n_queue);
+ }
+
+ bool pop( void* dst, ticket_type k, queue_rep_type& base ) {
+ k &= -queue_rep_type::n_queue;
+ if (head_counter.load(std::memory_order_relaxed) != k) spin_wait_until_eq(head_counter, k);
+ call_itt_notify(acquired, &head_counter);
+ if (tail_counter.load(std::memory_order_relaxed) == k) spin_wait_while_eq(tail_counter, k);
+ call_itt_notify(acquired, &tail_counter);
+ padded_page *p = head_page.load(std::memory_order_acquire);
+ __TBB_ASSERT( p, nullptr );
+ size_type index = modulo_power_of_two( k/queue_rep_type::n_queue, items_per_page );
+ bool success = false;
+ {
+ page_allocator_type page_allocator(base.get_allocator());
+ micro_queue_pop_finalizer<self_type, value_type, page_allocator_type> finalizer(*this, page_allocator,
+ k + queue_rep_type::n_queue, index == items_per_page - 1 ? p : nullptr );
+ if (p->mask.load(std::memory_order_relaxed) & (std::uintptr_t(1) << index)) {
+ success = true;
+ assign_and_destroy_item( dst, *p, index );
+ } else {
+ --base.n_invalid_entries;
+ }
+ }
+ return success;
+ }
+
+ micro_queue& assign( const micro_queue& src, queue_rep_type& base,
+ item_constructor_type construct_item )
+ {
+ head_counter.store(src.head_counter.load(std::memory_order_relaxed), std::memory_order_relaxed);
+ tail_counter.store(src.tail_counter.load(std::memory_order_relaxed), std::memory_order_relaxed);
+
+ const padded_page* srcp = src.head_page.load(std::memory_order_relaxed);
+ if( is_valid_page(srcp) ) {
+ ticket_type g_index = head_counter.load(std::memory_order_relaxed);
+ size_type n_items = (tail_counter.load(std::memory_order_relaxed) - head_counter.load(std::memory_order_relaxed))
+ / queue_rep_type::n_queue;
+ size_type index = modulo_power_of_two(head_counter.load(std::memory_order_relaxed) / queue_rep_type::n_queue, items_per_page);
+ size_type end_in_first_page = (index+n_items < items_per_page) ? (index + n_items) : items_per_page;
+
+ try_call( [&] {
+ head_page.store(make_copy(base, srcp, index, end_in_first_page, g_index, construct_item), std::memory_order_relaxed);
+ }).on_exception( [&] {
+ head_counter.store(0, std::memory_order_relaxed);
+ tail_counter.store(0, std::memory_order_relaxed);
+ });
+ padded_page* cur_page = head_page.load(std::memory_order_relaxed);
+
+ try_call( [&] {
+ if (srcp != src.tail_page.load(std::memory_order_relaxed)) {
+ for (srcp = srcp->next; srcp != src.tail_page.load(std::memory_order_relaxed); srcp=srcp->next ) {
+ cur_page->next = make_copy( base, srcp, 0, items_per_page, g_index, construct_item );
+ cur_page = cur_page->next;
+ }
+
+ __TBB_ASSERT(srcp == src.tail_page.load(std::memory_order_relaxed), nullptr );
+ size_type last_index = modulo_power_of_two(tail_counter.load(std::memory_order_relaxed) / queue_rep_type::n_queue, items_per_page);
+ if( last_index==0 ) last_index = items_per_page;
+
+ cur_page->next = make_copy( base, srcp, 0, last_index, g_index, construct_item );
+ cur_page = cur_page->next;
+ }
+ tail_page.store(cur_page, std::memory_order_relaxed);
+ }).on_exception( [&] {
+ padded_page* invalid_page = reinterpret_cast<padded_page*>(std::uintptr_t(1));
+ tail_page.store(invalid_page, std::memory_order_relaxed);
+ });
+ } else {
+ head_page.store(nullptr, std::memory_order_relaxed);
+ tail_page.store(nullptr, std::memory_order_relaxed);
+ }
+ return *this;
+ }
+
+ padded_page* make_copy( queue_rep_type& base, const padded_page* src_page, size_type begin_in_page,
+ size_type end_in_page, ticket_type& g_index, item_constructor_type construct_item )
+ {
+ page_allocator_type page_allocator(base.get_allocator());
+ padded_page* new_page = page_allocator_traits::allocate(page_allocator, 1);
+ new_page->next = nullptr;
+ new_page->mask.store(src_page->mask.load(std::memory_order_relaxed), std::memory_order_relaxed);
+ for (; begin_in_page!=end_in_page; ++begin_in_page, ++g_index) {
+ if (new_page->mask.load(std::memory_order_relaxed) & uintptr_t(1) << begin_in_page) {
+ copy_item(*new_page, begin_in_page, *src_page, begin_in_page, construct_item);
+ }
+ }
+ return new_page;
+ }
+
+ void invalidate_page( ticket_type k ) {
+ // Append an invalid page at address 1 so that no more pushes are allowed.
+ padded_page* invalid_page = reinterpret_cast<padded_page*>(std::uintptr_t(1));
+ {
+ spin_mutex::scoped_lock lock( page_mutex );
+ tail_counter.store(k + queue_rep_type::n_queue + 1, std::memory_order_relaxed);
+ padded_page* q = tail_page.load(std::memory_order_relaxed);
+ if (is_valid_page(q)) {
+ q->next = invalid_page;
+ } else {
+ head_page.store(invalid_page, std::memory_order_relaxed);
+ }
+ tail_page.store(invalid_page, std::memory_order_relaxed);
+ }
+ }
+
+ padded_page* get_tail_page() {
+ return tail_page.load(std::memory_order_relaxed);
+ }
+
+ padded_page* get_head_page() {
+ return head_page.load(std::memory_order_relaxed);
+ }
+
+ void set_tail_page( padded_page* pg ) {
+ tail_page.store(pg, std::memory_order_relaxed);
+ }
+
+ void clear(queue_rep_type& base) {
+ padded_page* curr_page = head_page.load(std::memory_order_relaxed);
+ std::size_t index = head_counter.load(std::memory_order_relaxed);
+ page_allocator_type page_allocator(base.get_allocator());
+
+ while (curr_page) {
+ for (; index != items_per_page - 1; ++index) {
+ curr_page->operator[](index).~value_type();
+ }
+ padded_page* next_page = curr_page->next;
+ page_allocator_traits::destroy(page_allocator, curr_page);
+ page_allocator_traits::deallocate(page_allocator, curr_page, 1);
+ curr_page = next_page;
+ }
+
+ padded_page* invalid_page = reinterpret_cast<padded_page*>(std::uintptr_t(1));
+ head_page.store(invalid_page, std::memory_order_relaxed);
+ tail_page.store(invalid_page, std::memory_order_relaxed);
+ }
+
+private:
+ // template <typename U, typename A>
+ friend class micro_queue_pop_finalizer<self_type, value_type, page_allocator_type>;
+
+ // Class used to ensure exception-safety of method "pop"
+ class destroyer {
+ value_type& my_value;
+ public:
+ destroyer( reference value ) : my_value(value) {}
+ destroyer( const destroyer& ) = delete;
+ destroyer& operator=( const destroyer& ) = delete;
+ ~destroyer() {my_value.~T();}
+ }; // class destroyer
+
+ void copy_item( padded_page& dst, size_type dindex, const padded_page& src, size_type sindex,
+ item_constructor_type construct_item )
+ {
+ auto& src_item = src[sindex];
+ construct_item( &dst[dindex], static_cast<const void*>(&src_item) );
+ }
+
+ void assign_and_destroy_item( void* dst, padded_page& src, size_type index ) {
+ auto& from = src[index];
+ destroyer d(from);
+ *static_cast<T*>(dst) = std::move(from);
+ }
+
+ void spin_wait_until_my_turn( std::atomic<ticket_type>& counter, ticket_type k, queue_rep_type& rb ) const {
+ for (atomic_backoff b(true);; b.pause()) {
+ ticket_type c = counter;
+ if (c == k) return;
+ else if (c & 1) {
+ ++rb.n_invalid_entries;
+ throw_exception( exception_id::bad_last_alloc);
+ }
+ }
+ }
+
+ std::atomic<padded_page*> head_page{};
+ std::atomic<ticket_type> head_counter{};
+
+ std::atomic<padded_page*> tail_page{};
+ std::atomic<ticket_type> tail_counter{};
+
+ spin_mutex page_mutex{};
+}; // class micro_queue
+
+#if _MSC_VER && !defined(__INTEL_COMPILER)
+#pragma warning( pop )
+#endif // warning 4146 is back
+
+template <typename Container, typename T, typename Allocator>
+class micro_queue_pop_finalizer {
+public:
+ using padded_page = typename Container::padded_page;
+ using allocator_type = Allocator;
+ using allocator_traits_type = tbb::detail::allocator_traits<allocator_type>;
+
+ micro_queue_pop_finalizer( Container& queue, Allocator& alloc, ticket_type k, padded_page* p ) :
+ my_ticket_type(k), my_queue(queue), my_page(p), allocator(alloc)
+ {}
+
+ micro_queue_pop_finalizer( const micro_queue_pop_finalizer& ) = delete;
+ micro_queue_pop_finalizer& operator=( const micro_queue_pop_finalizer& ) = delete;
+
+ ~micro_queue_pop_finalizer() {
+ padded_page* p = my_page;
+ if( is_valid_page(p) ) {
+ spin_mutex::scoped_lock lock( my_queue.page_mutex );
+ padded_page* q = p->next;
+ my_queue.head_page.store(q, std::memory_order_relaxed);
+ if( !is_valid_page(q) ) {
+ my_queue.tail_page.store(nullptr, std::memory_order_relaxed);
+ }
+ }
+ my_queue.head_counter.store(my_ticket_type, std::memory_order_relaxed);
+ if ( is_valid_page(p) ) {
+ allocator_traits_type::destroy(allocator, static_cast<padded_page*>(p));
+ allocator_traits_type::deallocate(allocator, static_cast<padded_page*>(p), 1);
+ }
+ }
+private:
+ ticket_type my_ticket_type;
+ Container& my_queue;
+ padded_page* my_page;
+ Allocator& allocator;
+}; // class micro_queue_pop_finalizer
+
+#if _MSC_VER && !defined(__INTEL_COMPILER)
+// structure was padded due to alignment specifier
+#pragma warning( push )
+#pragma warning( disable: 4324 )
+#endif
+
+template <typename T, typename Allocator>
+struct concurrent_queue_rep {
+ using self_type = concurrent_queue_rep<T, Allocator>;
+ using size_type = std::size_t;
+ using micro_queue_type = micro_queue<T, Allocator>;
+ using allocator_type = Allocator;
+ using allocator_traits_type = tbb::detail::allocator_traits<allocator_type>;
+ using padded_page = typename micro_queue_type::padded_page;
+ using page_allocator_type = typename micro_queue_type::page_allocator_type;
+ using item_constructor_type = typename micro_queue_type::item_constructor_type;
+private:
+ using page_allocator_traits = tbb::detail::allocator_traits<page_allocator_type>;
+ using queue_allocator_type = typename allocator_traits_type::template rebind_alloc<self_type>;
+
+public:
+ // must be power of 2
+ static constexpr size_type n_queue = 8;
+ // Approximately n_queue/golden ratio
+ static constexpr size_type phi = 3;
+ static constexpr size_type item_size = micro_queue_type::item_size;
+ static constexpr size_type items_per_page = micro_queue_type::items_per_page;
+
+ concurrent_queue_rep( queue_allocator_type& alloc ) : my_queue_allocator(alloc)
+ {}
+
+ concurrent_queue_rep( const concurrent_queue_rep& ) = delete;
+ concurrent_queue_rep& operator=( const concurrent_queue_rep& ) = delete;
+
+ void clear() {
+ page_allocator_type page_allocator(my_queue_allocator);
+ for (size_type i = 0; i < n_queue; ++i) {
+ padded_page* tail_page = array[i].get_tail_page();
+ if( is_valid_page(tail_page) ) {
+ __TBB_ASSERT(array[i].get_head_page() == tail_page, "at most one page should remain" );
+ page_allocator_traits::destroy(page_allocator, static_cast<padded_page*>(tail_page));
+ page_allocator_traits::deallocate(page_allocator, static_cast<padded_page*>(tail_page), 1);
+ array[i].set_tail_page(nullptr);
+ } else {
+ __TBB_ASSERT(!is_valid_page(array[i].get_head_page()), "head page pointer corrupt?");
+ }
+ }
+ }
+
+ void assign( const concurrent_queue_rep& src, item_constructor_type construct_item ) {
+ head_counter.store(src.head_counter.load(std::memory_order_relaxed), std::memory_order_relaxed);
+ tail_counter.store(src.tail_counter.load(std::memory_order_relaxed), std::memory_order_relaxed);
+ n_invalid_entries.store(src.n_invalid_entries.load(std::memory_order_relaxed), std::memory_order_relaxed);
+
+ // copy or move micro_queues
+ size_type queue_idx = 0;
+ try_call( [&] {
+ for (; queue_idx < n_queue; ++queue_idx) {
+ array[queue_idx].assign(src.array[queue_idx], *this, construct_item);
+ }
+ }).on_exception( [&] {
+ for (size_type i = 0; i < queue_idx + 1; ++i) {
+ array[i].clear(*this);
+ }
+ head_counter.store(0, std::memory_order_relaxed);
+ tail_counter.store(0, std::memory_order_relaxed);
+ n_invalid_entries.store(0, std::memory_order_relaxed);
+ });
+
+ __TBB_ASSERT(head_counter.load(std::memory_order_relaxed) == src.head_counter.load(std::memory_order_relaxed) &&
+ tail_counter.load(std::memory_order_relaxed) == src.tail_counter.load(std::memory_order_relaxed),
+ "the source concurrent queue should not be concurrently modified." );
+ }
+
+ bool empty() const {
+ ticket_type tc = tail_counter.load(std::memory_order_acquire);
+ ticket_type hc = head_counter.load(std::memory_order_relaxed);
+        // If tc != tail_counter here, the queue was not empty at some point between the two reads.
+ return tc == tail_counter.load(std::memory_order_relaxed) &&
+ std::ptrdiff_t(tc - hc - n_invalid_entries.load(std::memory_order_relaxed)) <= 0;
+ }
+
+ std::ptrdiff_t size() const {
+ __TBB_ASSERT(sizeof(std::ptrdiff_t) <= sizeof(size_type), NULL);
+ std::ptrdiff_t hc = head_counter.load(std::memory_order_acquire);
+ std::ptrdiff_t tc = tail_counter.load(std::memory_order_relaxed);
+ std::ptrdiff_t nie = n_invalid_entries.load(std::memory_order_relaxed);
+
+ return tc - hc - nie;
+ }
+
+ queue_allocator_type& get_allocator() {
+ return my_queue_allocator;
+ }
+
+ friend class micro_queue<T, Allocator>;
+
+ // Map ticket_type to an array index
+ static size_type index( ticket_type k ) {
+ return k * phi % n_queue;
+ }
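+    // With n_queue == 8 and phi == 3, consecutive tickets map to queues
+    // 0, 3, 6, 1, 4, 7, 2, 5, ... spreading pushes and pops across the micro-queues.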
+
+ micro_queue_type& choose( ticket_type k ) {
+ // The formula here approximates LRU in a cache-oblivious way.
+ return array[index(k)];
+ }
+
+ alignas(max_nfs_size) micro_queue_type array[n_queue];
+
+ alignas(max_nfs_size) std::atomic<ticket_type> head_counter{};
+ alignas(max_nfs_size) std::atomic<ticket_type> tail_counter{};
+ alignas(max_nfs_size) std::atomic<size_type> n_invalid_entries{};
+ queue_allocator_type& my_queue_allocator;
+}; // class concurrent_queue_rep
+
+#if _MSC_VER && !defined(__INTEL_COMPILER)
+#pragma warning( pop )
+#endif
+
+template <typename Value, typename Allocator>
+class concurrent_queue_iterator_base {
+ using queue_rep_type = concurrent_queue_rep<Value, Allocator>;
+ using padded_page = typename queue_rep_type::padded_page;
+protected:
+ concurrent_queue_iterator_base() = default;
+
+ concurrent_queue_iterator_base( const concurrent_queue_iterator_base& other ) {
+ assign(other);
+ }
+
+ concurrent_queue_iterator_base( queue_rep_type* queue_rep )
+ : my_queue_rep(queue_rep),
+ my_head_counter(my_queue_rep->head_counter.load(std::memory_order_relaxed))
+ {
+ for (std::size_t i = 0; i < queue_rep_type::n_queue; ++i) {
+ my_array[i] = my_queue_rep->array[i].get_head_page();
+ }
+
+ if (!get_item(my_item, my_head_counter)) advance();
+ }
+
+ void assign( const concurrent_queue_iterator_base& other ) {
+ my_item = other.my_item;
+ my_queue_rep = other.my_queue_rep;
+
+ if (my_queue_rep != nullptr) {
+ my_head_counter = other.my_head_counter;
+
+ for (std::size_t i = 0; i < queue_rep_type::n_queue; ++i) {
+ my_array[i] = other.my_array[i];
+ }
+ }
+ }
+
+ void advance() {
+ __TBB_ASSERT(my_item, "Attempt to increment iterator past end of the queue");
+ std::size_t k = my_head_counter;
+#if TBB_USE_ASSERT
+ Value* tmp;
+ get_item(tmp, k);
+ __TBB_ASSERT(my_item == tmp, nullptr);
+#endif
+ std::size_t i = modulo_power_of_two(k / queue_rep_type::n_queue, my_queue_rep->items_per_page);
+ if (i == my_queue_rep->items_per_page - 1) {
+ padded_page*& root = my_array[queue_rep_type::index(k)];
+ root = root->next;
+ }
+ // Advance k
+ my_head_counter = ++k;
+ if (!get_item(my_item, k)) advance();
+ }
+
+ concurrent_queue_iterator_base& operator=( const concurrent_queue_iterator_base& other ) {
+ this->assign(other);
+ return *this;
+ }
+
+ bool get_item( Value*& item, std::size_t k ) {
+ if (k == my_queue_rep->tail_counter.load(std::memory_order_relaxed)) {
+ item = nullptr;
+ return true;
+ } else {
+ padded_page* p = my_array[queue_rep_type::index(k)];
+ __TBB_ASSERT(p, nullptr);
+ std::size_t i = modulo_power_of_two(k / queue_rep_type::n_queue, my_queue_rep->items_per_page);
+ item = &(*p)[i];
+ return (p->mask & uintptr_t(1) << i) != 0;
+ }
+ }
+
+ Value* my_item{ nullptr };
+ queue_rep_type* my_queue_rep{ nullptr };
+ ticket_type my_head_counter{};
+ padded_page* my_array[queue_rep_type::n_queue];
+}; // class concurrent_queue_iterator_base
+
+struct concurrent_queue_iterator_provider {
+ template <typename Iterator, typename Container>
+ static Iterator get( const Container& container ) {
+ return Iterator(container);
+ }
+}; // struct concurrent_queue_iterator_provider
+
+template <typename Container, typename Value, typename Allocator>
+class concurrent_queue_iterator : public concurrent_queue_iterator_base<typename std::remove_cv<Value>::type, Allocator> {
+ using base_type = concurrent_queue_iterator_base<typename std::remove_cv<Value>::type, Allocator>;
+public:
+ using value_type = Value;
+ using pointer = value_type*;
+ using reference = value_type&;
+ using difference_type = std::ptrdiff_t;
+ using iterator_category = std::forward_iterator_tag;
+
+ concurrent_queue_iterator() = default;
+
+ /** If Value==Container::value_type, then this routine is the copy constructor.
+ If Value==const Container::value_type, then this routine is a conversion constructor. */
+ concurrent_queue_iterator( const concurrent_queue_iterator<Container, typename Container::value_type, Allocator>& other )
+ : base_type(other) {}
+
+private:
+ concurrent_queue_iterator( const Container& container )
+ : base_type(container.my_queue_representation) {}
+public:
+ concurrent_queue_iterator& operator=( const concurrent_queue_iterator<Container, typename Container::value_type, Allocator>& other ) {
+ this->assign(other);
+ return *this;
+ }
+
+ reference operator*() const {
+ return *static_cast<pointer>(this->my_item);
+ }
+
+ pointer operator->() const { return &operator*(); }
+
+ concurrent_queue_iterator& operator++() {
+ this->advance();
+ return *this;
+ }
+
+ concurrent_queue_iterator operator++(int) {
+ concurrent_queue_iterator tmp = *this;
+ ++*this;
+ return tmp;
+ }
+
+ friend bool operator==( const concurrent_queue_iterator& lhs, const concurrent_queue_iterator& rhs ) {
+ return lhs.my_item == rhs.my_item;
+ }
+
+ friend bool operator!=( const concurrent_queue_iterator& lhs, const concurrent_queue_iterator& rhs ) {
+ return lhs.my_item != rhs.my_item;
+ }
+private:
+ friend struct concurrent_queue_iterator_provider;
+}; // class concurrent_queue_iterator
+
+} // namespace d1
+} // namespace detail
+} // namespace tbb

+
+#endif // __TBB_detail__concurrent_queue_base_H
diff --git a/contrib/libs/tbb/include/oneapi/tbb/detail/_concurrent_skip_list.h b/contrib/libs/tbb/include/oneapi/tbb/detail/_concurrent_skip_list.h
new file mode 100644
index 0000000000..c4d4c627e0
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/detail/_concurrent_skip_list.h
@@ -0,0 +1,1252 @@
+/*
+ Copyright (c) 2019-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_detail__concurrent_skip_list_H
+#define __TBB_detail__concurrent_skip_list_H
+
+#if !defined(__TBB_concurrent_map_H) && !defined(__TBB_concurrent_set_H)
+#error Do not #include this internal file directly; use public TBB headers instead.
+#endif
+
+#include "_config.h"
+#include "_range_common.h"
+#include "_allocator_traits.h"
+#include "_template_helpers.h"
+#include "_node_handle.h"
+#include "_containers_helpers.h"
+#include "_assert.h"
+#include "_exception.h"
+#include "../enumerable_thread_specific.h"
+#include <utility>
+#include <initializer_list>
+#include <atomic>
+#include <array>
+#include <type_traits>
+#include <random> // Need std::minstd_rand
+#include <algorithm> // Need std::equal and std::lexicographical_compare
+#include <cstdint>
+#include <ctime> // Need time() to seed the level generator
+#if __TBB_CPP20_COMPARISONS_PRESENT
+#include <compare>
+#endif
+
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+#pragma warning(push)
+#pragma warning(disable: 4127) // warning C4127: conditional expression is constant
+#endif
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+template <typename Value, typename Allocator>
+class skip_list_node {
+ using node_ptr = skip_list_node*;
+public:
+ using value_type = Value;
+ using atomic_node_ptr = std::atomic<node_ptr>;
+ using size_type = std::size_t;
+ using container_allocator_type = Allocator;
+
+ using reference = value_type&;
+ using const_reference = const value_type&;
+private:
+ using allocator_traits = tbb::detail::allocator_traits<container_allocator_type>;
+
+ // The allocator here is the same as the container's node allocator, so it allocates raw bytes (std::uint8_t).
+ // It has to be rebound to value_type to obtain the correct pointer and const_pointer types.
+ using value_allocator_traits = typename allocator_traits::template rebind_traits<value_type>;
+public:
+ using pointer = typename value_allocator_traits::pointer;
+ using const_pointer = typename value_allocator_traits::const_pointer;
+
+ skip_list_node( size_type levels, container_allocator_type& alloc )
+ : my_container_allocator(alloc), my_height(levels), my_index_number(0)
+ {
+ for (size_type l = 0; l < my_height; ++l) {
+ allocator_traits::construct(my_container_allocator, &get_atomic_next(l), nullptr);
+ }
+ }
+
+ ~skip_list_node() {
+ for (size_type l = 0; l < my_height; ++l) {
+ allocator_traits::destroy(my_container_allocator, &get_atomic_next(l));
+ }
+ }
+
+ skip_list_node( const skip_list_node& ) = delete;
+ skip_list_node( skip_list_node&& ) = delete;
+ skip_list_node& operator=( const skip_list_node& ) = delete;
+ skip_list_node& operator=( skip_list_node&& ) = delete;
+
+ pointer storage() {
+ return &my_value;
+ }
+
+ reference value() {
+ return *storage();
+ }
+
+ node_ptr next( size_type level ) const {
+ node_ptr res = get_atomic_next(level).load(std::memory_order_acquire);
+ __TBB_ASSERT(res == nullptr || res->height() > level, "Broken internal structure");
+ return res;
+ }
+
+ atomic_node_ptr& atomic_next( size_type level ) {
+ atomic_node_ptr& res = get_atomic_next(level);
+#if TBB_USE_DEBUG
+ node_ptr node = res.load(std::memory_order_acquire);
+ __TBB_ASSERT(node == nullptr || node->height() > level, "Broken internal structure");
+#endif
+ return res;
+ }
+
+ void set_next( size_type level, node_ptr n ) {
+ __TBB_ASSERT(n == nullptr || n->height() > level, "Broken internal structure");
+ get_atomic_next(level).store(n, std::memory_order_relaxed);
+ }
+
+ size_type height() const {
+ return my_height;
+ }
+
+ void set_index_number( size_type index_num ) {
+ my_index_number = index_num;
+ }
+
+ size_type index_number() const {
+ return my_index_number;
+ }
+
+private:
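+ // The per-level atomic next pointers are not stored as a data member: they live in the
+ // memory immediately following this object. create_node() allocates
+ // sizeof(skip_list_node) + height * sizeof(atomic_node_ptr) raw bytes (see calc_node_size),
+ // so `this + 1` points to the beginning of that array.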
+ atomic_node_ptr& get_atomic_next( size_type level ) {
+ atomic_node_ptr* arr = reinterpret_cast<atomic_node_ptr*>(this + 1);
+ return arr[level];
+ }
+
+ const atomic_node_ptr& get_atomic_next( size_type level ) const {
+ const atomic_node_ptr* arr = reinterpret_cast<const atomic_node_ptr*>(this + 1);
+ return arr[level];
+ }
+
+ container_allocator_type& my_container_allocator;
+ union {
+ value_type my_value;
+ };
+ size_type my_height;
+ size_type my_index_number;
+}; // class skip_list_node
+
+template <typename NodeType, typename ValueType>
+class skip_list_iterator {
+ using node_type = NodeType;
+ using node_ptr = node_type*;
+public:
+ using iterator_category = std::forward_iterator_tag;
+ using value_type = ValueType;
+
+ using difference_type = std::ptrdiff_t;
+ using pointer = value_type*;
+ using reference = value_type&;
+
+ skip_list_iterator() : skip_list_iterator(nullptr) {}
+
+ skip_list_iterator( const skip_list_iterator<node_type, typename node_type::value_type>& other )
+ : my_node_ptr(other.my_node_ptr) {}
+
+ skip_list_iterator& operator=( const skip_list_iterator<node_type, typename node_type::value_type>& other ) {
+ my_node_ptr = other.my_node_ptr;
+ return *this;
+ }
+
+ reference operator*() const { return my_node_ptr->value(); }
+ pointer operator->() const { return my_node_ptr->storage(); }
+
+ skip_list_iterator& operator++() {
+ __TBB_ASSERT(my_node_ptr != nullptr, nullptr);
+ my_node_ptr = my_node_ptr->next(0);
+ return *this;
+ }
+
+ skip_list_iterator operator++(int) {
+ skip_list_iterator tmp = *this;
+ ++*this;
+ return tmp;
+ }
+
+private:
+ skip_list_iterator(node_type* n) : my_node_ptr(n) {}
+
+ node_ptr my_node_ptr;
+
+ template <typename Traits>
+ friend class concurrent_skip_list;
+
+ template <typename N, typename V>
+ friend class skip_list_iterator;
+
+ friend class const_range;
+ friend class range;
+
+ friend bool operator==( const skip_list_iterator& lhs, const skip_list_iterator& rhs ) {
+ return lhs.my_node_ptr == rhs.my_node_ptr;
+ }
+
+ friend bool operator!=( const skip_list_iterator& lhs, const skip_list_iterator& rhs ) {
+ return lhs.my_node_ptr != rhs.my_node_ptr;
+ }
+}; // class skip_list_iterator
+
+template <typename Traits>
+class concurrent_skip_list {
+protected:
+ using container_traits = Traits;
+ using self_type = concurrent_skip_list<container_traits>;
+ using allocator_type = typename container_traits::allocator_type;
+ using allocator_traits_type = tbb::detail::allocator_traits<allocator_type>;
+ using key_compare = typename container_traits::compare_type;
+ using value_compare = typename container_traits::value_compare;
+ using key_type = typename container_traits::key_type;
+ using value_type = typename container_traits::value_type;
+ static_assert(std::is_same<value_type, typename allocator_type::value_type>::value,
+ "value_type of the container should be the same as its allocator");
+
+ using size_type = std::size_t;
+ using difference_type = std::ptrdiff_t;
+
+ static constexpr size_type max_level = container_traits::max_level;
+
+ using node_allocator_type = typename allocator_traits_type::template rebind_alloc<std::uint8_t>;
+ using node_allocator_traits = tbb::detail::allocator_traits<node_allocator_type>;
+
+ using list_node_type = skip_list_node<value_type, node_allocator_type>;
+ using node_type = node_handle<key_type, value_type, list_node_type, allocator_type>;
+
+ using iterator = skip_list_iterator<list_node_type, value_type>;
+ using const_iterator = skip_list_iterator<list_node_type, const value_type>;
+
+ using reference = value_type&;
+ using const_reference = const value_type&;
+ using pointer = typename allocator_traits_type::pointer;
+ using const_pointer = typename allocator_traits_type::const_pointer;
+
+ using random_level_generator_type = typename container_traits::random_level_generator_type;
+
+ using node_ptr = list_node_type*;
+
+ using array_type = std::array<node_ptr, max_level>;
+private:
+ template <typename T>
+ using is_transparent = dependent_bool<comp_is_transparent<key_compare>, T>;
+public:
+ static constexpr bool allow_multimapping = container_traits::allow_multimapping;
+
+ concurrent_skip_list() : my_head_ptr(nullptr), my_size(0), my_max_height(0) {}
+
+ explicit concurrent_skip_list( const key_compare& comp, const allocator_type& alloc = allocator_type() )
+ : my_node_allocator(alloc), my_compare(comp), my_head_ptr(nullptr), my_size(0), my_max_height(0) {}
+
+ explicit concurrent_skip_list( const allocator_type& alloc )
+ : concurrent_skip_list(key_compare(), alloc) {}
+
+ template<typename InputIterator>
+ concurrent_skip_list( InputIterator first, InputIterator last, const key_compare& comp = key_compare(),
+ const allocator_type& alloc = allocator_type() )
+ : concurrent_skip_list(comp, alloc)
+ {
+ internal_copy(first, last);
+ }
+
+ template <typename InputIterator>
+ concurrent_skip_list( InputIterator first, InputIterator last, const allocator_type& alloc )
+ : concurrent_skip_list(first, last, key_compare(), alloc) {}
+
+ concurrent_skip_list( std::initializer_list<value_type> init, const key_compare& comp = key_compare(),
+ const allocator_type& alloc = allocator_type() )
+ : concurrent_skip_list(init.begin(), init.end(), comp, alloc) {}
+
+ concurrent_skip_list( std::initializer_list<value_type> init, const allocator_type& alloc )
+ : concurrent_skip_list(init, key_compare(), alloc) {}
+
+ concurrent_skip_list( const concurrent_skip_list& other )
+ : my_node_allocator(node_allocator_traits::select_on_container_copy_construction(other.get_allocator())),
+ my_compare(other.my_compare), my_rng(other.my_rng), my_head_ptr(nullptr),
+ my_size(0), my_max_height(0)
+ {
+ internal_copy(other);
+ __TBB_ASSERT(my_size == other.my_size, "Wrong size of copy-constructed container");
+ }
+
+ concurrent_skip_list( const concurrent_skip_list& other, const allocator_type& alloc )
+ : my_node_allocator(alloc), my_compare(other.my_compare), my_rng(other.my_rng), my_head_ptr(nullptr),
+ my_size(0), my_max_height(0)
+ {
+ internal_copy(other);
+ __TBB_ASSERT(my_size == other.my_size, "Wrong size of copy-constructed container");
+ }
+
+ concurrent_skip_list( concurrent_skip_list&& other )
+ : my_node_allocator(std::move(other.my_node_allocator)), my_compare(other.my_compare),
+ my_rng(std::move(other.my_rng)), my_head_ptr(nullptr) // my_head_ptr will be set inside internal_move
+ {
+ internal_move(std::move(other));
+ }
+
+ concurrent_skip_list( concurrent_skip_list&& other, const allocator_type& alloc )
+ : my_node_allocator(alloc), my_compare(other.my_compare),
+ my_rng(std::move(other.my_rng)), my_head_ptr(nullptr)
+ {
+ using is_always_equal = typename allocator_traits_type::is_always_equal;
+ internal_move_construct_with_allocator(std::move(other), is_always_equal());
+ }
+
+ ~concurrent_skip_list() {
+ clear();
+ node_ptr head = my_head_ptr.load(std::memory_order_relaxed);
+ if (head != nullptr) {
+ delete_node(head);
+ }
+ }
+
+ concurrent_skip_list& operator=( const concurrent_skip_list& other ) {
+ if (this != &other) {
+ clear();
+ copy_assign_allocators(my_node_allocator, other.my_node_allocator);
+ my_compare = other.my_compare;
+ my_rng = other.my_rng;
+ internal_copy(other);
+ }
+ return *this;
+ }
+
+ concurrent_skip_list& operator=( concurrent_skip_list&& other ) {
+ if (this != &other) {
+ clear();
+ my_compare = std::move(other.my_compare);
+ my_rng = std::move(other.my_rng);
+
+ move_assign_allocators(my_node_allocator, other.my_node_allocator);
+ using pocma_type = typename node_allocator_traits::propagate_on_container_move_assignment;
+ using is_always_equal = typename node_allocator_traits::is_always_equal;
+ internal_move_assign(std::move(other), tbb::detail::disjunction<pocma_type, is_always_equal>());
+ }
+ return *this;
+ }
+
+ concurrent_skip_list& operator=( std::initializer_list<value_type> il )
+ {
+ clear();
+ insert(il.begin(),il.end());
+ return *this;
+ }
+
+ std::pair<iterator, bool> insert( const value_type& value ) {
+ return internal_insert(value);
+ }
+
+ std::pair<iterator, bool> insert( value_type&& value ) {
+ return internal_insert(std::move(value));
+ }
+
+ iterator insert( const_iterator, const_reference value ) {
+ // Ignore hint
+ return insert(value).first;
+ }
+
+ iterator insert( const_iterator, value_type&& value ) {
+ // Ignore hint
+ return insert(std::move(value)).first;
+ }
+
+ template<typename InputIterator>
+ void insert( InputIterator first, InputIterator last ) {
+ while (first != last) {
+ insert(*first);
+ ++first;
+ }
+ }
+
+ void insert( std::initializer_list<value_type> init ) {
+ insert(init.begin(), init.end());
+ }
+
+ std::pair<iterator, bool> insert( node_type&& nh ) {
+ if (!nh.empty()) {
+ auto insert_node = node_handle_accessor::get_node_ptr(nh);
+ std::pair<iterator, bool> insert_result = internal_insert_node(insert_node);
+ if (insert_result.second) {
+ node_handle_accessor::deactivate(nh);
+ }
+ return insert_result;
+ }
+ return std::pair<iterator, bool>(end(), false);
+ }
+
+ iterator insert( const_iterator, node_type&& nh ) {
+ // Ignore hint
+ return insert(std::move(nh)).first;
+ }
+
+ template<typename... Args>
+ std::pair<iterator, bool> emplace( Args&&... args ) {
+ return internal_insert(std::forward<Args>(args)...);
+ }
+
+ template<typename... Args>
+ iterator emplace_hint( const_iterator, Args&&... args ) {
+ // Ignore hint
+ return emplace(std::forward<Args>(args)...).first;
+ }
+
+ iterator unsafe_erase( iterator pos ) {
+ std::pair<node_ptr, node_ptr> extract_result = internal_extract(pos);
+ if (extract_result.first) { // node was extracted
+ delete_value_node(extract_result.first);
+ return extract_result.second;
+ }
+ return end();
+ }
+
+ iterator unsafe_erase( const_iterator pos ) {
+ return unsafe_erase(get_iterator(pos));
+ }
+
+ iterator unsafe_erase( const_iterator first, const_iterator last ) {
+ while (first != last) {
+ // Unsafe erase returns the iterator which follows the erased one
+ first = unsafe_erase(first);
+ }
+ return get_iterator(first);
+ }
+
+ size_type unsafe_erase( const key_type& key ) {
+ return internal_erase(key);
+ }
+
+ template <typename K>
+ typename std::enable_if<is_transparent<K>::value
+ && !std::is_convertible<K, const_iterator>::value
+ && !std::is_convertible<K, iterator>::value,
+ size_type>::type unsafe_erase( const K& key )
+ {
+ return internal_erase(key);
+ }
+
+ node_type unsafe_extract( const_iterator pos ) {
+ std::pair<node_ptr, node_ptr> extract_result = internal_extract(pos);
+ return extract_result.first ? node_handle_accessor::construct<node_type>(extract_result.first) : node_type();
+ }
+
+ node_type unsafe_extract( iterator pos ) {
+ return unsafe_extract(const_iterator(pos));
+ }
+
+ node_type unsafe_extract( const key_type& key ) {
+ return unsafe_extract(find(key));
+ }
+
+ template <typename K>
+ typename std::enable_if<is_transparent<K>::value
+ && !std::is_convertible<K, const_iterator>::value
+ && !std::is_convertible<K, iterator>::value,
+ node_type>::type unsafe_extract( const K& key )
+ {
+ return unsafe_extract(find(key));
+ }
+
+ iterator lower_bound( const key_type& key ) {
+ return iterator(internal_get_bound(key, my_compare));
+ }
+
+ const_iterator lower_bound( const key_type& key ) const {
+ return const_iterator(internal_get_bound(key, my_compare));
+ }
+
+ template <typename K>
+ typename std::enable_if<is_transparent<K>::value, iterator>::type lower_bound( const K& key ) {
+ return iterator(internal_get_bound(key, my_compare));
+ }
+
+ template <typename K>
+ typename std::enable_if<is_transparent<K>::value, const_iterator>::type lower_bound( const K& key ) const {
+ return const_iterator(internal_get_bound(key, my_compare));
+ }
+
+ iterator upper_bound( const key_type& key ) {
+ return iterator(internal_get_bound(key, not_greater_compare(my_compare)));
+ }
+
+ const_iterator upper_bound( const key_type& key ) const {
+ return const_iterator(internal_get_bound(key, not_greater_compare(my_compare)));
+ }
+
+ template <typename K>
+ typename std::enable_if<is_transparent<K>::value, iterator>::type upper_bound( const K& key ) {
+ return iterator(internal_get_bound(key, not_greater_compare(my_compare)));
+ }
+
+ template <typename K>
+ typename std::enable_if<is_transparent<K>::value, const_iterator>::type upper_bound( const K& key ) const {
+ return const_iterator(internal_get_bound(key, not_greater_compare(my_compare)));
+ }
+
+ iterator find( const key_type& key ) {
+ return iterator(internal_find(key));
+ }
+
+ const_iterator find( const key_type& key ) const {
+ return const_iterator(internal_find(key));
+ }
+
+ template <typename K>
+ typename std::enable_if<is_transparent<K>::value, iterator>::type find( const K& key ) {
+ return iterator(internal_find(key));
+ }
+
+ template <typename K>
+ typename std::enable_if<is_transparent<K>::value, const_iterator>::type find( const K& key ) const {
+ return const_iterator(internal_find(key));
+ }
+
+ size_type count( const key_type& key ) const {
+ return internal_count(key);
+ }
+
+ template <typename K>
+ typename std::enable_if<is_transparent<K>::value, size_type>::type count( const K& key ) const {
+ return internal_count(key);
+ }
+
+ bool contains( const key_type& key ) const {
+ return find(key) != end();
+ }
+
+ template <typename K>
+ typename std::enable_if<is_transparent<K>::value, bool>::type contains( const K& key ) const {
+ return find(key) != end();
+ }
+
+ void clear() noexcept {
+ // clear() is not thread-safe, so a relaxed load is sufficient
+ node_ptr head = my_head_ptr.load(std::memory_order_relaxed);
+
+ if (head == nullptr) return; // Head is not allocated => container is empty
+
+ node_ptr current = head->next(0);
+
+ // Delete all value nodes in the container
+ while (current) {
+ node_ptr next = current->next(0);
+ delete_value_node(current);
+ current = next;
+ }
+
+ for (size_type level = 0; level < head->height(); ++level) {
+ head->set_next(level, nullptr);
+ }
+
+ my_size.store(0, std::memory_order_relaxed);
+ my_max_height.store(0, std::memory_order_relaxed);
+ }
+
+ iterator begin() {
+ return iterator(internal_begin());
+ }
+
+ const_iterator begin() const {
+ return const_iterator(internal_begin());
+ }
+
+ const_iterator cbegin() const {
+ return const_iterator(internal_begin());
+ }
+
+ iterator end() {
+ return iterator(nullptr);
+ }
+
+ const_iterator end() const {
+ return const_iterator(nullptr);
+ }
+
+ const_iterator cend() const {
+ return const_iterator(nullptr);
+ }
+
+ size_type size() const {
+ return my_size.load(std::memory_order_relaxed);
+ }
+
+ size_type max_size() const {
+ return node_allocator_traits::max_size(my_node_allocator);
+ }
+
+ __TBB_nodiscard bool empty() const {
+ return 0 == size();
+ }
+
+ allocator_type get_allocator() const {
+ return my_node_allocator;
+ }
+
+ void swap(concurrent_skip_list& other) {
+ if (this != &other) {
+ using pocs_type = typename node_allocator_traits::propagate_on_container_swap;
+ using is_always_equal = typename node_allocator_traits::is_always_equal;
+ internal_swap(other, tbb::detail::disjunction<pocs_type, is_always_equal>());
+ }
+ }
+
+ std::pair<iterator, iterator> equal_range(const key_type& key) {
+ return internal_equal_range(key);
+ }
+
+ std::pair<const_iterator, const_iterator> equal_range(const key_type& key) const {
+ return internal_equal_range(key);
+ }
+
+ template <typename K>
+ typename std::enable_if<is_transparent<K>::value, std::pair<iterator, iterator>>::type equal_range( const K& key ) {
+ return internal_equal_range(key);
+ }
+
+ template <typename K>
+ typename std::enable_if<is_transparent<K>::value, std::pair<const_iterator, const_iterator>>::type equal_range( const K& key ) const {
+ return internal_equal_range(key);
+ }
+
+ key_compare key_comp() const { return my_compare; }
+
+ value_compare value_comp() const { return container_traits::value_comp(my_compare); }
+
+ class const_range_type {
+ public:
+ using size_type = typename concurrent_skip_list::size_type;
+ using value_type = typename concurrent_skip_list::value_type;
+ using iterator = typename concurrent_skip_list::const_iterator;
+
+ bool empty() const {
+ return my_begin.my_node_ptr->next(0) == my_end.my_node_ptr;
+ }
+
+ bool is_divisible() const {
+ return my_level != 0 ? my_begin.my_node_ptr->next(my_level - 1) != my_end.my_node_ptr : false;
+ }
+
+ size_type size() const { return std::distance(my_begin, my_end); }
+
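+ // Splitting constructor: the split point is the successor of r.my_begin at r's top
+ // level (my_level - 1). The new range covers [split point, r.my_end) and r is
+ // truncated to [r.my_begin, split point).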
+ const_range_type( const_range_type& r, split)
+ : my_end(r.my_end) {
+ my_begin = iterator(r.my_begin.my_node_ptr->next(r.my_level - 1));
+ my_level = my_begin.my_node_ptr->height();
+ r.my_end = my_begin;
+ }
+
+ const_range_type( const concurrent_skip_list& l)
+ : my_end(l.end()), my_begin(l.begin()), my_level(my_begin.my_node_ptr->height() ) {}
+
+ iterator begin() const { return my_begin; }
+ iterator end() const { return my_end; }
+ size_type grainsize() const { return 1; }
+
+ private:
+ const_iterator my_end;
+ const_iterator my_begin;
+ size_type my_level;
+ }; // class const_range_type
+
+ class range_type : public const_range_type {
+ public:
+ using iterator = typename concurrent_skip_list::iterator;
+
+ range_type(range_type& r, split) : const_range_type(r, split()) {}
+ range_type(const concurrent_skip_list& l) : const_range_type(l) {}
+
+ iterator begin() const {
+ node_ptr node = const_range_type::begin().my_node_ptr;
+ return iterator(node);
+ }
+
+ iterator end() const {
+ node_ptr node = const_range_type::end().my_node_ptr;
+ return iterator(node);
+ }
+ }; // class range_type
+
+ range_type range() { return range_type(*this); }
+ const_range_type range() const { return const_range_type(*this); }
+
+private:
+ node_ptr internal_begin() const {
+ node_ptr head = get_head();
+ return head == nullptr ? head : head->next(0);
+ }
+
+ void internal_move(concurrent_skip_list&& other) {
+ my_head_ptr.store(other.my_head_ptr.load(std::memory_order_relaxed), std::memory_order_relaxed);
+ other.my_head_ptr.store(nullptr, std::memory_order_relaxed);
+
+ my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed);
+ other.my_size.store(0, std::memory_order_relaxed);
+
+ my_max_height.store(other.my_max_height.load(std::memory_order_relaxed), std::memory_order_relaxed);
+ other.my_max_height.store(0, std::memory_order_relaxed);
+ }
+
+ void internal_move_construct_with_allocator(concurrent_skip_list&& other,
+ /*is_always_equal = */std::true_type) {
+ internal_move(std::move(other));
+ }
+
+ void internal_move_construct_with_allocator(concurrent_skip_list&& other,
+ /*is_always_equal = */std::false_type) {
+ if (my_node_allocator == other.get_allocator()) {
+ internal_move(std::move(other));
+ } else {
+ my_size.store(0, std::memory_order_relaxed);
+ my_max_height.store(other.my_max_height.load(std::memory_order_relaxed), std::memory_order_relaxed);
+ internal_copy(std::make_move_iterator(other.begin()), std::make_move_iterator(other.end()));
+ }
+ }
+
+ static const key_type& get_key( node_ptr n ) {
+ __TBB_ASSERT(n, nullptr);
+ return container_traits::get_key(static_cast<node_ptr>(n)->value());
+ }
+
+ template <typename K>
+ bool found( node_ptr node, const K& key ) const {
+ return node != nullptr && !my_compare(key, get_key(node));
+ }
+
+ template <typename K>
+ node_ptr internal_find(const K& key) const {
+ return allow_multimapping ? internal_find_multi(key) : internal_find_unique(key);
+ }
+
+ template <typename K>
+ node_ptr internal_find_multi( const K& key ) const {
+ node_ptr prev = get_head();
+ if (prev == nullptr) return nullptr; // If the head node is not allocated - exit
+
+ node_ptr curr = nullptr;
+ node_ptr old_curr = curr;
+
+ for (size_type h = my_max_height.load(std::memory_order_acquire); h > 0; --h) {
+ curr = internal_find_position(h - 1, prev, key, my_compare);
+
+ if (curr != old_curr && found(curr, key)) {
+ return curr;
+ }
+ old_curr = curr;
+ }
+ return nullptr;
+ }
+
+ template <typename K>
+ node_ptr internal_find_unique( const K& key ) const {
+ const_iterator it = lower_bound(key);
+ return (it == end() || my_compare(key, container_traits::get_key(*it))) ? nullptr : it.my_node_ptr;
+ }
+
+ template <typename K>
+ size_type internal_count( const K& key ) const {
+ if (allow_multimapping) {
+ // TODO: reimplement without double traversal
+ std::pair<const_iterator, const_iterator> r = equal_range(key);
+ return std::distance(r.first, r.second);
+ }
+ return size_type(contains(key) ? 1 : 0);
+ }
+
+ template <typename K>
+ std::pair<iterator, iterator> internal_equal_range(const K& key) const {
+ iterator lb = get_iterator(lower_bound(key));
+ auto result = std::make_pair(lb, lb);
+
+ // If the lower bound points to the node with the requested key
+ if (found(lb.my_node_ptr, key)) {
+
+ if (!allow_multimapping) {
+ // For unique containers - move the second iterator forward and exit
+ ++result.second;
+ } else {
+ // For multi containers - find the upper bound starting from the lower bound
+ node_ptr prev = lb.my_node_ptr;
+ node_ptr curr = nullptr;
+ not_greater_compare cmp(my_compare);
+
+ // Start from the lower bound of the range
+ for (size_type h = prev->height(); h > 0; --h) {
+ curr = prev->next(h - 1);
+ while (curr && cmp(get_key(curr), key)) {
+ prev = curr;
+ // If the height of the next node is greater than the current one - jump to its height
+ if (h < curr->height()) {
+ h = curr->height();
+ }
+ curr = prev->next(h - 1);
+ }
+ }
+ result.second = iterator(curr);
+ }
+ }
+
+ return result;
+ }
+
+ // Finds position on the level using comparator cmp starting from the node prev
+ template <typename K, typename Comparator>
+ node_ptr internal_find_position( size_type level, node_ptr& prev, const K& key,
+ const Comparator& cmp ) const {
+ __TBB_ASSERT(level < prev->height(), "Wrong level to find position");
+ node_ptr curr = prev->next(level);
+
+ while (curr && cmp(get_key(curr), key)) {
+ prev = curr;
+ __TBB_ASSERT(level < prev->height(), nullptr);
+ curr = prev->next(level);
+ }
+
+ return curr;
+ }
+
+ // The same as previous overload, but allows index_number comparison
+ template <typename Comparator>
+ node_ptr internal_find_position( size_type level, node_ptr& prev, node_ptr node,
+ const Comparator& cmp ) const {
+ __TBB_ASSERT(level < prev->height(), "Wrong level to find position");
+ node_ptr curr = prev->next(level);
+
+ while (curr && cmp(get_key(curr), get_key(node))) {
+ if (allow_multimapping && cmp(get_key(node), get_key(curr)) && curr->index_number() > node->index_number()) {
+ break;
+ }
+
+ prev = curr;
+ __TBB_ASSERT(level < prev->height(), nullptr);
+ curr = prev->next(level);
+ }
+ return curr;
+ }
+
+ template <typename Comparator>
+ void fill_prev_curr_arrays(array_type& prev_nodes, array_type& curr_nodes, node_ptr node, const key_type& key,
+ const Comparator& cmp, node_ptr head ) {
+
+ size_type curr_max_height = my_max_height.load(std::memory_order_acquire);
+ size_type node_height = node->height();
+ if (curr_max_height < node_height) {
+ std::fill(prev_nodes.begin() + curr_max_height, prev_nodes.begin() + node_height, head);
+ std::fill(curr_nodes.begin() + curr_max_height, curr_nodes.begin() + node_height, nullptr);
+ }
+
+ node_ptr prev = head;
+ for (size_type level = curr_max_height; level > 0; --level) {
+ node_ptr curr = internal_find_position(level - 1, prev, key, cmp);
+ prev_nodes[level - 1] = prev;
+ curr_nodes[level - 1] = curr;
+ }
+ }
+
+ void fill_prev_array_for_existing_node( array_type& prev_nodes, node_ptr node ) {
+ node_ptr head = create_head_if_necessary();
+ prev_nodes.fill(head);
+
+ node_ptr prev = head;
+ for (size_type level = node->height(); level > 0; --level) {
+ while (prev->next(level - 1) != node) {
+ prev = prev->next(level - 1);
+ }
+ prev_nodes[level - 1] = prev;
+ }
+ }
+
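+ // Adapter that turns the strict "less" comparator into a "not greater" (<=) predicate:
+ // operator()(a, b) returns !(b < a). It is used for upper_bound lookups and, in
+ // multi-containers, to keep traversal going past equal keys.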
+ struct not_greater_compare {
+ const key_compare& my_less_compare;
+
+ not_greater_compare( const key_compare& less_compare ) : my_less_compare(less_compare) {}
+
+ template <typename K1, typename K2>
+ bool operator()( const K1& first, const K2& second ) const {
+ return !my_less_compare(second, first);
+ }
+ };
+
+ not_greater_compare select_comparator( /*allow_multimapping = */ std::true_type ) {
+ return not_greater_compare(my_compare);
+ }
+
+ key_compare select_comparator( /*allow_multimapping = */ std::false_type ) {
+ return my_compare;
+ }
+
+ template<typename... Args>
+ std::pair<iterator, bool> internal_insert( Args&&... args ) {
+ node_ptr new_node = create_value_node(std::forward<Args>(args)...);
+ std::pair<iterator, bool> insert_result = internal_insert_node(new_node);
+ if (!insert_result.second) {
+ delete_value_node(new_node);
+ }
+ return insert_result;
+ }
+
+ std::pair<iterator, bool> internal_insert_node( node_ptr new_node ) {
+ array_type prev_nodes;
+ array_type curr_nodes;
+ size_type new_height = new_node->height();
+ auto compare = select_comparator(std::integral_constant<bool, allow_multimapping>{});
+
+ node_ptr head_node = create_head_if_necessary();
+
+ for (;;) {
+ fill_prev_curr_arrays(prev_nodes, curr_nodes, new_node, get_key(new_node), compare, head_node);
+
+ node_ptr prev = prev_nodes[0];
+ node_ptr next = curr_nodes[0];
+
+ if (allow_multimapping) {
+ new_node->set_index_number(prev->index_number() + 1);
+ } else {
+ if (found(next, get_key(new_node))) {
+ return std::pair<iterator, bool>(iterator(next), false);
+ }
+ }
+
+ new_node->set_next(0, next);
+ if (!prev->atomic_next(0).compare_exchange_strong(next, new_node)) {
+ continue;
+ }
+
+ // If the node was successfully linked on the first level - it will be linked on other levels
+ // Insertion cannot fail starting from this point
+
+ // If the height of inserted node is greater than maximum - increase maximum
+ size_type max_height = my_max_height.load(std::memory_order_acquire);
+ for (;;) {
+ if (new_height <= max_height || my_max_height.compare_exchange_strong(max_height, new_height)) {
+ // Stop once the maximum has been successfully updated by the current thread,
+ // or another thread has already set it to a value greater than or equal to new_height
+ break;
+ }
+ }
+
+ for (std::size_t level = 1; level < new_height; ++level) {
+ // Link the node on upper levels
+ for (;;) {
+ prev = prev_nodes[level];
+ next = static_cast<node_ptr>(curr_nodes[level]);
+
+ new_node->set_next(level, next);
+ __TBB_ASSERT(new_node->height() > level, "Broken internal structure");
+ if (prev->atomic_next(level).compare_exchange_strong(next, new_node)) {
+ break;
+ }
+
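+ // The CAS failed: another thread modified the predecessor's link. Recompute the
+ // prev/curr positions for the remaining upper levels and retry linking at this level.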
+ for (size_type lev = level; lev != new_height; ++lev ) {
+ curr_nodes[lev] = internal_find_position(lev, prev_nodes[lev], new_node, compare);
+ }
+ }
+ }
+ ++my_size;
+ return std::pair<iterator, bool>(iterator(new_node), true);
+ }
+ }
+
+ template <typename K, typename Comparator>
+ node_ptr internal_get_bound( const K& key, const Comparator& cmp ) const {
+ node_ptr prev = get_head();
+ if (prev == nullptr) return nullptr; // If the head node is not allocated - exit
+
+ node_ptr curr = nullptr;
+
+ for (size_type h = my_max_height.load(std::memory_order_acquire); h > 0; --h) {
+ curr = internal_find_position(h - 1, prev, key, cmp);
+ }
+
+ return curr;
+ }
+
+ template <typename K>
+ size_type internal_erase( const K& key ) {
+ auto eq = equal_range(key);
+ size_type old_size = size();
+ unsafe_erase(eq.first, eq.second);
+ return old_size - size();
+ }
+
+ // Returns a node_ptr to the extracted node and a node_ptr to the node that followed it
+ std::pair<node_ptr, node_ptr> internal_extract( const_iterator it ) {
+ std::pair<node_ptr, node_ptr> result(nullptr, nullptr);
+ if ( it != end() ) {
+ array_type prev_nodes;
+
+ node_ptr erase_node = it.my_node_ptr;
+ node_ptr next_node = erase_node->next(0);
+ fill_prev_array_for_existing_node(prev_nodes, erase_node);
+
+ for (size_type level = 0; level < erase_node->height(); ++level) {
+ prev_nodes[level]->set_next(level, erase_node->next(level));
+ erase_node->set_next(level, nullptr);
+ }
+ my_size.fetch_sub(1, std::memory_order_relaxed);
+
+ result.first = erase_node;
+ result.second = next_node;
+ }
+ return result;
+ }
+
+protected:
+ template<typename SourceType>
+ void internal_merge( SourceType&& source ) {
+ using source_type = typename std::decay<SourceType>::type;
+ using source_iterator = typename source_type::iterator;
+ static_assert((std::is_same<node_type, typename source_type::node_type>::value), "Incompatible containers cannot be merged");
+
+ for (source_iterator it = source.begin(); it != source.end();) {
+ source_iterator where = it++;
+ if (allow_multimapping || !contains(container_traits::get_key(*where))) {
+ node_type handle = source.unsafe_extract(where);
+ __TBB_ASSERT(!handle.empty(), "Extracted handle in merge is empty");
+
+ if (!insert(std::move(handle)).second) {
+ // If the insertion fails, return the node to the source container
+ source.insert(std::move(handle));
+ }
+ __TBB_ASSERT(handle.empty(), "Node handle should be empty after the insertion");
+ }
+ }
+ }
+
+private:
+ void internal_copy( const concurrent_skip_list& other ) {
+ internal_copy(other.begin(), other.end());
+ }
+
+ template<typename Iterator>
+ void internal_copy( Iterator first, Iterator last ) {
+ try_call([&] {
+ for (auto it = first; it != last; ++it) {
+ insert(*it);
+ }
+ }).on_exception([&] {
+ clear();
+ node_ptr head = my_head_ptr.load(std::memory_order_relaxed);
+ if (head != nullptr) {
+ delete_node(head);
+ }
+ });
+ }
+
+ static size_type calc_node_size( size_type height ) {
+ static_assert(alignof(list_node_type) >= alignof(typename list_node_type::atomic_node_ptr), "Incorrect alignment");
+ return sizeof(list_node_type) + height * sizeof(typename list_node_type::atomic_node_ptr);
+ }
+
+ node_ptr create_node( size_type height ) {
+ size_type sz = calc_node_size(height);
+ node_ptr node = reinterpret_cast<node_ptr>(node_allocator_traits::allocate(my_node_allocator, sz));
+ node_allocator_traits::construct(my_node_allocator, node, height, my_node_allocator);
+ return node;
+ }
+
+ template <typename... Args>
+ node_ptr create_value_node( Args&&... args ) {
+ node_ptr node = create_node(my_rng());
+
+ // try_call API is not convenient here due to broken
+ // variadic capture on GCC 4.8.5
+ auto value_guard = make_raii_guard([&] {
+ delete_node(node);
+ });
+
+ // Construct the value inside the node
+ node_allocator_traits::construct(my_node_allocator, node->storage(), std::forward<Args>(args)...);
+ value_guard.dismiss();
+ return node;
+ }
+
+ node_ptr create_head_node() {
+ return create_node(max_level);
+ }
+
+ void delete_node( node_ptr node ) {
+ size_type sz = calc_node_size(node->height());
+
+ // Destroy the node
+ node_allocator_traits::destroy(my_node_allocator, node);
+ // Deallocate the node
+ node_allocator_traits::deallocate(my_node_allocator, reinterpret_cast<std::uint8_t*>(node), sz);
+ }
+
+ void delete_value_node( node_ptr node ) {
+ // Destroy the value inside the node
+ node_allocator_traits::destroy(my_node_allocator, node->storage());
+ delete_node(node);
+ }
+
+ node_ptr get_head() const {
+ return my_head_ptr.load(std::memory_order_acquire);
+ }
+
+ node_ptr create_head_if_necessary() {
+ node_ptr current_head = get_head();
+ if (current_head == nullptr) {
+ // Head node was not created - create it
+ node_ptr new_head = create_head_node();
+ if (my_head_ptr.compare_exchange_strong(current_head, new_head)) {
+ current_head = new_head;
+ } else {
+ // If another thread has already created the head node - destroy new_head
+ // current_head now points to the actual head node
+ delete_node(new_head);
+ }
+ }
+ __TBB_ASSERT(my_head_ptr.load(std::memory_order_relaxed) != nullptr, nullptr);
+ __TBB_ASSERT(current_head != nullptr, nullptr);
+ return current_head;
+ }
+
+ static iterator get_iterator( const_iterator it ) {
+ return iterator(it.my_node_ptr);
+ }
+
+ void internal_move_assign( concurrent_skip_list&& other, /*POCMA || is_always_equal =*/std::true_type ) {
+ internal_move(std::move(other));
+ }
+
+ void internal_move_assign( concurrent_skip_list&& other, /*POCMA || is_always_equal =*/std::false_type ) {
+ if (my_node_allocator == other.my_node_allocator) {
+ internal_move(std::move(other));
+ } else {
+ internal_copy(std::make_move_iterator(other.begin()), std::make_move_iterator(other.end()));
+ }
+ }
+
+ void internal_swap_fields( concurrent_skip_list& other ) {
+ using std::swap;
+ swap_allocators(my_node_allocator, other.my_node_allocator);
+ swap(my_compare, other.my_compare);
+ swap(my_rng, other.my_rng);
+
+ swap_atomics_relaxed(my_head_ptr, other.my_head_ptr);
+ swap_atomics_relaxed(my_size, other.my_size);
+ swap_atomics_relaxed(my_max_height, other.my_max_height);
+ }
+
+ void internal_swap( concurrent_skip_list& other, /*POCMA || is_always_equal =*/std::true_type ) {
+ internal_swap_fields(other);
+ }
+
+ void internal_swap( concurrent_skip_list& other, /*POCMA || is_always_equal =*/std::false_type ) {
+ __TBB_ASSERT(my_node_allocator == other.my_node_allocator, "Swapping with unequal allocators is not allowed");
+ internal_swap_fields(other);
+ }
+
+ node_allocator_type my_node_allocator;
+ key_compare my_compare;
+ random_level_generator_type my_rng;
+ std::atomic<list_node_type*> my_head_ptr;
+ std::atomic<size_type> my_size;
+ std::atomic<size_type> my_max_height;
+
+ template<typename OtherTraits>
+ friend class concurrent_skip_list;
+}; // class concurrent_skip_list
+
+template <typename Traits>
+bool operator==( const concurrent_skip_list<Traits>& lhs, const concurrent_skip_list<Traits>& rhs ) {
+ if (lhs.size() != rhs.size()) return false;
+#if _MSC_VER
+ // Passing "unchecked" iterators to std::equal with 3 parameters
+ // causes compiler warnings.
+ // The workaround is to use the overload with 4 parameters, which is
+ // available since C++14 - the minimum standard version supported on MSVC
+ return std::equal(lhs.begin(), lhs.end(), rhs.begin(), rhs.end());
+#else
+ return std::equal(lhs.begin(), lhs.end(), rhs.begin());
+#endif
+}
+
+#if !__TBB_CPP20_COMPARISONS_PRESENT
+template <typename Traits>
+bool operator!=( const concurrent_skip_list<Traits>& lhs, const concurrent_skip_list<Traits>& rhs ) {
+ return !(lhs == rhs);
+}
+#endif
+
+#if __TBB_CPP20_COMPARISONS_PRESENT && __TBB_CPP20_CONCEPTS_PRESENT
+template <typename Traits>
+tbb::detail::synthesized_three_way_result<typename Traits::value_type>
+operator<=>( const concurrent_skip_list<Traits>& lhs, const concurrent_skip_list<Traits>& rhs ) {
+ return std::lexicographical_compare_three_way(lhs.begin(), lhs.end(),
+ rhs.begin(), rhs.end(),
+ tbb::detail::synthesized_three_way_comparator{});
+}
+#else
+template <typename Traits>
+bool operator<( const concurrent_skip_list<Traits>& lhs, const concurrent_skip_list<Traits>& rhs ) {
+ return std::lexicographical_compare(lhs.begin(), lhs.end(), rhs.begin(), rhs.end());
+}
+
+template <typename Traits>
+bool operator>( const concurrent_skip_list<Traits>& lhs, const concurrent_skip_list<Traits>& rhs ) {
+ return rhs < lhs;
+}
+
+template <typename Traits>
+bool operator<=( const concurrent_skip_list<Traits>& lhs, const concurrent_skip_list<Traits>& rhs ) {
+ return !(rhs < lhs);
+}
+
+template <typename Traits>
+bool operator>=( const concurrent_skip_list<Traits>& lhs, const concurrent_skip_list<Traits>& rhs ) {
+ return !(lhs < rhs);
+}
+#endif // __TBB_CPP20_COMPARISONS_PRESENT && __TBB_CPP20_CONCEPTS_PRESENT
+
+// Generates a number from the interval [0, MaxLevel).
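+// The result is approximately geometrically distributed: a height of h is produced with
+// probability about 2^-h, so roughly half of the nodes receive the smallest height,
+// a quarter the next one, and so on - the classic skip list tower distribution.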
+template <std::size_t MaxLevel>
+class concurrent_geometric_level_generator {
+public:
+ static constexpr std::size_t max_level = MaxLevel;
+ // TODO: modify the algorithm to accept other values of max_level
+ static_assert(max_level == 32, "Incompatible max_level for rng");
+
+ concurrent_geometric_level_generator() : engines(std::minstd_rand::result_type(time(nullptr))) {}
+
+ std::size_t operator()() {
+ // +1 is required to pass at least 1 into log2 (log2(0) is undefined)
+ // -1 is required so that the generator can return 0 (max_level - log2(2^31) - 1)
+ std::size_t result = max_level - std::size_t(tbb::detail::log2(engines.local()() + 1)) - 1;
+ __TBB_ASSERT(result <= max_level, nullptr);
+ return result;
+ }
+
+private:
+ tbb::enumerable_thread_specific<std::minstd_rand> engines;
+};
+
+} // namespace d1
+} // namespace detail
+} // namespace tbb
+
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+#pragma warning(pop) // warning 4127 is back
+#endif
+
+#endif // __TBB_detail__concurrent_skip_list_H
diff --git a/contrib/libs/tbb/include/oneapi/tbb/detail/_concurrent_unordered_base.h b/contrib/libs/tbb/include/oneapi/tbb/detail/_concurrent_unordered_base.h
new file mode 100644
index 0000000000..3abcce2b29
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/detail/_concurrent_unordered_base.h
@@ -0,0 +1,1500 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_detail__concurrent_unordered_base_H
+#define __TBB_detail__concurrent_unordered_base_H
+
+#if !defined(__TBB_concurrent_unordered_map_H) && !defined(__TBB_concurrent_unordered_set_H)
+#error Do not #include this internal file directly; use public TBB headers instead.
+#endif
+
+#include "_range_common.h"
+#include "_containers_helpers.h"
+#include "_segment_table.h"
+#include "_hash_compare.h"
+#include "_allocator_traits.h"
+#include "_node_handle.h"
+#include "_assert.h"
+#include "_utils.h"
+#include "_exception.h"
+#include <iterator>
+#include <utility>
+#include <functional>
+#include <initializer_list>
+#include <atomic>
+#include <type_traits>
+#include <memory>
+#include <algorithm>
+
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+#pragma warning(push)
+#pragma warning(disable: 4127) // warning C4127: conditional expression is constant
+#endif
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+template <typename Traits>
+class concurrent_unordered_base;
+
+template<typename Container, typename Value>
+class solist_iterator {
+private:
+ using node_ptr = typename Container::value_node_ptr;
+ template <typename T, typename Allocator>
+ friend class split_ordered_list;
+ template<typename M, typename V>
+ friend class solist_iterator;
+ template <typename Traits>
+ friend class concurrent_unordered_base;
+ template<typename M, typename T, typename U>
+ friend bool operator==( const solist_iterator<M,T>& i, const solist_iterator<M,U>& j );
+ template<typename M, typename T, typename U>
+ friend bool operator!=( const solist_iterator<M,T>& i, const solist_iterator<M,U>& j );
+public:
+ using value_type = Value;
+ using difference_type = typename Container::difference_type;
+ using pointer = value_type*;
+ using reference = value_type&;
+ using iterator_category = std::forward_iterator_tag;
+
+ solist_iterator() : my_node_ptr(nullptr) {}
+ solist_iterator( const solist_iterator<Container, typename Container::value_type>& other )
+ : my_node_ptr(other.my_node_ptr) {}
+
+ solist_iterator& operator=( const solist_iterator<Container, typename Container::value_type>& other ) {
+ my_node_ptr = other.my_node_ptr;
+ return *this;
+ }
+
+ reference operator*() const {
+ return my_node_ptr->value();
+ }
+
+ pointer operator->() const {
+ return my_node_ptr->storage();
+ }
+
+ solist_iterator& operator++() {
+ auto next_node = my_node_ptr->next();
+ while(next_node && next_node->is_dummy()) {
+ next_node = next_node->next();
+ }
+ my_node_ptr = static_cast<node_ptr>(next_node);
+ return *this;
+ }
+
+ solist_iterator operator++(int) {
+ solist_iterator tmp = *this;
+ ++*this;
+ return tmp;
+ }
+
+private:
+ solist_iterator( node_ptr pnode ) : my_node_ptr(pnode) {}
+
+ node_ptr get_node_ptr() const { return my_node_ptr; }
+
+ node_ptr my_node_ptr;
+};
+
+template<typename Solist, typename T, typename U>
+bool operator==( const solist_iterator<Solist, T>& i, const solist_iterator<Solist, U>& j ) {
+ return i.my_node_ptr == j.my_node_ptr;
+}
+
+template<typename Solist, typename T, typename U>
+bool operator!=( const solist_iterator<Solist, T>& i, const solist_iterator<Solist, U>& j ) {
+ return i.my_node_ptr != j.my_node_ptr;
+}
+
+template <typename SokeyType>
+class list_node {
+public:
+ using node_ptr = list_node*;
+ using sokey_type = SokeyType;
+
+ list_node(sokey_type key) : my_next(nullptr), my_order_key(key) {}
+
+ void init( sokey_type key ) {
+ my_order_key = key;
+ }
+
+ sokey_type order_key() const {
+ return my_order_key;
+ }
+
+ bool is_dummy() {
+ // The least significant bit of the order key is unset for dummy nodes
+ return (my_order_key & 0x1) == 0;
+ }
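+
+ // Dummy nodes act as bucket sentinels in the split-ordered list: their order keys have
+ // the lowest bit cleared, while regular value nodes get keys with the lowest bit set
+ // (see split_order_key_regular used further below), so a dummy never compares equal to
+ // a real element.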
+
+ node_ptr next() const {
+ return my_next.load(std::memory_order_acquire);
+ }
+
+ void set_next( node_ptr next_node ) {
+ my_next.store(next_node, std::memory_order_release);
+ }
+
+ bool try_set_next( node_ptr expected_next, node_ptr new_next ) {
+ return my_next.compare_exchange_strong(expected_next, new_next);
+ }
+
+private:
+ std::atomic<node_ptr> my_next;
+ sokey_type my_order_key;
+}; // class list_node
+
+template <typename ValueType, typename SokeyType>
+class value_node : public list_node<SokeyType>
+{
+public:
+ using base_type = list_node<SokeyType>;
+ using sokey_type = typename base_type::sokey_type;
+ using value_type = ValueType;
+
+ value_node( sokey_type ord_key ) : base_type(ord_key) {}
+ ~value_node() {}
+ value_type* storage() {
+ return reinterpret_cast<value_type*>(&my_value);
+ }
+
+ value_type& value() {
+ return *storage();
+ }
+
+private:
+ using aligned_storage_type = typename std::aligned_storage<sizeof(value_type)>::type;
+ aligned_storage_type my_value;
+}; // class value_node
+
+template <typename Traits>
+class concurrent_unordered_base {
+ using self_type = concurrent_unordered_base<Traits>;
+ using traits_type = Traits;
+ using hash_compare_type = typename traits_type::hash_compare_type;
+ class unordered_segment_table;
+public:
+ using value_type = typename traits_type::value_type;
+ using key_type = typename traits_type::key_type;
+ using allocator_type = typename traits_type::allocator_type;
+
+private:
+ using allocator_traits_type = tbb::detail::allocator_traits<allocator_type>;
+ // TODO: check assert conditions for different C++ standards
+ static_assert(std::is_same<typename allocator_traits_type::value_type, value_type>::value,
+ "value_type of the container must be the same as its allocator");
+ using sokey_type = std::size_t;
+
+public:
+ using size_type = std::size_t;
+ using difference_type = std::ptrdiff_t;
+
+ using iterator = solist_iterator<self_type, value_type>;
+ using const_iterator = solist_iterator<self_type, const value_type>;
+ using local_iterator = iterator;
+ using const_local_iterator = const_iterator;
+
+ using reference = value_type&;
+ using const_reference = const value_type&;
+ using pointer = typename allocator_traits_type::pointer;
+ using const_pointer = typename allocator_traits_type::const_pointer;
+
+ using hasher = typename hash_compare_type::hasher;
+ using key_equal = typename hash_compare_type::key_equal;
+
+private:
+ using list_node_type = list_node<sokey_type>;
+ using value_node_type = value_node<value_type, sokey_type>;
+ using node_ptr = list_node_type*;
+ using value_node_ptr = value_node_type*;
+
+ using value_node_allocator_type = typename allocator_traits_type::template rebind_alloc<value_node_type>;
+ using node_allocator_type = typename allocator_traits_type::template rebind_alloc<list_node_type>;
+
+ using node_allocator_traits = tbb::detail::allocator_traits<node_allocator_type>;
+ using value_node_allocator_traits = tbb::detail::allocator_traits<value_node_allocator_type>;
+
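+ // Rounds the requested bucket count up to the nearest power of two (treating 0 as 1),
+ // e.g. 0 -> 1, 6 -> 8, 8 -> 8, 9 -> 16.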
+ static constexpr size_type round_up_to_power_of_two( size_type bucket_count ) {
+ return size_type(1) << size_type(tbb::detail::log2(uintptr_t(bucket_count == 0 ? 1 : bucket_count) * 2 - 1));
+ }
+
+ template <typename T>
+ using is_transparent = dependent_bool<has_transparent_key_equal<key_type, hasher, key_equal>, T>;
+public:
+ using node_type = node_handle<key_type, value_type, value_node_type, allocator_type>;
+
+ explicit concurrent_unordered_base( size_type bucket_count, const hasher& hash = hasher(),
+ const key_equal& equal = key_equal(), const allocator_type& alloc = allocator_type() )
+ : my_size(0),
+ my_bucket_count(round_up_to_power_of_two(bucket_count)),
+ my_max_load_factor(float(initial_max_load_factor)),
+ my_hash_compare(hash, equal),
+ my_head(sokey_type(0)),
+ my_segments(alloc) {}
+
+ concurrent_unordered_base() : concurrent_unordered_base(initial_bucket_count) {}
+
+ concurrent_unordered_base( size_type bucket_count, const allocator_type& alloc )
+ : concurrent_unordered_base(bucket_count, hasher(), key_equal(), alloc) {}
+
+ concurrent_unordered_base( size_type bucket_count, const hasher& hash, const allocator_type& alloc )
+ : concurrent_unordered_base(bucket_count, hash, key_equal(), alloc) {}
+
+ explicit concurrent_unordered_base( const allocator_type& alloc )
+ : concurrent_unordered_base(initial_bucket_count, hasher(), key_equal(), alloc) {}
+
+ template <typename InputIterator>
+ concurrent_unordered_base( InputIterator first, InputIterator last,
+ size_type bucket_count = initial_bucket_count, const hasher& hash = hasher(),
+ const key_equal& equal = key_equal(), const allocator_type& alloc = allocator_type() )
+ : concurrent_unordered_base(bucket_count, hash, equal, alloc)
+ {
+ insert(first, last);
+ }
+
+ template <typename InputIterator>
+ concurrent_unordered_base( InputIterator first, InputIterator last,
+ size_type bucket_count, const allocator_type& alloc )
+ : concurrent_unordered_base(first, last, bucket_count, hasher(), key_equal(), alloc) {}
+
+ template <typename InputIterator>
+ concurrent_unordered_base( InputIterator first, InputIterator last,
+ size_type bucket_count, const hasher& hash, const allocator_type& alloc )
+ : concurrent_unordered_base(first, last, bucket_count, hash, key_equal(), alloc) {}
+
+ concurrent_unordered_base( const concurrent_unordered_base& other )
+ : my_size(other.my_size.load(std::memory_order_relaxed)),
+ my_bucket_count(other.my_bucket_count.load(std::memory_order_relaxed)),
+ my_max_load_factor(other.my_max_load_factor),
+ my_hash_compare(other.my_hash_compare),
+ my_head(other.my_head.order_key()),
+ my_segments(other.my_segments)
+ {
+ try_call( [&] {
+ internal_copy(other);
+ } ).on_exception( [&] {
+ clear();
+ });
+ }
+
+ concurrent_unordered_base( const concurrent_unordered_base& other, const allocator_type& alloc )
+ : my_size(other.my_size.load(std::memory_order_relaxed)),
+ my_bucket_count(other.my_bucket_count.load(std::memory_order_relaxed)),
+ my_max_load_factor(other.my_max_load_factor),
+ my_hash_compare(other.my_hash_compare),
+ my_head(other.my_head.order_key()),
+ my_segments(other.my_segments, alloc)
+ {
+ try_call( [&] {
+ internal_copy(other);
+ } ).on_exception( [&] {
+ clear();
+ });
+ }
+
+ concurrent_unordered_base( concurrent_unordered_base&& other )
+ : my_size(other.my_size.load(std::memory_order_relaxed)),
+ my_bucket_count(other.my_bucket_count.load(std::memory_order_relaxed)),
+ my_max_load_factor(std::move(other.my_max_load_factor)),
+ my_hash_compare(std::move(other.my_hash_compare)),
+ my_head(other.my_head.order_key()),
+ my_segments(std::move(other.my_segments))
+ {
+ move_content(std::move(other));
+ }
+
+ concurrent_unordered_base( concurrent_unordered_base&& other, const allocator_type& alloc )
+ : my_size(other.my_size.load(std::memory_order_relaxed)),
+ my_bucket_count(other.my_bucket_count.load(std::memory_order_relaxed)),
+ my_max_load_factor(std::move(other.my_max_load_factor)),
+ my_hash_compare(std::move(other.my_hash_compare)),
+ my_head(other.my_head.order_key()),
+ my_segments(std::move(other.my_segments), alloc)
+ {
+ using is_always_equal = typename allocator_traits_type::is_always_equal;
+ internal_move_construct_with_allocator(std::move(other), alloc, is_always_equal());
+ }
+
+ concurrent_unordered_base( std::initializer_list<value_type> init,
+ size_type bucket_count = initial_bucket_count,
+ const hasher& hash = hasher(), const key_equal& equal = key_equal(),
+ const allocator_type& alloc = allocator_type() )
+ : concurrent_unordered_base(init.begin(), init.end(), bucket_count, hash, equal, alloc) {}
+
+ concurrent_unordered_base( std::initializer_list<value_type> init,
+ size_type bucket_count, const allocator_type& alloc )
+ : concurrent_unordered_base(init, bucket_count, hasher(), key_equal(), alloc) {}
+
+ concurrent_unordered_base( std::initializer_list<value_type> init,
+ size_type bucket_count, const hasher& hash, const allocator_type& alloc )
+ : concurrent_unordered_base(init, bucket_count, hash, key_equal(), alloc) {}
+
+ ~concurrent_unordered_base() {
+ internal_clear();
+ }
+
+ concurrent_unordered_base& operator=( const concurrent_unordered_base& other ) {
+ if (this != &other) {
+ clear();
+ my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed);
+ my_bucket_count.store(other.my_bucket_count.load(std::memory_order_relaxed), std::memory_order_relaxed);
+ my_max_load_factor = other.my_max_load_factor;
+ my_hash_compare = other.my_hash_compare;
+ my_segments = other.my_segments;
+ internal_copy(other); // TODO: guards for exceptions?
+ }
+ return *this;
+ }
+
+ concurrent_unordered_base& operator=( concurrent_unordered_base&& other ) noexcept(unordered_segment_table::is_noexcept_assignment) {
+ if (this != &other) {
+ clear();
+ my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed);
+ my_bucket_count.store(other.my_bucket_count.load(std::memory_order_relaxed), std::memory_order_relaxed);
+ my_max_load_factor = std::move(other.my_max_load_factor);
+ my_hash_compare = std::move(other.my_hash_compare);
+ my_segments = std::move(other.my_segments);
+
+ using pocma_type = typename allocator_traits_type::propagate_on_container_move_assignment;
+ using is_always_equal = typename allocator_traits_type::is_always_equal;
+ internal_move_assign(std::move(other), tbb::detail::disjunction<pocma_type, is_always_equal>());
+ }
+ return *this;
+ }
+
+ concurrent_unordered_base& operator=( std::initializer_list<value_type> init ) {
+ clear();
+ insert(init);
+ return *this;
+ }
+
+ void swap( concurrent_unordered_base& other ) noexcept(unordered_segment_table::is_noexcept_swap) {
+ if (this != &other) {
+ using pocs_type = typename allocator_traits_type::propagate_on_container_swap;
+ using is_always_equal = typename allocator_traits_type::is_always_equal;
+ internal_swap(other, tbb::detail::disjunction<pocs_type, is_always_equal>());
+ }
+ }
+
+ allocator_type get_allocator() const noexcept { return my_segments.get_allocator(); }
+
+ iterator begin() noexcept { return iterator(first_value_node(&my_head)); }
+ const_iterator begin() const noexcept { return const_iterator(first_value_node(const_cast<node_ptr>(&my_head))); }
+ const_iterator cbegin() const noexcept { return const_iterator(first_value_node(const_cast<node_ptr>(&my_head))); }
+
+ iterator end() noexcept { return iterator(nullptr); }
+ const_iterator end() const noexcept { return const_iterator(nullptr); }
+ const_iterator cend() const noexcept { return const_iterator(nullptr); }
+
+ __TBB_nodiscard bool empty() const noexcept { return size() == 0; }
+ size_type size() const noexcept { return my_size.load(std::memory_order_relaxed); }
+ size_type max_size() const noexcept { return allocator_traits_type::max_size(get_allocator()); }
+
+ void clear() noexcept {
+ internal_clear();
+ }
+
+ std::pair<iterator, bool> insert( const value_type& value ) {
+ return internal_insert_value(value);
+ }
+
+ std::pair<iterator, bool> insert( value_type&& value ) {
+ return internal_insert_value(std::move(value));
+ }
+
+ iterator insert( const_iterator, const value_type& value ) {
+ // Ignore hint
+ return insert(value).first;
+ }
+
+ iterator insert( const_iterator, value_type&& value ) {
+ // Ignore hint
+ return insert(std::move(value)).first;
+ }
+
+ template <typename InputIterator>
+ void insert( InputIterator first, InputIterator last ) {
+ for (; first != last; ++first) {
+ insert(*first);
+ }
+ }
+
+ void insert( std::initializer_list<value_type> init ) {
+ insert(init.begin(), init.end());
+ }
+
+ std::pair<iterator, bool> insert( node_type&& nh ) {
+ if (!nh.empty()) {
+ value_node_ptr insert_node = node_handle_accessor::get_node_ptr(nh);
+ auto init_node = [&insert_node]( sokey_type order_key )->value_node_ptr {
+ insert_node->init(order_key);
+ return insert_node;
+ };
+ auto insert_result = internal_insert(insert_node->value(), init_node);
+ if (insert_result.inserted) {
+ // If the insertion succeeded - set node handle to the empty state
+ __TBB_ASSERT(insert_result.remaining_node == nullptr,
+ "internal_insert should not return the remaining node if the insertion succeeded");
+ node_handle_accessor::deactivate(nh);
+ }
+ return { iterator(insert_result.node_with_equal_key), insert_result.inserted };
+ }
+ return {end(), false};
+ }
+
+ iterator insert( const_iterator, node_type&& nh ) {
+ // Ignore hint
+ return insert(std::move(nh)).first;
+ }
+
+ template <typename... Args>
+ std::pair<iterator, bool> emplace( Args&&... args ) {
+ // Create a node with temporary order_key 0, which will be reinitialized
+ // in internal_insert after the hash calculation
+ value_node_ptr insert_node = create_node(0, std::forward<Args>(args)...);
+
+ auto init_node = [&insert_node]( sokey_type order_key )->value_node_ptr {
+ insert_node->init(order_key);
+ return insert_node;
+ };
+
+ auto insert_result = internal_insert(insert_node->value(), init_node);
+
+ if (!insert_result.inserted) {
+ // If the insertion failed - destroy the node which was created
+ insert_node->init(split_order_key_regular(1));
+ destroy_node(insert_node);
+ }
+
+ return { iterator(insert_result.node_with_equal_key), insert_result.inserted };
+ }
+
+ template <typename... Args>
+ iterator emplace_hint( const_iterator, Args&&... args ) {
+ // Ignore hint
+ return emplace(std::forward<Args>(args)...).first;
+ }
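+
+ // Illustrative usage sketch (assuming the public concurrent_unordered_map wrapper and
+ // tbb::parallel_for): insert() and emplace() may be called concurrently from many threads.
+ //
+ // oneapi::tbb::concurrent_unordered_map<int, int> table;
+ // oneapi::tbb::parallel_for(0, 1000, [&](int i) {
+ // table.emplace(i, i * i); // thread-safe insertion
+ // table.insert({i, -i}); // duplicate keys are rejected for non-multi containers
+ // });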
+
+ iterator unsafe_erase( const_iterator pos ) {
+ return iterator(first_value_node(internal_erase(pos.get_node_ptr())));
+ }
+
+ iterator unsafe_erase( iterator pos ) {
+ return iterator(first_value_node(internal_erase(pos.get_node_ptr())));
+ }
+
+ iterator unsafe_erase( const_iterator first, const_iterator last ) {
+ while(first != last) {
+ first = unsafe_erase(first);
+ }
+ return iterator(first.get_node_ptr());
+ }
+
+ size_type unsafe_erase( const key_type& key ) {
+ return internal_erase_by_key(key);
+ }
+
+ template <typename K>
+ typename std::enable_if<is_transparent<K>::value
+ && !std::is_convertible<K, const_iterator>::value
+ && !std::is_convertible<K, iterator>::value,
+ size_type>::type unsafe_erase( const K& key )
+ {
+ return internal_erase_by_key(key);
+ }
+
+ node_type unsafe_extract( const_iterator pos ) {
+ internal_extract(pos.get_node_ptr());
+ return node_handle_accessor::construct<node_type>(pos.get_node_ptr());
+ }
+
+ node_type unsafe_extract( iterator pos ) {
+ internal_extract(pos.get_node_ptr());
+ return node_handle_accessor::construct<node_type>(pos.get_node_ptr());
+ }
+
+ node_type unsafe_extract( const key_type& key ) {
+ iterator item = find(key);
+ return item == end() ? node_type() : unsafe_extract(item);
+ }
+
+ template <typename K>
+ typename std::enable_if<is_transparent<K>::value
+ && !std::is_convertible<K, const_iterator>::value
+ && !std::is_convertible<K, iterator>::value,
+ node_type>::type unsafe_extract( const K& key )
+ {
+ iterator item = find(key);
+ return item == end() ? node_type() : unsafe_extract(item);
+ }
+
+ // Lookup functions
+ iterator find( const key_type& key ) {
+ value_node_ptr result = internal_find(key);
+ return result == nullptr ? end() : iterator(result);
+ }
+
+ const_iterator find( const key_type& key ) const {
+ value_node_ptr result = const_cast<self_type*>(this)->internal_find(key);
+ return result == nullptr ? end() : const_iterator(result);
+ }
+
+ template <typename K>
+ typename std::enable_if<is_transparent<K>::value, iterator>::type find( const K& key ) {
+ value_node_ptr result = internal_find(key);
+ return result == nullptr ? end() : iterator(result);
+ }
+
+ template <typename K>
+ typename std::enable_if<is_transparent<K>::value, const_iterator>::type find( const K& key ) const {
+ value_node_ptr result = const_cast<self_type*>(this)->internal_find(key);
+ return result == nullptr ? end() : const_iterator(result);
+ }
+
+ std::pair<iterator, iterator> equal_range( const key_type& key ) {
+ auto result = internal_equal_range(key);
+ return std::make_pair(iterator(result.first), iterator(result.second));
+ }
+
+ std::pair<const_iterator, const_iterator> equal_range( const key_type& key ) const {
+ auto result = const_cast<self_type*>(this)->internal_equal_range(key);
+ return std::make_pair(const_iterator(result.first), const_iterator(result.second));
+ }
+
+ template <typename K>
+ typename std::enable_if<is_transparent<K>::value, std::pair<iterator, iterator>>::type equal_range( const K& key ) {
+ auto result = internal_equal_range(key);
+ return std::make_pair(iterator(result.first), iterator(result.second));
+ }
+
+ template <typename K>
+ typename std::enable_if<is_transparent<K>::value, std::pair<const_iterator, const_iterator>>::type equal_range( const K& key ) const {
+ auto result = const_cast<self_type*>(this)->internal_equal_range(key);
+ return std::make_pair(const_iterator(result.first), const_iterator(result.second));
+ }
+
+ size_type count( const key_type& key ) const {
+ return internal_count(key);
+ }
+
+ template <typename K>
+ typename std::enable_if<is_transparent<K>::value, size_type>::type count( const K& key ) const {
+ return internal_count(key);
+ }
+
+ bool contains( const key_type& key ) const {
+ return find(key) != end();
+ }
+
+ template <typename K>
+ typename std::enable_if<is_transparent<K>::value, bool>::type contains( const K& key ) const {
+ return find(key) != end();
+ }
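+
+ // Minimal sketch of heterogeneous ("transparent") lookup; string_hash below is a hypothetical
+ // user-defined functor, and both the hasher and the key equality must define is_transparent
+ // for these overloads to participate in overload resolution.
+ //
+ // struct string_hash {
+ // using is_transparent = void;
+ // std::size_t operator()(std::string_view s) const { return std::hash<std::string_view>{}(s); }
+ // };
+ //
+ // oneapi::tbb::concurrent_unordered_map<std::string, int, string_hash, std::equal_to<>> table;
+ // table.emplace("answer", 42);
+ // bool found = table.contains(std::string_view{"answer"}); // no temporary std::string is built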
+
+ // Bucket interface
+ local_iterator unsafe_begin( size_type n ) {
+ return local_iterator(first_value_node(get_bucket(n)));
+ }
+
+ const_local_iterator unsafe_begin( size_type n ) const {
+ auto bucket_begin = first_value_node(const_cast<self_type*>(this)->get_bucket(n));
+ return const_local_iterator(bucket_begin);
+ }
+
+ const_local_iterator unsafe_cbegin( size_type n ) const {
+ auto bucket_begin = first_value_node(const_cast<self_type*>(this)->get_bucket(n));
+ return const_local_iterator(bucket_begin);
+ }
+
+ local_iterator unsafe_end( size_type n ) {
+ size_type bucket_count = my_bucket_count.load(std::memory_order_relaxed);
+ return n != bucket_count - 1 ? unsafe_begin(get_next_bucket_index(n)) : local_iterator(nullptr);
+ }
+
+ const_local_iterator unsafe_end( size_type n ) const {
+ size_type bucket_count = my_bucket_count.load(std::memory_order_relaxed);
+ return n != bucket_count - 1 ? unsafe_begin(get_next_bucket_index(n)) : const_local_iterator(nullptr);
+ }
+
+ const_local_iterator unsafe_cend( size_type n ) const {
+ size_type bucket_count = my_bucket_count.load(std::memory_order_relaxed);
+ return n != bucket_count - 1 ? unsafe_begin(get_next_bucket_index(n)) : const_local_iterator(nullptr);
+ }
+
+ size_type unsafe_bucket_count() const { return my_bucket_count.load(std::memory_order_relaxed); }
+
+ size_type unsafe_max_bucket_count() const {
+ return max_size();
+ }
+
+ size_type unsafe_bucket_size( size_type n ) const {
+ return size_type(std::distance(unsafe_begin(n), unsafe_end(n)));
+ }
+
+ size_type unsafe_bucket( const key_type& key ) const {
+ return my_hash_compare(key) % my_bucket_count.load(std::memory_order_relaxed);
+ }
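+
+ // Illustrative sketch: the unsafe_* bucket API is intended for single-threaded inspection only,
+ // e.g. walking one bucket of a table (table, n and inspect are placeholders):
+ //
+ // for (auto it = table.unsafe_begin(n); it != table.unsafe_end(n); ++it) {
+ // inspect(*it);
+ // }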
+
+ // Hash policy
+ float load_factor() const {
+ return float(size() / float(my_bucket_count.load(std::memory_order_acquire)));
+ }
+
+ float max_load_factor() const { return my_max_load_factor; }
+
+ void max_load_factor( float mlf ) {
+ if (mlf != mlf || mlf < 0) {
+ tbb::detail::throw_exception(exception_id::invalid_load_factor);
+ }
+ my_max_load_factor = mlf;
+ } // TODO: unsafe?
+
+ void rehash( size_type bucket_count ) {
+ size_type current_bucket_count = my_bucket_count.load(std::memory_order_acquire);
+ if (current_bucket_count < bucket_count) {
+ // TODO: do we need do-while here?
+ my_bucket_count.compare_exchange_strong(current_bucket_count, round_up_to_power_of_two(bucket_count));
+ }
+ }
+
+ void reserve( size_type elements_count ) {
+ size_type current_bucket_count = my_bucket_count.load(std::memory_order_acquire);
+ size_type necessary_bucket_count = current_bucket_count;
+
+ do {
+ // TODO: Log2 seems useful here
+ while (necessary_bucket_count * max_load_factor() < elements_count) {
+ necessary_bucket_count <<= 1;
+ }
+ } while (current_bucket_count < necessary_bucket_count &&
+ !my_bucket_count.compare_exchange_strong(current_bucket_count, necessary_bucket_count));
+ }
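+
+ // Worked example (with the defaults declared below: 8 initial buckets, max load factor 4):
+ // reserve(1000) starts from necessary_bucket_count == 8 and keeps doubling while
+ // necessary_bucket_count * 4 < 1000, i.e. 8 -> 16 -> 32 -> 64 -> 128 -> 256, and then publishes
+ // 256 buckets with a single successful compare_exchange_strong; if no growth is needed,
+ // the loop exits without touching my_bucket_count.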
+
+ // Observers
+ hasher hash_function() const { return my_hash_compare.hash_function(); }
+ key_equal key_eq() const { return my_hash_compare.key_eq(); }
+
+ class const_range_type {
+ private:
+ const concurrent_unordered_base& my_instance;
+ node_ptr my_begin_node; // may be node* const
+ node_ptr my_end_node;
+ mutable node_ptr my_midpoint_node;
+ public:
+ using size_type = typename concurrent_unordered_base::size_type;
+ using value_type = typename concurrent_unordered_base::value_type;
+ using reference = typename concurrent_unordered_base::reference;
+ using difference_type = typename concurrent_unordered_base::difference_type;
+ using iterator = typename concurrent_unordered_base::const_iterator;
+
+ bool empty() const { return my_begin_node == my_end_node; }
+
+ bool is_divisible() const {
+ return my_midpoint_node != my_end_node;
+ }
+
+ size_type grainsize() const { return 1; }
+
+ const_range_type( const_range_type& range, split )
+ : my_instance(range.my_instance),
+ my_begin_node(range.my_midpoint_node),
+ my_end_node(range.my_end_node)
+ {
+ range.my_end_node = my_begin_node;
+ __TBB_ASSERT(!empty(), "Splitting despite the range not being divisible");
+ __TBB_ASSERT(!range.empty(), "Splitting despite the range not being divisible");
+ set_midpoint();
+ range.set_midpoint();
+ }
+
+ iterator begin() const { return iterator(my_instance.first_value_node(my_begin_node)); }
+ iterator end() const { return iterator(my_instance.first_value_node(my_end_node)); }
+
+ const_range_type( const concurrent_unordered_base& table )
+ : my_instance(table), my_begin_node(const_cast<node_ptr>(&table.my_head)), my_end_node(nullptr)
+ {
+ set_midpoint();
+ }
+ private:
+ void set_midpoint() const {
+ if (my_begin_node == my_end_node) {
+ my_midpoint_node = my_end_node;
+ } else {
+ sokey_type invalid_key = ~sokey_type(0);
+ sokey_type begin_key = my_begin_node != nullptr ? my_begin_node->order_key() : invalid_key;
+ sokey_type end_key = my_end_node != nullptr ? my_end_node->order_key() : invalid_key;
+
+ size_type mid_bucket = reverse_bits(begin_key + (end_key - begin_key) / 2) %
+ my_instance.my_bucket_count.load(std::memory_order_relaxed);
+ while( my_instance.my_segments[mid_bucket].load(std::memory_order_relaxed) == nullptr) {
+ mid_bucket = my_instance.get_parent(mid_bucket);
+ }
+ if (reverse_bits(mid_bucket) > begin_key) {
+ // Found a dummy node between begin and end
+ my_midpoint_node = my_instance.first_value_node(
+ my_instance.my_segments[mid_bucket].load(std::memory_order_relaxed));
+ } else {
+ // Didn't find a dummy node between begin and end
+ my_midpoint_node = my_end_node;
+ }
+ }
+ }
+ }; // class const_range_type
+
+ class range_type : public const_range_type {
+ public:
+ using iterator = typename concurrent_unordered_base::iterator;
+ using const_range_type::const_range_type;
+
+ iterator begin() const { return iterator(const_range_type::begin().get_node_ptr()); }
+ iterator end() const { return iterator(const_range_type::end().get_node_ptr()); }
+ }; // class range_type
+
+ // Parallel iteration
+ range_type range() {
+ return range_type(*this);
+ }
+
+ const_range_type range() const {
+ return const_range_type(*this);
+ }
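+
+ // A sketch of parallel traversal: range() plugs into the TBB parallel algorithms
+ // (table and consume are placeholders for a container built on this base and a user function).
+ //
+ // oneapi::tbb::parallel_for(table.range(), [](const auto& r) {
+ // for (auto it = r.begin(); it != r.end(); ++it) {
+ // consume(*it);
+ // }
+ // });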
+protected:
+ static constexpr bool allow_multimapping = traits_type::allow_multimapping;
+
+private:
+ static constexpr size_type initial_bucket_count = 8;
+ static constexpr float initial_max_load_factor = 4; // TODO: consider 1?
+ static constexpr size_type pointers_per_embedded_table = sizeof(size_type) * 8 - 1;
+
+ class unordered_segment_table
+ : public segment_table<std::atomic<node_ptr>, allocator_type, unordered_segment_table, pointers_per_embedded_table>
+ {
+ using self_type = unordered_segment_table;
+ using atomic_node_ptr = std::atomic<node_ptr>;
+ using base_type = segment_table<std::atomic<node_ptr>, allocator_type, unordered_segment_table, pointers_per_embedded_table>;
+ using segment_type = typename base_type::segment_type;
+ using base_allocator_type = typename base_type::allocator_type;
+
+ using segment_allocator_type = typename allocator_traits_type::template rebind_alloc<atomic_node_ptr>;
+ using segment_allocator_traits = tbb::detail::allocator_traits<segment_allocator_type>;
+ public:
+ // Segment table for unordered containers should not be extended in the wait-free implementation
+ static constexpr bool allow_table_extending = false;
+ static constexpr bool is_noexcept_assignment = std::is_nothrow_move_assignable<hasher>::value &&
+ std::is_nothrow_move_assignable<key_equal>::value &&
+ segment_allocator_traits::is_always_equal::value;
+ static constexpr bool is_noexcept_swap = tbb::detail::is_nothrow_swappable<hasher>::value &&
+ tbb::detail::is_nothrow_swappable<key_equal>::value &&
+ segment_allocator_traits::is_always_equal::value;
+
+ // TODO: using base_type::base_type is not compiling on Windows and Intel Compiler - investigate
+ unordered_segment_table( const base_allocator_type& alloc = base_allocator_type() )
+ : base_type(alloc) {}
+
+ unordered_segment_table( const unordered_segment_table& ) = default;
+
+ unordered_segment_table( const unordered_segment_table& other, const base_allocator_type& alloc )
+ : base_type(other, alloc) {}
+
+ unordered_segment_table( unordered_segment_table&& ) = default;
+
+ unordered_segment_table( unordered_segment_table&& other, const base_allocator_type& alloc )
+ : base_type(std::move(other), alloc) {}
+
+ unordered_segment_table& operator=( const unordered_segment_table& ) = default;
+
+ unordered_segment_table& operator=( unordered_segment_table&& ) = default;
+
+ segment_type create_segment( typename base_type::segment_table_type, typename base_type::segment_index_type segment_index, size_type ) {
+ segment_allocator_type alloc(this->get_allocator());
+ size_type seg_size = this->segment_size(segment_index);
+ segment_type new_segment = segment_allocator_traits::allocate(alloc, seg_size);
+ for (size_type i = 0; i != seg_size; ++i) {
+ segment_allocator_traits::construct(alloc, new_segment + i, nullptr);
+ }
+ return new_segment;
+ }
+
+ // deallocate_segment is required by the segment_table base class, but for
+ // unordered containers the destructors must also be run during deallocation
+ void deallocate_segment( segment_type address, size_type index ) {
+ destroy_segment(address, index);
+ }
+
+ void destroy_segment( segment_type address, size_type index ) {
+ segment_allocator_type alloc(this->get_allocator());
+ for (size_type i = 0; i != this->segment_size(index); ++i) {
+ segment_allocator_traits::destroy(alloc, address + i);
+ }
+ segment_allocator_traits::deallocate(alloc, address, this->segment_size(index));
+ }
+
+
+ void copy_segment( size_type index, segment_type, segment_type to ) {
+ if (index == 0) {
+ // The first element in the first segment is embedded into the table (my_head)
+ // so the first pointer should not be stored here
+ // It will be stored during the move constructor/assignment operation
+ to[1].store(nullptr, std::memory_order_relaxed);
+ } else {
+ for (size_type i = 0; i != this->segment_size(index); ++i) {
+ to[i].store(nullptr, std::memory_order_relaxed);
+ }
+ }
+ }
+
+ void move_segment( size_type index, segment_type from, segment_type to ) {
+ if (index == 0) {
+ // The first element in the first segment is embedded into the table (my_head)
+ // so the first pointer should not be stored here
+ // It will be stored during the move constructor/assignment operation
+ to[1].store(from[1].load(std::memory_order_relaxed), std::memory_order_relaxed);
+ } else {
+ for (size_type i = 0; i != this->segment_size(index); ++i) {
+ to[i].store(from[i].load(std::memory_order_relaxed), std::memory_order_relaxed);
+ from[i].store(nullptr, std::memory_order_relaxed);
+ }
+ }
+ }
+
+ // allocate_long_table is required by the segment_table base class, but unused for unordered containers
+ typename base_type::segment_table_type allocate_long_table( const typename base_type::atomic_segment*, size_type ) {
+ __TBB_ASSERT(false, "This method should never be called");
+ // TableType is a pointer
+ return nullptr;
+ }
+
+ // destroy_elements is required by the segment_table base class, but unused for unordered containers
+ // It is called, but does nothing
+ void destroy_elements() {}
+ }; // class unordered_segment_table
+
+ void internal_clear() {
+ // TODO: consider usefulness of two versions of clear() - with dummy nodes deallocation and without it
+ node_ptr next = my_head.next();
+ node_ptr curr = next;
+
+ my_head.set_next(nullptr);
+
+ while (curr != nullptr) {
+ next = curr->next();
+ destroy_node(curr);
+ curr = next;
+ }
+
+ my_size.store(0, std::memory_order_relaxed);
+ my_segments.clear();
+ }
+
+ void destroy_node( node_ptr node ) {
+ if (node->is_dummy()) {
+ node_allocator_type dummy_node_allocator(my_segments.get_allocator());
+ // Destroy the node
+ node_allocator_traits::destroy(dummy_node_allocator, node);
+ // Deallocate the memory
+ node_allocator_traits::deallocate(dummy_node_allocator, node, 1);
+ } else {
+ value_node_ptr val_node = static_cast<value_node_ptr>(node);
+ value_node_allocator_type value_node_allocator(my_segments.get_allocator());
+ // Destroy the value
+ value_node_allocator_traits::destroy(value_node_allocator, val_node->storage());
+ // Destroy the node
+ value_node_allocator_traits::destroy(value_node_allocator, val_node);
+ // Deallocate the memory
+ value_node_allocator_traits::deallocate(value_node_allocator, val_node, 1);
+ }
+ }
+
+ struct internal_insert_return_type {
+ // If the insertion failed, remaining_node points to the node that failed to be inserted;
+ // this node may have been allocated during the insertion process
+ value_node_ptr remaining_node;
+ // If the insertion failed, node_with_equal_key points to the node in the list whose key is
+ // equivalent to the inserted one; otherwise it points to the node that was inserted.
+ value_node_ptr node_with_equal_key;
+ // Insertion status
+ // NOTE: if it is true - remaining_node should be nullptr
+ bool inserted;
+ }; // struct internal_insert_return_type
+
+ // Inserts the value into the split ordered list
+ template <typename ValueType>
+ std::pair<iterator, bool> internal_insert_value( ValueType&& value ) {
+
+ auto create_value_node = [&value, this]( sokey_type order_key )->value_node_ptr {
+ return create_node(order_key, std::forward<ValueType>(value));
+ };
+
+ auto insert_result = internal_insert(value, create_value_node);
+
+ if (insert_result.remaining_node != nullptr) {
+ // If the insertion failed, destroy the node that failed to be inserted, if it exists
+ __TBB_ASSERT(!insert_result.inserted,
+ "remaining_node should be nullptr if the node was successfully inserted");
+ destroy_node(insert_result.remaining_node);
+ }
+
+ return { iterator(insert_result.node_with_equal_key), insert_result.inserted };
+ }
+
+ // Inserts the node into the split ordered list
+ // Creates a node using the specified callback after the place for insertion was found
+ // Returns an internal_insert_return_type object, where:
+ // - If the insertion succeeded:
+ // - remaining_node is nullptr
+ // - node_with_equal_key points to the inserted node
+ // - inserted is true
+ // - If the insertion failed:
+ // - remaining_node points to the node that failed to be inserted, if it was created;
+ // nullptr if the node was not created because the requested key was already
+ // present in the list
+ // - node_with_equal_key points to the element in the list with the key equivalent
+ // to the requested key
+ // - inserted is false
+ template <typename ValueType, typename CreateInsertNode>
+ internal_insert_return_type internal_insert( ValueType&& value, CreateInsertNode create_insert_node ) {
+ static_assert(std::is_same<typename std::decay<ValueType>::type, value_type>::value,
+ "Incorrect type in internal_insert");
+ const key_type& key = traits_type::get_key(value);
+ sokey_type hash_key = sokey_type(my_hash_compare(key));
+
+ sokey_type order_key = split_order_key_regular(hash_key);
+ node_ptr prev = prepare_bucket(hash_key);
+ __TBB_ASSERT(prev != nullptr, "Invalid head node");
+
+ auto search_result = search_after(prev, order_key, key);
+
+ if (search_result.second) {
+ return internal_insert_return_type{ nullptr, search_result.first, false };
+ }
+
+ value_node_ptr new_node = create_insert_node(order_key);
+ node_ptr curr = search_result.first;
+
+ while (!try_insert(prev, new_node, curr)) {
+ search_result = search_after(prev, order_key, key);
+ if (search_result.second) {
+ return internal_insert_return_type{ new_node, search_result.first, false };
+ }
+ curr = search_result.first;
+ }
+
+ auto sz = my_size.fetch_add(1);
+ adjust_table_size(sz + 1, my_bucket_count.load(std::memory_order_acquire));
+ return internal_insert_return_type{ nullptr, static_cast<value_node_ptr>(new_node), true };
+ }
+
+ // Searches for a node with a key equivalent to key and the requested order key after the node prev
+ // Returns the existing node and true if the node is already in the list
+ // Otherwise returns the first node with an order key greater than the requested one, and false
+ std::pair<value_node_ptr, bool> search_after( node_ptr& prev, sokey_type order_key, const key_type& key ) {
+ // NOTE: static_cast<value_node_ptr>(curr) should be done only after ensuring
+ // that the node is not a dummy node
+
+ node_ptr curr = prev->next();
+
+ while (curr != nullptr && (curr->order_key() < order_key ||
+ (curr->order_key() == order_key && !my_hash_compare(traits_type::get_key(static_cast<value_node_ptr>(curr)->value()), key))))
+ {
+ prev = curr;
+ curr = curr->next();
+ }
+
+ if (curr != nullptr && curr->order_key() == order_key && !allow_multimapping) {
+ return { static_cast<value_node_ptr>(curr), true };
+ }
+ return { static_cast<value_node_ptr>(curr), false };
+ }
+
+ void adjust_table_size( size_type total_elements, size_type current_size ) {
+ // Grow the table by a factor of 2 if possible and needed
+ if ( (float(total_elements) / float(current_size)) > my_max_load_factor ) {
+ // Double the number of buckets only if the bucket count has not changed between the loads
+ my_bucket_count.compare_exchange_strong(current_size, 2u * current_size);
+ }
+ }
+
+ node_ptr insert_dummy_node( node_ptr parent_dummy_node, sokey_type order_key ) {
+ node_ptr prev_node = parent_dummy_node;
+
+ node_ptr dummy_node = create_dummy_node(order_key);
+ node_ptr next_node;
+
+ do {
+ next_node = prev_node->next();
+ // Move forward through the list while the order key is less than requested
+ while (next_node != nullptr && next_node->order_key() < order_key) {
+ prev_node = next_node;
+ next_node = next_node->next();
+ }
+
+ if (next_node != nullptr && next_node->order_key() == order_key) {
+ // Another dummy node with the same order key was inserted by another thread
+ // Destroy the node and exit
+ destroy_node(dummy_node);
+ return next_node;
+ }
+ } while (!try_insert(prev_node, dummy_node, next_node));
+
+ return dummy_node;
+ }
+
+ // Tries to insert a node between prev_node and the expected next node
+ // Returns false if the actual next node is not equal to the expected one
+ static bool try_insert( node_ptr prev_node, node_ptr new_node, node_ptr current_next_node ) {
+ new_node->set_next(current_next_node);
+ return prev_node->try_set_next(current_next_node, new_node);
+ }
+
+ // Returns the bucket, associated with the hash_key
+ node_ptr prepare_bucket( sokey_type hash_key ) {
+ size_type bucket = hash_key % my_bucket_count.load(std::memory_order_acquire);
+ return get_bucket(bucket);
+ }
+
+ // Initialize the corresponding bucket if it is not initialized
+ node_ptr get_bucket( size_type bucket_index ) {
+ if (my_segments[bucket_index].load(std::memory_order_acquire) == nullptr) {
+ init_bucket(bucket_index);
+ }
+ return my_segments[bucket_index].load(std::memory_order_acquire);
+ }
+
+ void init_bucket( size_type bucket ) {
+ if (bucket == 0) {
+ // Atomically store my_head as the first bucket
+ node_ptr disabled = nullptr;
+ my_segments[0].compare_exchange_strong(disabled, &my_head);
+ return;
+ }
+
+ size_type parent_bucket = get_parent(bucket);
+
+ while (my_segments[parent_bucket].load(std::memory_order_acquire) == nullptr) {
+ // Initialize all of the parent buckets
+ init_bucket(parent_bucket);
+ }
+
+ __TBB_ASSERT(my_segments[parent_bucket].load(std::memory_order_acquire) != nullptr, "Parent bucket should be initialized");
+ node_ptr parent = my_segments[parent_bucket].load(std::memory_order_acquire);
+
+ // Insert dummy node into the list
+ node_ptr dummy_node = insert_dummy_node(parent, split_order_key_dummy(bucket));
+ // TODO: consider returning pair<node_ptr, bool> to avoid the store operation if the bucket was stored by another thread,
+ // or move the store to insert_dummy_node
+ // Add dummy_node into the segment table
+ my_segments[bucket].store(dummy_node, std::memory_order_release);
+ }
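+
+ // Worked example: initializing bucket 6 (0b110) first requires its parent bucket 2 (0b010),
+ // which in turn requires bucket 0; each level inserts a dummy node whose order key is the
+ // bit-reversed bucket index, so all dummy nodes land at their split-order positions in the
+ // single linked list.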
+
+ node_ptr create_dummy_node( sokey_type order_key ) {
+ node_allocator_type dummy_node_allocator(my_segments.get_allocator());
+ node_ptr dummy_node = node_allocator_traits::allocate(dummy_node_allocator, 1);
+ node_allocator_traits::construct(dummy_node_allocator, dummy_node, order_key);
+ return dummy_node;
+ }
+
+ template <typename... Args>
+ value_node_ptr create_node( sokey_type order_key, Args&&... args ) {
+ value_node_allocator_type value_node_allocator(my_segments.get_allocator());
+ // Allocate memory for the value_node
+ value_node_ptr new_node = value_node_allocator_traits::allocate(value_node_allocator, 1);
+ // Construct the node
+ value_node_allocator_traits::construct(value_node_allocator, new_node, order_key);
+
+ // try_call API is not convenient here due to broken
+ // variadic capture on GCC 4.8.5
+ auto value_guard = make_raii_guard([&] {
+ value_node_allocator_traits::destroy(value_node_allocator, new_node);
+ value_node_allocator_traits::deallocate(value_node_allocator, new_node, 1);
+ });
+
+ // Construct the value in the node
+ value_node_allocator_traits::construct(value_node_allocator, new_node->storage(), std::forward<Args>(args)...);
+ value_guard.dismiss();
+ return new_node;
+ }
+
+ value_node_ptr first_value_node( node_ptr first_node ) const {
+ while (first_node != nullptr && first_node->is_dummy()) {
+ first_node = first_node->next();
+ }
+ return static_cast<value_node_ptr>(first_node);
+ }
+
+ // Unsafe method, which removes the node from the list and returns the next node
+ node_ptr internal_erase( value_node_ptr node_to_erase ) {
+ __TBB_ASSERT(node_to_erase != nullptr, "Invalid iterator for erase");
+ node_ptr next_node = node_to_erase->next();
+ internal_extract(node_to_erase);
+ destroy_node(node_to_erase);
+ return next_node;
+ }
+
+ template <typename K>
+ size_type internal_erase_by_key( const K& key ) {
+ // TODO: consider reimplementation without equal_range - it is not efficient to perform a lookup over a bucket
+ // for each unsafe_erase call
+ auto eq_range = equal_range(key);
+ size_type erased_count = 0;
+
+ for (auto it = eq_range.first; it != eq_range.second;) {
+ it = unsafe_erase(it);
+ ++erased_count;
+ }
+ return erased_count;
+ }
+
+ // Unsafe method, which extracts the node from the list
+ void internal_extract( value_node_ptr node_to_extract ) {
+ const key_type& key = traits_type::get_key(node_to_extract->value());
+ sokey_type hash_key = sokey_type(my_hash_compare(key));
+
+ node_ptr prev_node = prepare_bucket(hash_key);
+
+ for (node_ptr node = prev_node->next(); node != nullptr; prev_node = node, node = node->next()) {
+ if (node == node_to_extract) {
+ unlink_node(prev_node, node, node_to_extract->next());
+ my_size.store(my_size.load(std::memory_order_relaxed) - 1, std::memory_order_relaxed);
+ return;
+ }
+ __TBB_ASSERT(node->order_key() <= node_to_extract->order_key(),
+ "The node being extracted should be present in the list");
+ }
+ }
+
+protected:
+ template <typename SourceType>
+ void internal_merge( SourceType&& source ) {
+ static_assert(std::is_same<node_type, typename std::decay<SourceType>::type::node_type>::value,
+ "Incompatible containers cannot be merged");
+
+ for (node_ptr source_prev = &source.my_head; source_prev->next() != nullptr;) {
+ if (!source_prev->next()->is_dummy()) {
+ value_node_ptr curr = static_cast<value_node_ptr>(source_prev->next());
+ // If multimapping is allowed, or the key is not present
+ // in the *this container - extract the node from the list
+ if (allow_multimapping || !contains(traits_type::get_key(curr->value()))) {
+ node_ptr next_node = curr->next();
+ source.unlink_node(source_prev, curr, next_node);
+
+ // Remember the old order key
+ sokey_type old_order_key = curr->order_key();
+
+ // Node handle with curr cannot be used directly in insert call, because
+ // the destructor of node_type will destroy curr
+ node_type curr_node = node_handle_accessor::construct<node_type>(curr);
+
+ // If the insertion fails - return ownership of the node to the source
+ if (!insert(std::move(curr_node)).second) {
+ __TBB_ASSERT(!allow_multimapping, "Insertion should succeed for multicontainer");
+ __TBB_ASSERT(source_prev->next() == next_node,
+ "Concurrent operations with the source container in merge are prohibited");
+
+ // Initialize the node with the old order key, because the order key
+ // can change during the insertion
+ curr->init(old_order_key);
+ __TBB_ASSERT(old_order_key >= source_prev->order_key() &&
+ (next_node == nullptr || old_order_key <= next_node->order_key()),
+ "Wrong nodes order in the source container");
+ // Merge is unsafe for source container, so the insertion back can be done without compare_exchange
+ curr->set_next(next_node);
+ source_prev->set_next(curr);
+ source_prev = curr;
+ node_handle_accessor::deactivate(curr_node);
+ } else {
+ source.my_size.fetch_sub(1, std::memory_order_relaxed);
+ }
+ } else {
+ source_prev = curr;
+ }
+ } else {
+ source_prev = source_prev->next();
+ }
+ }
+ }
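+
+ // Illustrative sketch: the public containers expose this operation as merge(); nodes whose
+ // keys do not collide are relinked into *this, colliding ones stay in the source, e.g.:
+ //
+ // oneapi::tbb::concurrent_unordered_map<int, int> dst{{1, 10}};
+ // oneapi::tbb::concurrent_unordered_map<int, int> src{{1, 100}, {2, 200}};
+ // dst.merge(src); // {2, 200} moves to dst; {1, 100} remains in src because key 1 collides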
+
+private:
+ // Unsafe method, which unlinks the node between prev and next
+ void unlink_node( node_ptr prev_node, node_ptr node_to_unlink, node_ptr next_node ) {
+ __TBB_ASSERT(prev_node->next() == node_to_unlink &&
+ node_to_unlink->next() == next_node,
+ "erasing and extracting nodes from the containers are unsafe in concurrent mode");
+ prev_node->set_next(next_node);
+ node_to_unlink->set_next(nullptr);
+ }
+
+ template <typename K>
+ value_node_ptr internal_find( const K& key ) {
+ sokey_type hash_key = sokey_type(my_hash_compare(key));
+ sokey_type order_key = split_order_key_regular(hash_key);
+
+ node_ptr curr = prepare_bucket(hash_key);
+
+ while (curr != nullptr) {
+ if (curr->order_key() > order_key) {
+ // If the order key is greater than the requested order key,
+ // the element is not in the hash table
+ return nullptr;
+ } else if (curr->order_key() == order_key &&
+ my_hash_compare(traits_type::get_key(static_cast<value_node_ptr>(curr)->value()), key)) {
+ // The fact that order keys match does not mean that the element is found.
+ // Key function comparison has to be performed to check whether this is the
+ // right element. If not, keep searching while order key is the same.
+ return static_cast<value_node_ptr>(curr);
+ }
+ curr = curr->next();
+ }
+
+ return nullptr;
+ }
+
+ template <typename K>
+ std::pair<value_node_ptr, value_node_ptr> internal_equal_range( const K& key ) {
+ sokey_type hash_key = sokey_type(my_hash_compare(key));
+ sokey_type order_key = split_order_key_regular(hash_key);
+
+ node_ptr curr = prepare_bucket(hash_key);
+
+ while (curr != nullptr) {
+ if (curr->order_key() > order_key) {
+ // If the order key is greater than the requested order key,
+ // the element is not in the hash table
+ return std::make_pair(nullptr, nullptr);
+ } else if (curr->order_key() == order_key &&
+ my_hash_compare(traits_type::get_key(static_cast<value_node_ptr>(curr)->value()), key)) {
+ value_node_ptr first = static_cast<value_node_ptr>(curr);
+ node_ptr last = first;
+ do {
+ last = last->next();
+ } while (allow_multimapping && last != nullptr && !last->is_dummy() &&
+ my_hash_compare(traits_type::get_key(static_cast<value_node_ptr>(last)->value()), key));
+ return std::make_pair(first, first_value_node(last));
+ }
+ curr = curr->next();
+ }
+ return {nullptr, nullptr};
+ }
+
+ template <typename K>
+ size_type internal_count( const K& key ) const {
+ if (allow_multimapping) {
+ // TODO: consider reimplementing the internal_equal_range with elements counting to avoid std::distance
+ auto eq_range = equal_range(key);
+ return std::distance(eq_range.first, eq_range.second);
+ } else {
+ return contains(key) ? 1 : 0;
+ }
+ }
+
+ void internal_copy( const concurrent_unordered_base& other ) {
+ node_ptr last_node = &my_head;
+ my_segments[0].store(&my_head, std::memory_order_relaxed);
+
+ for (node_ptr node = other.my_head.next(); node != nullptr; node = node->next()) {
+ node_ptr new_node;
+ if (!node->is_dummy()) {
+ // The node in the right table contains a value
+ new_node = create_node(node->order_key(), static_cast<value_node_ptr>(node)->value());
+ } else {
+ // The node in the right table is a dummy node
+ new_node = create_dummy_node(node->order_key());
+ my_segments[reverse_bits(node->order_key())].store(new_node, std::memory_order_relaxed);
+ }
+
+ last_node->set_next(new_node);
+ last_node = new_node;
+ }
+ }
+
+ void internal_move( concurrent_unordered_base&& other ) {
+ node_ptr last_node = &my_head;
+ my_segments[0].store(&my_head, std::memory_order_relaxed);
+
+ for (node_ptr node = other.my_head.next(); node != nullptr; node = node->next()) {
+ node_ptr new_node;
+ if (!node->is_dummy()) {
+ // The node in the right table contains a value
+ new_node = create_node(node->order_key(), std::move(static_cast<value_node_ptr>(node)->value()));
+ } else {
+ // TODO: do we need to destroy a dummy node in the right container?
+ // The node in the right table is a dummy_node
+ new_node = create_dummy_node(node->order_key());
+ my_segments[reverse_bits(node->order_key())].store(new_node, std::memory_order_relaxed);
+ }
+
+ last_node->set_next(new_node);
+ last_node = new_node;
+ }
+ }
+
+ void move_content( concurrent_unordered_base&& other ) {
+ // NOTE: allocators should be equal
+ my_head.set_next(other.my_head.next());
+ other.my_head.set_next(nullptr);
+ my_segments[0].store(&my_head, std::memory_order_relaxed);
+
+ other.my_bucket_count.store(initial_bucket_count, std::memory_order_relaxed);
+ other.my_max_load_factor = initial_max_load_factor;
+ other.my_size.store(0, std::memory_order_relaxed);
+ }
+
+ void internal_move_construct_with_allocator( concurrent_unordered_base&& other, const allocator_type&,
+ /*is_always_equal = */std::true_type ) {
+ // Allocators are always equal - no need to compare for equality
+ move_content(std::move(other));
+ }
+
+ void internal_move_construct_with_allocator( concurrent_unordered_base&& other, const allocator_type& alloc,
+ /*is_always_equal = */std::false_type ) {
+ // Allocators are not always equal
+ if (alloc == other.my_segments.get_allocator()) {
+ move_content(std::move(other));
+ } else {
+ try_call( [&] {
+ internal_move(std::move(other));
+ } ).on_exception( [&] {
+ clear();
+ });
+ }
+ }
+
+ // Move assigns the hash table from other if any two instances of allocator_type are always equal
+ // or propagate_on_container_move_assignment is true
+ void internal_move_assign( concurrent_unordered_base&& other, /*is_always_equal || POCMA = */std::true_type ) {
+ move_content(std::move(other));
+ }
+
+ // Move assigns the hash table from other if instances of allocator_type are not always equal
+ // and propagate_on_container_move_assignment is false
+ void internal_move_assign( concurrent_unordered_base&& other, /*is_always_equal || POCMA = */std::false_type ) {
+ if (my_segments.get_allocator() == other.my_segments.get_allocator()) {
+ move_content(std::move(other));
+ } else {
+ // TODO: guards for exceptions
+ internal_move(std::move(other));
+ }
+ }
+
+ void internal_swap( concurrent_unordered_base& other, /*is_always_equal || POCS = */std::true_type ) {
+ internal_swap_fields(other);
+ }
+
+ void internal_swap( concurrent_unordered_base& other, /*is_always_equal || POCS = */std::false_type ) {
+ __TBB_ASSERT(my_segments.get_allocator() == other.my_segments.get_allocator(),
+ "Swapping with unequal allocators is not allowed");
+ internal_swap_fields(other);
+ }
+
+ void internal_swap_fields( concurrent_unordered_base& other ) {
+ node_ptr first_node = my_head.next();
+ my_head.set_next(other.my_head.next());
+ other.my_head.set_next(first_node);
+
+ size_type current_size = my_size.load(std::memory_order_relaxed);
+ my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed);
+ other.my_size.store(current_size, std::memory_order_relaxed);
+
+ size_type bucket_count = my_bucket_count.load(std::memory_order_relaxed);
+ my_bucket_count.store(other.my_bucket_count.load(std::memory_order_relaxed), std::memory_order_relaxed);
+ other.my_bucket_count.store(bucket_count, std::memory_order_relaxed);
+
+ using std::swap;
+ swap(my_max_load_factor, other.my_max_load_factor);
+ swap(my_hash_compare, other.my_hash_compare);
+ my_segments.swap(other.my_segments);
+
+ // swap() method from segment table swaps all of the segments including the first segment
+ // We should restore it to my_head. Without it the first segment of the container will point
+ // to other.my_head.
+ my_segments[0].store(&my_head, std::memory_order_relaxed);
+ other.my_segments[0].store(&other.my_head, std::memory_order_relaxed);
+ }
+
+ // A regular order key has its original hash value reversed and the last bit set
+ static constexpr sokey_type split_order_key_regular( sokey_type hash ) {
+ return reverse_bits(hash) | 0x1;
+ }
+
+ // A dummy order key has its original hash value reversed and the last bit unset
+ static constexpr sokey_type split_order_key_dummy( sokey_type hash ) {
+ return reverse_bits(hash) & ~sokey_type(0x1);
+ }
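+
+ // Worked example with an 8-bit key for brevity (sokey_type is wider in practice): a hash of
+ // 0b00000110 reverses to 0b01100000; the regular order key sets the lowest bit (0b01100001),
+ // while the dummy key for bucket 6 keeps it cleared (0b01100000), so a bucket's dummy node
+ // always orders before the regular nodes that map to it.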
+
+ size_type get_parent( size_type bucket ) const {
+ // Unset bucket's most significant turned-on bit
+ __TBB_ASSERT(bucket != 0, "Unable to get_parent of the bucket 0");
+ size_type msb = tbb::detail::log2(bucket);
+ return bucket & ~(size_type(1) << msb);
+ }
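+
+ // Worked example: get_parent(6) clears the most significant set bit of 0b110 and yields
+ // bucket 2 (0b010); get_parent(5) turns 0b101 into bucket 1 (0b001).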
+
+ size_type get_next_bucket_index( size_type bucket ) const {
+ size_type bits = tbb::detail::log2(my_bucket_count.load(std::memory_order_relaxed));
+ size_type reversed_next = reverse_n_bits(bucket, bits) + 1;
+ return reverse_n_bits(reversed_next, bits);
+ }
+
+ std::atomic<size_type> my_size;
+ std::atomic<size_type> my_bucket_count;
+ float my_max_load_factor;
+ hash_compare_type my_hash_compare;
+
+ list_node_type my_head; // Head node for split ordered list
+ unordered_segment_table my_segments; // Segment table of pointers to nodes
+
+ template <typename Container, typename Value>
+ friend class solist_iterator;
+
+ template <typename OtherTraits>
+ friend class concurrent_unordered_base;
+}; // class concurrent_unordered_base
+
+template <typename Traits>
+bool operator==( const concurrent_unordered_base<Traits>& lhs,
+ const concurrent_unordered_base<Traits>& rhs ) {
+ if (&lhs == &rhs) { return true; }
+ if (lhs.size() != rhs.size()) { return false; }
+
+#if _MSC_VER
+ // Passing "unchecked" iterators to the 3-parameter overload of std::is_permutation
+ // causes compiler warnings.
+ // The workaround is to use the 4-parameter overload, which is
+ // available since C++14 - the minimal supported version on MSVC
+ return std::is_permutation(lhs.begin(), lhs.end(), rhs.begin(), rhs.end());
+#else
+ return std::is_permutation(lhs.begin(), lhs.end(), rhs.begin());
+#endif
+}
+
+#if !__TBB_CPP20_COMPARISONS_PRESENT
+template <typename Traits>
+bool operator!=( const concurrent_unordered_base<Traits>& lhs,
+ const concurrent_unordered_base<Traits>& rhs ) {
+ return !(lhs == rhs);
+}
+#endif
+
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+#pragma warning(pop) // warning 4127 is back
+#endif
+
+} // namespace d1
+} // namespace detail
+} // namespace tbb
+
+#endif // __TBB_detail__concurrent_unordered_base_H
diff --git a/contrib/libs/tbb/include/oneapi/tbb/detail/_config.h b/contrib/libs/tbb/include/oneapi/tbb/detail/_config.h
new file mode 100644
index 0000000000..251ebb8d82
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/detail/_config.h
@@ -0,0 +1,483 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_detail__config_H
+#define __TBB_detail__config_H
+
+/** This header is supposed to contain macro definitions only.
+ The macros defined here are intended to control such aspects of TBB build as
+ - presence of compiler features
+ - compilation modes
+ - feature sets
+ - known compiler/platform issues
+**/
+
+/* Check which standard library we use. */
+#include <cstddef>
+
+#if _MSC_VER
+ #define __TBB_EXPORTED_FUNC __cdecl
+ #define __TBB_EXPORTED_METHOD __thiscall
+#else
+ #define __TBB_EXPORTED_FUNC
+ #define __TBB_EXPORTED_METHOD
+#endif
+
+#if defined(_MSVC_LANG)
+ #define __TBB_LANG _MSVC_LANG
+#else
+ #define __TBB_LANG __cplusplus
+#endif // _MSVC_LANG
+
+#define __TBB_CPP14_PRESENT (__TBB_LANG >= 201402L)
+#define __TBB_CPP17_PRESENT (__TBB_LANG >= 201703L)
+#define __TBB_CPP20_PRESENT (__TBB_LANG >= 201709L)
+
+#if __INTEL_COMPILER || _MSC_VER
+ #define __TBB_NOINLINE(decl) __declspec(noinline) decl
+#elif __GNUC__
+ #define __TBB_NOINLINE(decl) decl __attribute__ ((noinline))
+#else
+ #define __TBB_NOINLINE(decl) decl
+#endif
+
+#define __TBB_STRING_AUX(x) #x
+#define __TBB_STRING(x) __TBB_STRING_AUX(x)
+
+// Note that when ICC or Clang is in use, __TBB_GCC_VERSION might not fully match
+// the actual GCC version on the system.
+#define __TBB_GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+
+/* Check which standard library we use. */
+
+// Prior to GCC 7, GNU libstdc++ did not have a convenient version macro.
+// Therefore we use different ways to detect its version.
+#ifdef TBB_USE_GLIBCXX_VERSION
+ // The version is explicitly specified in our public TBB_USE_GLIBCXX_VERSION macro.
+ // Its format should match the __TBB_GCC_VERSION above, e.g. 70301 for libstdc++ coming with GCC 7.3.1.
+ #define __TBB_GLIBCXX_VERSION TBB_USE_GLIBCXX_VERSION
+#elif _GLIBCXX_RELEASE && _GLIBCXX_RELEASE != __GNUC__
+ // Reported versions of GCC and libstdc++ do not match; trust the latter
+ #define __TBB_GLIBCXX_VERSION (_GLIBCXX_RELEASE*10000)
+#elif __GLIBCPP__ || __GLIBCXX__
+ // The version macro is not defined or matches the GCC version; use __TBB_GCC_VERSION
+ #define __TBB_GLIBCXX_VERSION __TBB_GCC_VERSION
+#endif
+
+#if __clang__
+ // according to clang documentation, version can be vendor specific
+ #define __TBB_CLANG_VERSION (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__)
+#endif
+
+/** Macro helpers **/
+
+#define __TBB_CONCAT_AUX(A,B) A##B
+// The additional level of indirection is needed to expand macros A and B (not to get the AB macro).
+// See [cpp.subst] and [cpp.concat] for more details.
+#define __TBB_CONCAT(A,B) __TBB_CONCAT_AUX(A,B)
+// The IGNORED argument and comma are needed to always have 2 arguments (even when A is empty).
+#define __TBB_IS_MACRO_EMPTY(A,IGNORED) __TBB_CONCAT_AUX(__TBB_MACRO_EMPTY,A)
+#define __TBB_MACRO_EMPTY 1
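+
+// Expansion sketch: with "#define _DEBUG" (empty), __TBB_IS_MACRO_EMPTY(_DEBUG,IGNORED) expands to
+// __TBB_CONCAT_AUX(__TBB_MACRO_EMPTY,) -> __TBB_MACRO_EMPTY -> 1, so it compares equal to
+// __TBB_MACRO_EMPTY; with "#define _DEBUG 1" it becomes the undefined identifier __TBB_MACRO_EMPTY1,
+// which evaluates to 0 inside #if, so the emptiness check fails.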
+
+#if _M_X64
+ #define __TBB_W(name) name##64
+#else
+ #define __TBB_W(name) name
+#endif
+
+/** User controlled TBB features & modes **/
+
+#ifndef TBB_USE_DEBUG
+ /*
+ There are four cases that are supported:
+ 1. "_DEBUG is undefined" means "no debug";
+ 2. "_DEBUG defined to something that is evaluated to 0" (including "garbage", as per [cpp.cond]) means "no debug";
+ 3. "_DEBUG defined to something that is evaluated to a non-zero value" means "debug";
+ 4. "_DEBUG defined to nothing (empty)" means "debug".
+ */
+ #ifdef _DEBUG
+ // Check if _DEBUG is empty.
+ #define __TBB_IS__DEBUG_EMPTY (__TBB_IS_MACRO_EMPTY(_DEBUG,IGNORED)==__TBB_MACRO_EMPTY)
+ #if __TBB_IS__DEBUG_EMPTY
+ #define TBB_USE_DEBUG 1
+ #else
+ #define TBB_USE_DEBUG _DEBUG
+ #endif // __TBB_IS__DEBUG_EMPTY
+ #else
+ #define TBB_USE_DEBUG 0
+ #endif // _DEBUG
+#endif // TBB_USE_DEBUG
+
+#ifndef TBB_USE_ASSERT
+ #define TBB_USE_ASSERT TBB_USE_DEBUG
+#endif // TBB_USE_ASSERT
+
+#ifndef TBB_USE_PROFILING_TOOLS
+#if TBB_USE_DEBUG
+ #define TBB_USE_PROFILING_TOOLS 2
+#else // TBB_USE_DEBUG
+ #define TBB_USE_PROFILING_TOOLS 0
+#endif // TBB_USE_DEBUG
+#endif // TBB_USE_PROFILING_TOOLS
+
+// Exceptions support cases
+#if !(__EXCEPTIONS || defined(_CPPUNWIND) || __SUNPRO_CC)
+ #if TBB_USE_EXCEPTIONS
+ #error Compilation settings do not support exception handling. Please do not set TBB_USE_EXCEPTIONS macro or set it to 0.
+ #elif !defined(TBB_USE_EXCEPTIONS)
+ #define TBB_USE_EXCEPTIONS 0
+ #endif
+#elif !defined(TBB_USE_EXCEPTIONS)
+ #define TBB_USE_EXCEPTIONS 1
+#endif
+
+/** Preprocessor symbols to determine HW architecture **/
+
+#if _WIN32 || _WIN64
+ #if defined(_M_X64) || defined(__x86_64__) // the latter for MinGW support
+ #define __TBB_x86_64 1
+ #elif defined(_M_IA64)
+ #define __TBB_ipf 1
+ #elif defined(_M_IX86) || defined(__i386__) // the latter for MinGW support
+ #define __TBB_x86_32 1
+ #else
+ #define __TBB_generic_arch 1
+ #endif
+#else /* Assume generic Unix */
+ #if __x86_64__
+ #define __TBB_x86_64 1
+ #elif __ia64__
+ #define __TBB_ipf 1
+ #elif __i386__||__i386 // __i386 is for Sun OS
+ #define __TBB_x86_32 1
+ #else
+ #define __TBB_generic_arch 1
+ #endif
+#endif
+
+/** Windows API or POSIX API **/
+
+#if _WIN32 || _WIN64
+ #define __TBB_USE_WINAPI 1
+#else
+ #define __TBB_USE_POSIX 1
+#endif
+
+/** Internal TBB features & modes **/
+
+/** __TBB_DYNAMIC_LOAD_ENABLED describes the system possibility to load shared libraries at run time **/
+#ifndef __TBB_DYNAMIC_LOAD_ENABLED
+ #define __TBB_DYNAMIC_LOAD_ENABLED 1
+#endif
+
+/** __TBB_WIN8UI_SUPPORT enables support for Windows* Store Apps and limits the ability to load
+ shared libraries at run time to the application container only **/
+#if defined(WINAPI_FAMILY) && WINAPI_FAMILY == WINAPI_FAMILY_APP
+ #define __TBB_WIN8UI_SUPPORT 1
+#else
+ #define __TBB_WIN8UI_SUPPORT 0
+#endif
+
+/** __TBB_WEAK_SYMBOLS_PRESENT denotes that the system supports the weak symbol mechanism **/
+#ifndef __TBB_WEAK_SYMBOLS_PRESENT
+ #define __TBB_WEAK_SYMBOLS_PRESENT ( !_WIN32 && !__APPLE__ && !__sun && (__TBB_GCC_VERSION >= 40000 || __INTEL_COMPILER ) )
+#endif
+
+/** Presence of compiler features **/
+
+#if __clang__ && !__INTEL_COMPILER
+ #define __TBB_USE_OPTIONAL_RTTI __has_feature(cxx_rtti)
+#elif defined(_CPPRTTI)
+ #define __TBB_USE_OPTIONAL_RTTI 1
+#else
+ #define __TBB_USE_OPTIONAL_RTTI (__GXX_RTTI || __RTTI || __INTEL_RTTI__)
+#endif
+
+/** Library features presence macros **/
+
+#define __TBB_CPP14_INTEGER_SEQUENCE_PRESENT (__TBB_LANG >= 201402L)
+#define __TBB_CPP17_INVOKE_RESULT_PRESENT (__TBB_LANG >= 201703L)
+
+// TODO: Remove the condition (__INTEL_COMPILER > 2021) from the __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
+// macro when this feature starts working correctly on this compiler.
+#if __INTEL_COMPILER && (!_MSC_VER || __INTEL_CXX11_MOVE__)
+ #define __TBB_CPP14_VARIABLE_TEMPLATES_PRESENT (__TBB_LANG >= 201402L)
+ #define __TBB_CPP17_DEDUCTION_GUIDES_PRESENT (__INTEL_COMPILER > 2021 && __TBB_LANG >= 201703L)
+ #define __TBB_CPP20_CONCEPTS_PRESENT 0 // TODO: add a mechanism for future addition
+#elif __clang__
+ #define __TBB_CPP14_VARIABLE_TEMPLATES_PRESENT (__has_feature(cxx_variable_templates))
+ #define __TBB_CPP20_CONCEPTS_PRESENT 0 // TODO: add a mechanism for future addition
+ #ifdef __cpp_deduction_guides
+ #define __TBB_CPP17_DEDUCTION_GUIDES_PRESENT (__cpp_deduction_guides >= 201611L)
+ #else
+ #define __TBB_CPP17_DEDUCTION_GUIDES_PRESENT 0
+ #endif
+#elif __GNUC__
+ #define __TBB_CPP14_VARIABLE_TEMPLATES_PRESENT (__TBB_LANG >= 201402L && __TBB_GCC_VERSION >= 50000)
+ #define __TBB_CPP17_DEDUCTION_GUIDES_PRESENT (__cpp_deduction_guides >= 201606L)
+ #define __TBB_CPP20_CONCEPTS_PRESENT (__TBB_LANG >= 201709L && __TBB_GCC_VERSION >= 100201)
+#elif _MSC_VER
+ #define __TBB_CPP14_VARIABLE_TEMPLATES_PRESENT (_MSC_FULL_VER >= 190023918 && (!__INTEL_COMPILER || __INTEL_COMPILER >= 1700))
+ #define __TBB_CPP17_DEDUCTION_GUIDES_PRESENT (_MSC_VER >= 1914 && __TBB_LANG >= 201703L && (!__INTEL_COMPILER || __INTEL_COMPILER > 2021))
+ #define __TBB_CPP20_CONCEPTS_PRESENT (_MSC_VER >= 1923 && __TBB_LANG >= 202002L) // TODO: INTEL_COMPILER?
+#else
+ #define __TBB_CPP14_VARIABLE_TEMPLATES_PRESENT (__TBB_LANG >= 201402L)
+ #define __TBB_CPP17_DEDUCTION_GUIDES_PRESENT (__TBB_LANG >= 201703L)
+ #define __TBB_CPP20_CONCEPTS_PRESENT (__TBB_LANG >= 202002L)
+#endif
+
+// GCC4.8 on RHEL7 does not support std::get_new_handler
+#define __TBB_CPP11_GET_NEW_HANDLER_PRESENT (_MSC_VER >= 1900 || __TBB_GLIBCXX_VERSION >= 40900 && __GXX_EXPERIMENTAL_CXX0X__ || _LIBCPP_VERSION)
+// GCC4.8 on RHEL7 does not support std::is_trivially_copyable
+#define __TBB_CPP11_TYPE_PROPERTIES_PRESENT (_LIBCPP_VERSION || _MSC_VER >= 1700 || (__TBB_GLIBCXX_VERSION >= 50000 && __GXX_EXPERIMENTAL_CXX0X__))
+
+#define __TBB_CPP17_MEMORY_RESOURCE_PRESENT 0
+#define __TBB_CPP17_HW_INTERFERENCE_SIZE_PRESENT (_MSC_VER >= 1911)
+#define __TBB_CPP17_LOGICAL_OPERATIONS_PRESENT (__TBB_LANG >= 201703L)
+#define __TBB_CPP17_ALLOCATOR_IS_ALWAYS_EQUAL_PRESENT (__TBB_LANG >= 201703L)
+#define __TBB_CPP17_IS_SWAPPABLE_PRESENT (__TBB_LANG >= 201703L)
+#define __TBB_CPP20_COMPARISONS_PRESENT __TBB_CPP20_PRESENT
+
+#if (!__TBB_WIN8UI_SUPPORT && !__ANDROID__ && !__APPLE__ && !defined(_musl_))
+#define __TBB_RESUMABLE_TASKS 1
+#else
+#define __TBB_RESUMABLE_TASKS 0
+#endif
+
+/* This macro marks incomplete code or comments describing ideas which are considered for the future.
+ * See also plain comments with TODO and FIXME marks for small improvement opportunities.
+ */
+#define __TBB_TODO 0
+
+/* Check which standard library we use. */
+/* __TBB_SYMBOL is defined only while processing exported symbols list where C++ is not allowed. */
+#if !defined(__TBB_SYMBOL) && !__TBB_CONFIG_PREPROC_ONLY
+ #include <cstddef>
+#endif
+
+/** Target OS is either iOS* or iOS* simulator **/
+#if __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__
+ #define __TBB_IOS 1
+#endif
+
+#if __APPLE__
+ #if __INTEL_COMPILER && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ > 1099 \
+ && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 101000
+ // ICC does not correctly set the macro if -mmacosx-version-min is not specified
+ #define __TBB_MACOS_TARGET_VERSION (100000 + 10*(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ - 1000))
+ #else
+ #define __TBB_MACOS_TARGET_VERSION __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__
+ #endif
+#endif
+
+#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
+ #define __TBB_GCC_WARNING_IGNORED_ATTRIBUTES_PRESENT (__TBB_GCC_VERSION >= 60100)
+#endif
+
+#define __TBB_CPP17_FALLTHROUGH_PRESENT (__TBB_LANG >= 201703L)
+#define __TBB_CPP17_NODISCARD_PRESENT (__TBB_LANG >= 201703L)
+#define __TBB_FALLTHROUGH_PRESENT (__TBB_GCC_VERSION >= 70000 && !__INTEL_COMPILER)
+
+#if __TBB_CPP17_FALLTHROUGH_PRESENT
+ #define __TBB_fallthrough [[fallthrough]]
+#elif __TBB_FALLTHROUGH_PRESENT
+ #define __TBB_fallthrough __attribute__ ((fallthrough))
+#else
+ #define __TBB_fallthrough
+#endif
+
+#if __TBB_CPP17_NODISCARD_PRESENT
+ #define __TBB_nodiscard [[nodiscard]]
+#elif __clang__ || __GNUC__
+ #define __TBB_nodiscard __attribute__((warn_unused_result))
+#else
+ #define __TBB_nodiscard
+#endif
+
+#define __TBB_CPP17_UNCAUGHT_EXCEPTIONS_PRESENT (_MSC_VER >= 1900 || __GLIBCXX__ && __cpp_lib_uncaught_exceptions \
+ || _LIBCPP_VERSION >= 3700 && (!__TBB_MACOS_TARGET_VERSION || __TBB_MACOS_TARGET_VERSION >= 101200) && !__TBB_IOS)
+
+
+#define __TBB_TSX_INTRINSICS_PRESENT ((__RTM__ || (_MSC_VER>=1700 && !__clang__) || __INTEL_COMPILER>=1300) && !__TBB_DEFINE_MIC && !__ANDROID__)
+
+#define __TBB_WAITPKG_INTRINSICS_PRESENT ((__INTEL_COMPILER >= 1900 || __TBB_GCC_VERSION >= 110000 || __TBB_CLANG_VERSION >= 120000) && !__ANDROID__)
+
+/** Internal TBB features & modes **/
+
+/** __TBB_SOURCE_DIRECTLY_INCLUDED is a mode used in whitebox testing when
+ it's necessary to test internal functions not exported from TBB DLLs
+**/
+#if (_WIN32||_WIN64) && (__TBB_SOURCE_DIRECTLY_INCLUDED || TBB_USE_PREVIEW_BINARY)
+ #define __TBB_NO_IMPLICIT_LINKAGE 1
+ #define __TBBMALLOC_NO_IMPLICIT_LINKAGE 1
+#endif
+
+#if (__TBB_BUILD || __TBBMALLOC_BUILD || __TBBMALLOCPROXY_BUILD || __TBBBIND_BUILD) && !defined(__TBB_NO_IMPLICIT_LINKAGE)
+ #define __TBB_NO_IMPLICIT_LINKAGE 1
+#endif
+
+#if _MSC_VER
+ #if !__TBB_NO_IMPLICIT_LINKAGE
+ #ifdef _DEBUG
+ #pragma comment(lib, "tbb12_debug.lib")
+ #else
+ #pragma comment(lib, "tbb12.lib")
+ #endif
+ #endif
+#endif
+
+#ifndef __TBB_SCHEDULER_OBSERVER
+ #define __TBB_SCHEDULER_OBSERVER 1
+#endif /* __TBB_SCHEDULER_OBSERVER */
+
+#ifndef __TBB_FP_CONTEXT
+ #define __TBB_FP_CONTEXT 1
+#endif /* __TBB_FP_CONTEXT */
+
+#define __TBB_RECYCLE_TO_ENQUEUE __TBB_BUILD // keep non-official
+
+#ifndef __TBB_ARENA_OBSERVER
+ #define __TBB_ARENA_OBSERVER __TBB_SCHEDULER_OBSERVER
+#endif /* __TBB_ARENA_OBSERVER */
+
+#ifndef __TBB_ARENA_BINDING
+ #define __TBB_ARENA_BINDING 1
+#endif
+
+#if TBB_PREVIEW_WAITING_FOR_WORKERS || __TBB_BUILD
+ #define __TBB_SUPPORTS_WORKERS_WAITING_IN_TERMINATE 1
+#endif
+
+#if (TBB_PREVIEW_TASK_ARENA_CONSTRAINTS_EXTENSION || __TBB_BUILD) && __TBB_ARENA_BINDING
+ #define __TBB_PREVIEW_TASK_ARENA_CONSTRAINTS_EXTENSION_PRESENT 1
+#endif
+
+#ifndef __TBB_ENQUEUE_ENFORCED_CONCURRENCY
+ #define __TBB_ENQUEUE_ENFORCED_CONCURRENCY 1
+#endif
+
+#if !defined(__TBB_SURVIVE_THREAD_SWITCH) && \
+ (_WIN32 || _WIN64 || __APPLE__ || (__linux__ && !__ANDROID__))
+ #define __TBB_SURVIVE_THREAD_SWITCH 1
+#endif /* __TBB_SURVIVE_THREAD_SWITCH */
+
+#ifndef TBB_PREVIEW_FLOW_GRAPH_FEATURES
+ #define TBB_PREVIEW_FLOW_GRAPH_FEATURES __TBB_CPF_BUILD
+#endif
+
+#ifndef __TBB_DEFAULT_PARTITIONER
+ #define __TBB_DEFAULT_PARTITIONER tbb::auto_partitioner
+#endif
+
+#ifndef __TBB_FLOW_TRACE_CODEPTR
+ #define __TBB_FLOW_TRACE_CODEPTR __TBB_CPF_BUILD
+#endif
+
+// Intel(R) C++ Compiler starts analyzing usages of deprecated content at the template
+// instantiation site, which is too late to suppress the corresponding messages for internal
+// code.
+#if !defined(__INTEL_COMPILER) && (!defined(TBB_SUPPRESS_DEPRECATED_MESSAGES) || (TBB_SUPPRESS_DEPRECATED_MESSAGES == 0))
+ #if (__TBB_LANG >= 201402L)
+ #define __TBB_DEPRECATED [[deprecated]]
+ #define __TBB_DEPRECATED_MSG(msg) [[deprecated(msg)]]
+ #elif _MSC_VER
+ #define __TBB_DEPRECATED __declspec(deprecated)
+ #define __TBB_DEPRECATED_MSG(msg) __declspec(deprecated(msg))
+ #elif (__GNUC__ && __TBB_GCC_VERSION >= 40805) || __clang__
+ #define __TBB_DEPRECATED __attribute__((deprecated))
+ #define __TBB_DEPRECATED_MSG(msg) __attribute__((deprecated(msg)))
+ #endif
+#endif // !defined(TBB_SUPPRESS_DEPRECATED_MESSAGES) || (TBB_SUPPRESS_DEPRECATED_MESSAGES == 0)
+
+#if !defined(__TBB_DEPRECATED)
+ #define __TBB_DEPRECATED
+ #define __TBB_DEPRECATED_MSG(msg)
+#elif !defined(__TBB_SUPPRESS_INTERNAL_DEPRECATED_MESSAGES)
+ // Suppress deprecated messages from self
+ #define __TBB_SUPPRESS_INTERNAL_DEPRECATED_MESSAGES 1
+#endif
+
+#if defined(TBB_SUPPRESS_DEPRECATED_MESSAGES) && (TBB_SUPPRESS_DEPRECATED_MESSAGES == 0)
+ #define __TBB_DEPRECATED_VERBOSE __TBB_DEPRECATED
+ #define __TBB_DEPRECATED_VERBOSE_MSG(msg) __TBB_DEPRECATED_MSG(msg)
+#else
+ #define __TBB_DEPRECATED_VERBOSE
+ #define __TBB_DEPRECATED_VERBOSE_MSG(msg)
+#endif // (TBB_SUPPRESS_DEPRECATED_MESSAGES == 0)
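
For illustration, a standalone sketch of the same deprecation-attribute selection, using a local DEPRECATED_MSG macro rather than the __TBB_DEPRECATED* names so it compiles on its own; the function names are made up for the example:

    #include <iostream>

    #if __cplusplus >= 201402L
        #define DEPRECATED_MSG(msg) [[deprecated(msg)]]
    #elif defined(_MSC_VER)
        #define DEPRECATED_MSG(msg) __declspec(deprecated(msg))
    #elif defined(__GNUC__) || defined(__clang__)
        #define DEPRECATED_MSG(msg) __attribute__((deprecated(msg)))
    #else
        #define DEPRECATED_MSG(msg)
    #endif

    DEPRECATED_MSG("use new_api() instead")
    void old_api() { std::cout << "old_api\n"; }

    void new_api() { std::cout << "new_api\n"; }

    int main() {
        old_api();   // compilers report: 'old_api' is deprecated: use new_api() instead
        new_api();
    }
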
+
+#if (!defined(TBB_SUPPRESS_DEPRECATED_MESSAGES) || (TBB_SUPPRESS_DEPRECATED_MESSAGES == 0)) && !(__TBB_LANG >= 201103L || _MSC_VER >= 1900)
+    #pragma message("TBB Warning: Support for C++98/03 is deprecated. Please use a compiler that supports at least C++11.")
+#endif
+
+#ifdef _VARIADIC_MAX
+ #define __TBB_VARIADIC_MAX _VARIADIC_MAX
+#else
+ #if _MSC_VER == 1700
+ #define __TBB_VARIADIC_MAX 5 // VS11 setting, issue resolved in VS12
+ #elif _MSC_VER == 1600
+ #define __TBB_VARIADIC_MAX 10 // VS10 setting
+ #else
+ #define __TBB_VARIADIC_MAX 15
+ #endif
+#endif
+
+/** Macros of the form __TBB_XXX_BROKEN denote known issues caused by bugs in
+    compilers, or in standard or OS-specific libraries. They should be removed
+    as soon as the corresponding bugs are fixed or the buggy OS/compiler
+    versions drop out of the support list.
+**/
+
+// Some STL containers do not support allocator traits in old GCC versions
+#if __GXX_EXPERIMENTAL_CXX0X__ && __TBB_GLIBCXX_VERSION <= 50301
+ #define TBB_ALLOCATOR_TRAITS_BROKEN 1
+#endif
+
+// The GCC 4.8 C++ standard library implements std::this_thread::yield as a no-op.
+#if __TBB_GLIBCXX_VERSION >= 40800 && __TBB_GLIBCXX_VERSION < 40900
+ #define __TBB_GLIBCXX_THIS_THREAD_YIELD_BROKEN 1
+#endif
+
+/** End of __TBB_XXX_BROKEN macro section **/
+
+#if defined(_MSC_VER) && _MSC_VER>=1500 && !defined(__INTEL_COMPILER)
+ // A macro to suppress erroneous or benign "unreachable code" MSVC warning (4702)
+ #define __TBB_MSVC_UNREACHABLE_CODE_IGNORED 1
+#endif
+
+// Many OS versions (Android 4.0.[0-3], for example) need a workaround for dlopen to avoid a non-recursive loader lock hang.
+// The workaround is applied to all compile targets ($APP_PLATFORM) below Android 4.4 (android-19).
+#if __ANDROID__
+ #include <android/api-level.h>
+#endif
+
+#define __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING (TBB_PREVIEW_FLOW_GRAPH_FEATURES)
+
+#ifndef __TBB_PREVIEW_CRITICAL_TASKS
+#define __TBB_PREVIEW_CRITICAL_TASKS 1
+#endif
+
+#ifndef __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+#define __TBB_PREVIEW_FLOW_GRAPH_NODE_SET (TBB_PREVIEW_FLOW_GRAPH_FEATURES)
+#endif
+
+
+#if !defined(__APPLE__) || !defined(__MAC_OS_X_VERSION_MIN_REQUIRED) || __MAC_OS_X_VERSION_MIN_REQUIRED > 101500
+ #define __TBB_ALIGNAS_AVAILABLE 1
+#else
+ #define __TBB_ALIGNAS_AVAILABLE 0
+#endif
+
+#endif // __TBB_detail__config_H
diff --git a/contrib/libs/tbb/include/oneapi/tbb/detail/_containers_helpers.h b/contrib/libs/tbb/include/oneapi/tbb/detail/_containers_helpers.h
new file mode 100644
index 0000000000..4dca07fa10
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/detail/_containers_helpers.h
@@ -0,0 +1,67 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_detail__containers_helpers_H
+#define __TBB_detail__containers_helpers_H
+
+#include "_template_helpers.h"
+#include "_allocator_traits.h"
+#include <type_traits>
+#include <memory>
+#include <functional>
+
+namespace tbb {
+namespace detail {
+inline namespace d0 {
+
+template <typename Compare, typename = void>
+struct comp_is_transparent : std::false_type {};
+
+template <typename Compare>
+struct comp_is_transparent<Compare, tbb::detail::void_t<typename Compare::is_transparent>> : std::true_type {};
+
+template <typename Key, typename Hasher, typename KeyEqual, typename = void >
+struct has_transparent_key_equal : std::false_type { using type = KeyEqual; };
+
+template <typename Key, typename Hasher, typename KeyEqual>
+struct has_transparent_key_equal<Key, Hasher, KeyEqual, tbb::detail::void_t<typename Hasher::transparent_key_equal>> : std::true_type {
+ using type = typename Hasher::transparent_key_equal;
+ static_assert(comp_is_transparent<type>::value, "Hash::transparent_key_equal::is_transparent is not valid or does not denote a type.");
+ static_assert((std::is_same<KeyEqual, std::equal_to<Key>>::value ||
+ std::is_same<typename Hasher::transparent_key_equal, KeyEqual>::value), "KeyEqual is a different type than equal_to<Key> or Hash::transparent_key_equal.");
+ };
+
+struct is_iterator_impl {
+template <typename T>
+using iter_traits_category = typename std::iterator_traits<T>::iterator_category;
+
+template <typename T>
+using input_iter_category = typename std::enable_if<std::is_base_of<std::input_iterator_tag, iter_traits_category<T>>::value>::type;
+}; // struct is_iterator_impl
+
+template <typename T>
+using is_input_iterator = supports<T, is_iterator_impl::iter_traits_category, is_iterator_impl::input_iter_category>;
+
+#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
+template <typename T>
+inline constexpr bool is_input_iterator_v = is_input_iterator<T>::value;
+#endif
+
+} // inline namespace d0
+} // namespace detail
+} // namespace tbb
+
+#endif // __TBB_detail__containers_helpers_H
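
The two traits above rely on the void_t detection idiom. A standalone re-creation of that idiom with std::void_t (C++17), using local_* names that are illustrative only, shows the expected results at compile time:

    #include <functional>
    #include <iterator>
    #include <type_traits>
    #include <vector>

    // Same detection idiom as comp_is_transparent / is_input_iterator above.
    template <typename Compare, typename = void>
    struct local_is_transparent : std::false_type {};

    template <typename Compare>
    struct local_is_transparent<Compare, std::void_t<typename Compare::is_transparent>> : std::true_type {};

    template <typename T, typename = void>
    struct local_is_input_iterator : std::false_type {};

    template <typename T>
    struct local_is_input_iterator<T,
        std::enable_if_t<std::is_base_of<std::input_iterator_tag,
            typename std::iterator_traits<T>::iterator_category>::value>> : std::true_type {};

    static_assert(local_is_transparent<std::less<>>::value, "heterogeneous comparator is transparent");
    static_assert(!local_is_transparent<std::less<int>>::value, "homogeneous comparator is not");
    static_assert(local_is_input_iterator<std::vector<int>::iterator>::value, "iterator detected");
    static_assert(!local_is_input_iterator<int>::value, "int is not an iterator");

    int main() {}
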
diff --git a/contrib/libs/tbb/include/oneapi/tbb/detail/_exception.h b/contrib/libs/tbb/include/oneapi/tbb/detail/_exception.h
new file mode 100644
index 0000000000..9764209fa8
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/detail/_exception.h
@@ -0,0 +1,88 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB__exception_H
+#define __TBB__exception_H
+
+#include "_config.h"
+
+#include <new> // std::bad_alloc
+#include <exception> // std::exception
+#if __TBB_SUPPORTS_WORKERS_WAITING_IN_TERMINATE
+#include <stdexcept> // std::runtime_error
+#endif
+
+namespace tbb {
+namespace detail {
+inline namespace d0 {
+enum class exception_id {
+ bad_alloc = 1,
+ bad_last_alloc,
+ user_abort,
+ nonpositive_step,
+ out_of_range,
+ reservation_length_error,
+ missing_wait,
+ invalid_load_factor,
+ invalid_key,
+ bad_tagged_msg_cast,
+ unsafe_wait,
+ last_entry
+};
+} // namespace d0
+
+namespace r1 {
+//! Exception for concurrent containers
+class bad_last_alloc : public std::bad_alloc {
+public:
+ const char* __TBB_EXPORTED_METHOD what() const noexcept(true) override;
+};
+
+//! Exception for user-initiated abort
+class user_abort : public std::exception {
+public:
+ const char* __TBB_EXPORTED_METHOD what() const noexcept(true) override;
+};
+
+//! Exception for missing wait on structured_task_group
+class missing_wait : public std::exception {
+public:
+ const char* __TBB_EXPORTED_METHOD what() const noexcept(true) override;
+};
+
+#if __TBB_SUPPORTS_WORKERS_WAITING_IN_TERMINATE
+//! Exception for impossible finalization of task_scheduler_handle
+class unsafe_wait : public std::runtime_error {
+public:
+ unsafe_wait(const char* msg) : std::runtime_error(msg) {}
+};
+#endif // __TBB_SUPPORTS_WORKERS_WAITING_IN_TERMINATE
+
+//! Gathers all throw operators in one place.
+/** Its purpose is to minimize code bloat that can be caused by throw operators
+ scattered in multiple places, especially in templates. **/
+void __TBB_EXPORTED_FUNC throw_exception ( exception_id );
+} // namespace r1
+
+inline namespace d0 {
+using r1::throw_exception;
+} // namespace d0
+
+} // namespace detail
+} // namespace tbb
+
+#endif // __TBB__exception_H
+
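A hedged sketch of how code built on this header funnels errors through throw_exception; it assumes oneTBB is installed and linked, and the checked_access helper is hypothetical. The tbb::detail names are internal, so this is for illustration only:

    #include <oneapi/tbb/detail/_exception.h>
    #include <cstddef>
    #include <exception>
    #include <iostream>

    // Hypothetical bounds check in the style of the containers that use this header.
    void checked_access(std::size_t index, std::size_t size) {
        if (index >= size)
            tbb::detail::throw_exception(tbb::detail::exception_id::out_of_range);
    }

    int main() {
        try {
            checked_access(5, 3);
        } catch (const std::exception& e) {
            // The library translates exception_id::out_of_range into a concrete
            // exception object (typically a standard out-of-range error).
            std::cout << "caught: " << e.what() << '\n';
        }
    }
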
diff --git a/contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_body_impl.h b/contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_body_impl.h
new file mode 100644
index 0000000000..34ba1efcaf
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_body_impl.h
@@ -0,0 +1,371 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB__flow_graph_body_impl_H
+#define __TBB__flow_graph_body_impl_H
+
+#ifndef __TBB_flow_graph_H
+#error Do not #include this internal file directly; use public TBB headers instead.
+#endif
+
+// included in namespace tbb::detail::d1 (in flow_graph.h)
+
+typedef std::uint64_t tag_value;
+
+
+// TODO revamp: find out if there is already helper for has_policy.
+template<typename ... Policies> struct Policy {};
+
+template<typename ... Policies> struct has_policy;
+
+template<typename ExpectedPolicy, typename FirstPolicy, typename ...Policies>
+struct has_policy<ExpectedPolicy, FirstPolicy, Policies...> :
+ std::integral_constant<bool, has_policy<ExpectedPolicy, FirstPolicy>::value ||
+ has_policy<ExpectedPolicy, Policies...>::value> {};
+
+template<typename ExpectedPolicy, typename SinglePolicy>
+struct has_policy<ExpectedPolicy, SinglePolicy> :
+ std::integral_constant<bool, std::is_same<ExpectedPolicy, SinglePolicy>::value> {};
+
+template<typename ExpectedPolicy, typename ...Policies>
+struct has_policy<ExpectedPolicy, Policy<Policies...> > : has_policy<ExpectedPolicy, Policies...> {};
+
+namespace graph_policy_namespace {
+
+ struct rejecting { };
+ struct reserving { };
+ struct queueing { };
+ struct lightweight { };
+
+    // K == type of the field used for key-matching. Each tag-matching port will be provided
+    // a functor that, given an object accepted by the port, returns the
+    // field of type K used for matching.
+ template<typename K, typename KHash=tbb_hash_compare<typename std::decay<K>::type > >
+ struct key_matching {
+ typedef K key_type;
+ typedef typename std::decay<K>::type base_key_type;
+ typedef KHash hash_compare_type;
+ };
+
+    // the new specifier for the old tag_matching join
+ typedef key_matching<tag_value> tag_matching;
+
+ // Aliases for Policy combinations
+ typedef Policy<queueing, lightweight> queueing_lightweight;
+ typedef Policy<rejecting, lightweight> rejecting_lightweight;
+
+} // namespace graph_policy_namespace
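
A hedged compile-time sketch of how has_policy resolves against the aliases above, assuming oneTBB is installed; the tbb::detail::d1 qualification reaches internal names and is used here only to exercise the trait:

    #include <oneapi/tbb/flow_graph.h>

    namespace d1 = tbb::detail::d1;
    namespace gp = d1::graph_policy_namespace;

    // lightweight is found inside the combined Policy<queueing, lightweight> alias...
    static_assert(d1::has_policy<gp::lightweight, gp::queueing_lightweight>::value,
                  "lightweight is part of queueing_lightweight");
    // ...but a single unrelated policy does not match.
    static_assert(!d1::has_policy<gp::lightweight, gp::queueing>::value,
                  "queueing alone does not imply lightweight");

    int main() {}
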
+
+// -------------- function_body containers ----------------------
+
+//! A functor that takes no input and generates a value of type Output
+template< typename Output >
+class input_body : no_assign {
+public:
+ virtual ~input_body() {}
+ virtual Output operator()(flow_control& fc) = 0;
+ virtual input_body* clone() = 0;
+};
+
+//! The leaf for input_body
+template< typename Output, typename Body>
+class input_body_leaf : public input_body<Output> {
+public:
+ input_body_leaf( const Body &_body ) : body(_body) { }
+ Output operator()(flow_control& fc) override { return body(fc); }
+ input_body_leaf* clone() override {
+ return new input_body_leaf< Output, Body >(body);
+ }
+ Body get_body() { return body; }
+private:
+ Body body;
+};
+
+//! A functor that takes an Input and generates an Output
+template< typename Input, typename Output >
+class function_body : no_assign {
+public:
+ virtual ~function_body() {}
+ virtual Output operator()(const Input &input) = 0;
+ virtual function_body* clone() = 0;
+};
+
+//! the leaf for function_body
+template <typename Input, typename Output, typename B>
+class function_body_leaf : public function_body< Input, Output > {
+public:
+ function_body_leaf( const B &_body ) : body(_body) { }
+ Output operator()(const Input &i) override { return body(i); }
+ B get_body() { return body; }
+ function_body_leaf* clone() override {
+ return new function_body_leaf< Input, Output, B >(body);
+ }
+private:
+ B body;
+};
+
+//! the leaf for function_body specialized for Input and output of continue_msg
+template <typename B>
+class function_body_leaf< continue_msg, continue_msg, B> : public function_body< continue_msg, continue_msg > {
+public:
+ function_body_leaf( const B &_body ) : body(_body) { }
+ continue_msg operator()( const continue_msg &i ) override {
+ body(i);
+ return i;
+ }
+ B get_body() { return body; }
+ function_body_leaf* clone() override {
+ return new function_body_leaf< continue_msg, continue_msg, B >(body);
+ }
+private:
+ B body;
+};
+
+//! the leaf for function_body specialized for Output of continue_msg
+template <typename Input, typename B>
+class function_body_leaf< Input, continue_msg, B> : public function_body< Input, continue_msg > {
+public:
+ function_body_leaf( const B &_body ) : body(_body) { }
+ continue_msg operator()(const Input &i) override {
+ body(i);
+ return continue_msg();
+ }
+ B get_body() { return body; }
+ function_body_leaf* clone() override {
+ return new function_body_leaf< Input, continue_msg, B >(body);
+ }
+private:
+ B body;
+};
+
+//! the leaf for function_body specialized for Input of continue_msg
+template <typename Output, typename B>
+class function_body_leaf< continue_msg, Output, B > : public function_body< continue_msg, Output > {
+public:
+ function_body_leaf( const B &_body ) : body(_body) { }
+ Output operator()(const continue_msg &i) override {
+ return body(i);
+ }
+ B get_body() { return body; }
+ function_body_leaf* clone() override {
+ return new function_body_leaf< continue_msg, Output, B >(body);
+ }
+private:
+ B body;
+};
+
+//! function_body that takes an Input and a set of output ports
+template<typename Input, typename OutputSet>
+class multifunction_body : no_assign {
+public:
+ virtual ~multifunction_body () {}
+ virtual void operator()(const Input &/* input*/, OutputSet &/*oset*/) = 0;
+ virtual multifunction_body* clone() = 0;
+ virtual void* get_body_ptr() = 0;
+};
+
+//! leaf for multifunction. OutputSet can be a std::tuple or a vector.
+template<typename Input, typename OutputSet, typename B >
+class multifunction_body_leaf : public multifunction_body<Input, OutputSet> {
+public:
+ multifunction_body_leaf(const B &_body) : body(_body) { }
+ void operator()(const Input &input, OutputSet &oset) override {
+ body(input, oset); // body may explicitly put() to one or more of oset.
+ }
+ void* get_body_ptr() override { return &body; }
+ multifunction_body_leaf* clone() override {
+ return new multifunction_body_leaf<Input, OutputSet,B>(body);
+ }
+
+private:
+ B body;
+};
+
+// ------ function bodies for hash_buffers and key-matching joins.
+
+template<typename Input, typename Output>
+class type_to_key_function_body : no_assign {
+ public:
+ virtual ~type_to_key_function_body() {}
+ virtual Output operator()(const Input &input) = 0; // returns an Output
+ virtual type_to_key_function_body* clone() = 0;
+};
+
+// specialization for ref output
+template<typename Input, typename Output>
+class type_to_key_function_body<Input,Output&> : no_assign {
+ public:
+ virtual ~type_to_key_function_body() {}
+ virtual const Output & operator()(const Input &input) = 0; // returns a const Output&
+ virtual type_to_key_function_body* clone() = 0;
+};
+
+template <typename Input, typename Output, typename B>
+class type_to_key_function_body_leaf : public type_to_key_function_body<Input, Output> {
+public:
+ type_to_key_function_body_leaf( const B &_body ) : body(_body) { }
+ Output operator()(const Input &i) override { return body(i); }
+ type_to_key_function_body_leaf* clone() override {
+ return new type_to_key_function_body_leaf< Input, Output, B>(body);
+ }
+private:
+ B body;
+};
+
+template <typename Input, typename Output, typename B>
+class type_to_key_function_body_leaf<Input,Output&,B> : public type_to_key_function_body< Input, Output&> {
+public:
+ type_to_key_function_body_leaf( const B &_body ) : body(_body) { }
+ const Output& operator()(const Input &i) override {
+ return body(i);
+ }
+ type_to_key_function_body_leaf* clone() override {
+ return new type_to_key_function_body_leaf< Input, Output&, B>(body);
+ }
+private:
+ B body;
+};
+
+// --------------------------- end of function_body containers ------------------------
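
These erased body wrappers are what back the public flow-graph node types: a node stores the user callable behind a function_body pointer and clones it on copy or reset. A small public-API usage sketch, assuming oneTBB is installed:

    #include <oneapi/tbb/flow_graph.h>
    #include <iostream>

    int main() {
        tbb::flow::graph g;
        // The lambda bodies below are held through function_body_leaf-style wrappers inside the nodes.
        tbb::flow::function_node<int, int> square(g, tbb::flow::unlimited,
            [](int v) { return v * v; });
        tbb::flow::function_node<int, tbb::flow::continue_msg> print(g, tbb::flow::serial,
            [](int v) { std::cout << v << '\n'; return tbb::flow::continue_msg(); });
        tbb::flow::make_edge(square, print);
        square.try_put(7);
        g.wait_for_all();
    }
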
+
+// --------------------------- node task bodies ---------------------------------------
+
+//! A task that calls a node's forward_task function
+template< typename NodeType >
+class forward_task_bypass : public graph_task {
+ NodeType &my_node;
+public:
+ forward_task_bypass( graph& g, small_object_allocator& allocator, NodeType &n
+ , node_priority_t node_priority = no_priority
+ ) : graph_task(g, allocator, node_priority),
+ my_node(n) {}
+
+ task* execute(execution_data& ed) override {
+ graph_task* next_task = my_node.forward_task();
+ if (SUCCESSFULLY_ENQUEUED == next_task)
+ next_task = nullptr;
+ else if (next_task)
+ next_task = prioritize_task(my_node.graph_reference(), *next_task);
+ finalize(ed);
+ return next_task;
+ }
+};
+
+//! A task that calls a node's apply_body_bypass function, passing in an input of type Input
+// Returns the task* unless it is SUCCESSFULLY_ENQUEUED, in which case it returns nullptr
+template< typename NodeType, typename Input >
+class apply_body_task_bypass : public graph_task {
+ NodeType &my_node;
+ Input my_input;
+public:
+
+ apply_body_task_bypass( graph& g, small_object_allocator& allocator, NodeType &n, const Input &i
+ , node_priority_t node_priority = no_priority
+ ) : graph_task(g, allocator, node_priority),
+ my_node(n), my_input(i) {}
+
+ task* execute(execution_data& ed) override {
+ graph_task* next_task = my_node.apply_body_bypass( my_input );
+ if (SUCCESSFULLY_ENQUEUED == next_task)
+ next_task = nullptr;
+ else if (next_task)
+ next_task = prioritize_task(my_node.graph_reference(), *next_task);
+ finalize(ed);
+ return next_task;
+
+ }
+};
+
+//! A task that calls a node's apply_body_bypass function with no input
+template< typename NodeType >
+class input_node_task_bypass : public graph_task {
+ NodeType &my_node;
+public:
+ input_node_task_bypass( graph& g, small_object_allocator& allocator, NodeType &n )
+ : graph_task(g, allocator), my_node(n) {}
+
+ task* execute(execution_data& ed) override {
+ graph_task* next_task = my_node.apply_body_bypass( );
+ if (SUCCESSFULLY_ENQUEUED == next_task)
+ next_task = nullptr;
+ else if (next_task)
+ next_task = prioritize_task(my_node.graph_reference(), *next_task);
+ finalize(ed);
+ return next_task;
+ }
+
+};
+
+// ------------------------ end of node task bodies -----------------------------------
+
+template<typename T, typename DecrementType, typename DummyType = void>
+class threshold_regulator;
+
+template<typename T, typename DecrementType>
+class threshold_regulator<T, DecrementType,
+ typename std::enable_if<std::is_integral<DecrementType>::value>::type>
+ : public receiver<DecrementType>, no_copy
+{
+ T* my_node;
+protected:
+
+ graph_task* try_put_task( const DecrementType& value ) override {
+ graph_task* result = my_node->decrement_counter( value );
+ if( !result )
+ result = SUCCESSFULLY_ENQUEUED;
+ return result;
+ }
+
+ graph& graph_reference() const override {
+ return my_node->my_graph;
+ }
+
+ template<typename U, typename V> friend class limiter_node;
+ void reset_receiver( reset_flags ) {}
+
+public:
+ threshold_regulator(T* owner) : my_node(owner) {
+ // Do not work with the passed pointer here as it may not be fully initialized yet
+ }
+};
+
+template<typename T>
+class threshold_regulator<T, continue_msg, void> : public continue_receiver, no_copy {
+
+ T *my_node;
+
+ graph_task* execute() override {
+ return my_node->decrement_counter( 1 );
+ }
+
+protected:
+
+ graph& graph_reference() const override {
+ return my_node->my_graph;
+ }
+
+public:
+
+ typedef continue_msg input_type;
+ typedef continue_msg output_type;
+ threshold_regulator(T* owner)
+ : continue_receiver( /*number_of_predecessors=*/0, no_priority ), my_node(owner)
+ {
+ // Do not work with the passed pointer here as it may not be fully initialized yet
+ }
+};
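
threshold_regulator is the receiver behind limiter_node's decrementer port: each message it accepts decrements the node's counter and may re-open the limiter. A public-API sketch of that wiring, assuming the oneTBB 2021 interface (limiter_node::decrementer()):

    #include <oneapi/tbb/flow_graph.h>
    #include <iostream>

    int main() {
        using namespace tbb::flow;
        graph g;
        buffer_node<int> source(g);     // holds items the limiter is not yet ready to accept
        limiter_node<int> limit(g, 2);  // at most 2 items in flight
        function_node<int, continue_msg> work(g, unlimited, [](int v) {
            std::cout << "processing " << v << '\n';
            return continue_msg();
        });
        make_edge(source, limit);
        make_edge(limit, work);
        // Completion messages re-open the limiter through its decrementer port,
        // which is a threshold_regulator like the one defined above.
        make_edge(work, limit.decrementer());
        for (int i = 0; i < 10; ++i) source.try_put(i);
        g.wait_for_all();
    }
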
+
+#endif // __TBB__flow_graph_body_impl_H
diff --git a/contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_cache_impl.h b/contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_cache_impl.h
new file mode 100644
index 0000000000..ac5564598b
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_cache_impl.h
@@ -0,0 +1,435 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB__flow_graph_cache_impl_H
+#define __TBB__flow_graph_cache_impl_H
+
+#ifndef __TBB_flow_graph_H
+#error Do not #include this internal file directly; use public TBB headers instead.
+#endif
+
+// included in namespace tbb::detail::d1 (in flow_graph.h)
+
+//! A node_cache maintains a std::queue of elements of type T. Each operation is protected by a lock.
+template< typename T, typename M=spin_mutex >
+class node_cache {
+ public:
+
+ typedef size_t size_type;
+
+ bool empty() {
+ typename mutex_type::scoped_lock lock( my_mutex );
+ return internal_empty();
+ }
+
+ void add( T &n ) {
+ typename mutex_type::scoped_lock lock( my_mutex );
+ internal_push(n);
+ }
+
+ void remove( T &n ) {
+ typename mutex_type::scoped_lock lock( my_mutex );
+ for ( size_t i = internal_size(); i != 0; --i ) {
+ T &s = internal_pop();
+ if ( &s == &n )
+ break; // only remove one predecessor per request
+ internal_push(s);
+ }
+ }
+
+ void clear() {
+ while( !my_q.empty()) (void)my_q.pop();
+ }
+
+protected:
+
+ typedef M mutex_type;
+ mutex_type my_mutex;
+ std::queue< T * > my_q;
+
+ // Assumes lock is held
+ inline bool internal_empty( ) {
+ return my_q.empty();
+ }
+
+ // Assumes lock is held
+ inline size_type internal_size( ) {
+ return my_q.size();
+ }
+
+ // Assumes lock is held
+ inline void internal_push( T &n ) {
+ my_q.push(&n);
+ }
+
+ // Assumes lock is held
+ inline T &internal_pop() {
+ T *v = my_q.front();
+ my_q.pop();
+ return *v;
+ }
+
+};
+
+//! A cache of predecessors that only supports try_get
+template< typename T, typename M=spin_mutex >
+class predecessor_cache : public node_cache< sender<T>, M > {
+public:
+ typedef M mutex_type;
+ typedef T output_type;
+ typedef sender<output_type> predecessor_type;
+ typedef receiver<output_type> successor_type;
+
+ predecessor_cache( successor_type* owner ) : my_owner( owner ) {
+ __TBB_ASSERT( my_owner, "predecessor_cache should have an owner." );
+ // Do not work with the passed pointer here as it may not be fully initialized yet
+ }
+
+ bool get_item( output_type& v ) {
+
+ bool msg = false;
+
+ do {
+ predecessor_type *src;
+ {
+ typename mutex_type::scoped_lock lock(this->my_mutex);
+ if ( this->internal_empty() ) {
+ break;
+ }
+ src = &this->internal_pop();
+ }
+
+ // Try to get from this sender
+ msg = src->try_get( v );
+
+ if (msg == false) {
+ // Relinquish ownership of the edge
+ register_successor(*src, *my_owner);
+ } else {
+ // Retain ownership of the edge
+ this->add(*src);
+ }
+ } while ( msg == false );
+ return msg;
+ }
+
+ // If we are removing arcs (rf_clear_edges), call clear() rather than reset().
+ void reset() {
+ for(;;) {
+ predecessor_type *src;
+ {
+ if (this->internal_empty()) break;
+ src = &this->internal_pop();
+ }
+ register_successor(*src, *my_owner);
+ }
+ }
+
+protected:
+ successor_type* my_owner;
+};
+
+//! A cache of predecessors that supports requests and reservations
+template< typename T, typename M=spin_mutex >
+class reservable_predecessor_cache : public predecessor_cache< T, M > {
+public:
+ typedef M mutex_type;
+ typedef T output_type;
+ typedef sender<T> predecessor_type;
+ typedef receiver<T> successor_type;
+
+ reservable_predecessor_cache( successor_type* owner )
+ : predecessor_cache<T,M>(owner), reserved_src(NULL)
+ {
+ // Do not work with the passed pointer here as it may not be fully initialized yet
+ }
+
+ bool
+ try_reserve( output_type &v ) {
+ bool msg = false;
+
+ do {
+ {
+ typename mutex_type::scoped_lock lock(this->my_mutex);
+ if ( reserved_src || this->internal_empty() )
+ return false;
+
+ reserved_src = &this->internal_pop();
+ }
+
+ // Try to get from this sender
+ msg = reserved_src->try_reserve( v );
+
+ if (msg == false) {
+ typename mutex_type::scoped_lock lock(this->my_mutex);
+ // Relinquish ownership of the edge
+ register_successor( *reserved_src, *this->my_owner );
+ reserved_src = NULL;
+ } else {
+ // Retain ownership of the edge
+ this->add( *reserved_src );
+ }
+ } while ( msg == false );
+
+ return msg;
+ }
+
+ bool
+ try_release( ) {
+ reserved_src->try_release( );
+ reserved_src = NULL;
+ return true;
+ }
+
+ bool
+ try_consume( ) {
+ reserved_src->try_consume( );
+ reserved_src = NULL;
+ return true;
+ }
+
+ void reset( ) {
+ reserved_src = NULL;
+ predecessor_cache<T,M>::reset( );
+ }
+
+ void clear() {
+ reserved_src = NULL;
+ predecessor_cache<T,M>::clear();
+ }
+
+private:
+ predecessor_type *reserved_src;
+};
+
+
+//! An abstract cache of successors
+template<typename T, typename M=spin_rw_mutex >
+class successor_cache : no_copy {
+protected:
+
+ typedef M mutex_type;
+ mutex_type my_mutex;
+
+ typedef receiver<T> successor_type;
+ typedef receiver<T>* pointer_type;
+ typedef sender<T> owner_type;
+ // TODO revamp: introduce heapified collection of successors for strict priorities
+ typedef std::list< pointer_type > successors_type;
+ successors_type my_successors;
+
+ owner_type* my_owner;
+
+public:
+ successor_cache( owner_type* owner ) : my_owner(owner) {
+ // Do not work with the passed pointer here as it may not be fully initialized yet
+ }
+
+ virtual ~successor_cache() {}
+
+ void register_successor( successor_type& r ) {
+ typename mutex_type::scoped_lock l(my_mutex, true);
+ if( r.priority() != no_priority )
+ my_successors.push_front( &r );
+ else
+ my_successors.push_back( &r );
+ }
+
+ void remove_successor( successor_type& r ) {
+ typename mutex_type::scoped_lock l(my_mutex, true);
+ for ( typename successors_type::iterator i = my_successors.begin();
+ i != my_successors.end(); ++i ) {
+ if ( *i == & r ) {
+ my_successors.erase(i);
+ break;
+ }
+ }
+ }
+
+ bool empty() {
+ typename mutex_type::scoped_lock l(my_mutex, false);
+ return my_successors.empty();
+ }
+
+ void clear() {
+ my_successors.clear();
+ }
+
+ virtual graph_task* try_put_task( const T& t ) = 0;
+}; // successor_cache<T>
+
+//! An abstract cache of successors, specialized to continue_msg
+template<typename M>
+class successor_cache< continue_msg, M > : no_copy {
+protected:
+
+ typedef M mutex_type;
+ mutex_type my_mutex;
+
+ typedef receiver<continue_msg> successor_type;
+ typedef receiver<continue_msg>* pointer_type;
+ typedef sender<continue_msg> owner_type;
+ typedef std::list< pointer_type > successors_type;
+ successors_type my_successors;
+ owner_type* my_owner;
+
+public:
+ successor_cache( sender<continue_msg>* owner ) : my_owner(owner) {
+ // Do not work with the passed pointer here as it may not be fully initialized yet
+ }
+
+ virtual ~successor_cache() {}
+
+ void register_successor( successor_type& r ) {
+ typename mutex_type::scoped_lock l(my_mutex, true);
+ if( r.priority() != no_priority )
+ my_successors.push_front( &r );
+ else
+ my_successors.push_back( &r );
+ __TBB_ASSERT( my_owner, "Cache of successors must have an owner." );
+ if ( r.is_continue_receiver() ) {
+ r.register_predecessor( *my_owner );
+ }
+ }
+
+ void remove_successor( successor_type& r ) {
+ typename mutex_type::scoped_lock l(my_mutex, true);
+ for ( successors_type::iterator i = my_successors.begin(); i != my_successors.end(); ++i ) {
+ if ( *i == &r ) {
+ __TBB_ASSERT(my_owner, "Cache of successors must have an owner.");
+ // TODO: check if we need to test for continue_receiver before removing from r.
+ r.remove_predecessor( *my_owner );
+ my_successors.erase(i);
+ break;
+ }
+ }
+ }
+
+ bool empty() {
+ typename mutex_type::scoped_lock l(my_mutex, false);
+ return my_successors.empty();
+ }
+
+ void clear() {
+ my_successors.clear();
+ }
+
+ virtual graph_task* try_put_task( const continue_msg& t ) = 0;
+}; // successor_cache< continue_msg >
+
+//! A cache of successors that are broadcast to
+template<typename T, typename M=spin_rw_mutex>
+class broadcast_cache : public successor_cache<T, M> {
+ typedef successor_cache<T, M> base_type;
+ typedef M mutex_type;
+ typedef typename successor_cache<T,M>::successors_type successors_type;
+
+public:
+
+ broadcast_cache( typename base_type::owner_type* owner ): base_type(owner) {
+ // Do not work with the passed pointer here as it may not be fully initialized yet
+ }
+
+    // Calls try_put_task on each successor; returns the last task received (if any)
+ graph_task* try_put_task( const T &t ) override {
+ graph_task * last_task = nullptr;
+ typename mutex_type::scoped_lock l(this->my_mutex, /*write=*/true);
+ typename successors_type::iterator i = this->my_successors.begin();
+ while ( i != this->my_successors.end() ) {
+ graph_task *new_task = (*i)->try_put_task(t);
+ // workaround for icc bug
+ graph& graph_ref = (*i)->graph_reference();
+ last_task = combine_tasks(graph_ref, last_task, new_task); // enqueue if necessary
+ if(new_task) {
+ ++i;
+ }
+ else { // failed
+ if ( (*i)->register_predecessor(*this->my_owner) ) {
+ i = this->my_successors.erase(i);
+ } else {
+ ++i;
+ }
+ }
+ }
+ return last_task;
+ }
+
+    // Calls try_put_task on each successor, collecting received tasks into the tasks list; returns whether any put succeeded
+ bool gather_successful_try_puts( const T &t, graph_task_list& tasks ) {
+ bool is_at_least_one_put_successful = false;
+ typename mutex_type::scoped_lock l(this->my_mutex, /*write=*/true);
+ typename successors_type::iterator i = this->my_successors.begin();
+ while ( i != this->my_successors.end() ) {
+ graph_task * new_task = (*i)->try_put_task(t);
+ if(new_task) {
+ ++i;
+ if(new_task != SUCCESSFULLY_ENQUEUED) {
+ tasks.push_back(*new_task);
+ }
+ is_at_least_one_put_successful = true;
+ }
+ else { // failed
+ if ( (*i)->register_predecessor(*this->my_owner) ) {
+ i = this->my_successors.erase(i);
+ } else {
+ ++i;
+ }
+ }
+ }
+ return is_at_least_one_put_successful;
+ }
+};
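
broadcast_cache is the successor list used by nodes that deliver each message to every registered successor. A public-API sketch with broadcast_node, which is backed by this cache, assuming oneTBB is installed:

    #include <oneapi/tbb/flow_graph.h>
    #include <iostream>

    int main() {
        using namespace tbb::flow;
        graph g;
        broadcast_node<int> fanout(g);   // delivers each message to all successors
        function_node<int, int> doubler(g, unlimited,
            [](int v) { std::cout << "2x: " << 2 * v << '\n'; return v; });
        function_node<int, int> squarer(g, unlimited,
            [](int v) { std::cout << "sq: " << v * v << '\n'; return v; });
        make_edge(fanout, doubler);
        make_edge(fanout, squarer);
        fanout.try_put(3);               // both successors receive the value
        g.wait_for_all();
    }
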
+
+//! A cache of successors that are put in a round-robin fashion
+template<typename T, typename M=spin_rw_mutex >
+class round_robin_cache : public successor_cache<T, M> {
+ typedef successor_cache<T, M> base_type;
+ typedef size_t size_type;
+ typedef M mutex_type;
+ typedef typename successor_cache<T,M>::successors_type successors_type;
+
+public:
+
+ round_robin_cache( typename base_type::owner_type* owner ): base_type(owner) {
+ // Do not work with the passed pointer here as it may not be fully initialized yet
+ }
+
+ size_type size() {
+ typename mutex_type::scoped_lock l(this->my_mutex, false);
+ return this->my_successors.size();
+ }
+
+ graph_task* try_put_task( const T &t ) override {
+ typename mutex_type::scoped_lock l(this->my_mutex, /*write=*/true);
+ typename successors_type::iterator i = this->my_successors.begin();
+ while ( i != this->my_successors.end() ) {
+ graph_task* new_task = (*i)->try_put_task(t);
+ if ( new_task ) {
+ return new_task;
+ } else {
+ if ( (*i)->register_predecessor(*this->my_owner) ) {
+ i = this->my_successors.erase(i);
+ }
+ else {
+ ++i;
+ }
+ }
+ }
+ return NULL;
+ }
+};
+
+#endif // __TBB__flow_graph_cache_impl_H
diff --git a/contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_impl.h b/contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_impl.h
new file mode 100644
index 0000000000..a3d17cfb1c
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_impl.h
@@ -0,0 +1,488 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_flow_graph_impl_H
+#define __TBB_flow_graph_impl_H
+
+// #include "../config.h"
+#include "_task.h"
+#include "tbb/task_group.h"
+#include "../task_arena.h"
+#include "../flow_graph_abstractions.h"
+
+#include "../concurrent_priority_queue.h"
+
+#include <list>
+
+namespace tbb {
+namespace detail {
+
+namespace d1 {
+
+class graph_task;
+static graph_task* const SUCCESSFULLY_ENQUEUED = (graph_task*)-1;
+typedef unsigned int node_priority_t;
+static const node_priority_t no_priority = node_priority_t(0);
+
+class graph;
+class graph_node;
+
+template <typename GraphContainerType, typename GraphNodeType>
+class graph_iterator {
+ friend class graph;
+ friend class graph_node;
+public:
+ typedef size_t size_type;
+ typedef GraphNodeType value_type;
+ typedef GraphNodeType* pointer;
+ typedef GraphNodeType& reference;
+ typedef const GraphNodeType& const_reference;
+ typedef std::forward_iterator_tag iterator_category;
+
+ //! Copy constructor
+ graph_iterator(const graph_iterator& other) :
+ my_graph(other.my_graph), current_node(other.current_node)
+ {}
+
+ //! Assignment
+ graph_iterator& operator=(const graph_iterator& other) {
+ if (this != &other) {
+ my_graph = other.my_graph;
+ current_node = other.current_node;
+ }
+ return *this;
+ }
+
+ //! Dereference
+ reference operator*() const;
+
+ //! Dereference
+ pointer operator->() const;
+
+ //! Equality
+ bool operator==(const graph_iterator& other) const {
+ return ((my_graph == other.my_graph) && (current_node == other.current_node));
+ }
+
+#if !__TBB_CPP20_COMPARISONS_PRESENT
+ //! Inequality
+ bool operator!=(const graph_iterator& other) const { return !(operator==(other)); }
+#endif
+
+ //! Pre-increment
+ graph_iterator& operator++() {
+ internal_forward();
+ return *this;
+ }
+
+ //! Post-increment
+ graph_iterator operator++(int) {
+ graph_iterator result = *this;
+ operator++();
+ return result;
+ }
+
+private:
+ // the graph over which we are iterating
+ GraphContainerType *my_graph;
+ // pointer into my_graph's my_nodes list
+ pointer current_node;
+
+ //! Private initializing constructor for begin() and end() iterators
+ graph_iterator(GraphContainerType *g, bool begin);
+ void internal_forward();
+}; // class graph_iterator
+
+// flags to modify the behavior of the graph reset(). Can be combined.
+enum reset_flags {
+ rf_reset_protocol = 0,
+ rf_reset_bodies = 1 << 0, // delete the current node body, reset to a copy of the initial node body.
+ rf_clear_edges = 1 << 1 // delete edges
+};
+
+void activate_graph(graph& g);
+void deactivate_graph(graph& g);
+bool is_graph_active(graph& g);
+graph_task* prioritize_task(graph& g, graph_task& arena_task);
+void spawn_in_graph_arena(graph& g, graph_task& arena_task);
+void enqueue_in_graph_arena(graph &g, graph_task& arena_task);
+
+class graph;
+
+//! Base class for tasks generated by graph nodes.
+class graph_task : public task {
+public:
+ graph_task(graph& g, small_object_allocator& allocator
+ , node_priority_t node_priority = no_priority
+ )
+ : my_graph(g)
+ , priority(node_priority)
+ , my_allocator(allocator)
+ {}
+ graph& my_graph; // graph instance the task belongs to
+ // TODO revamp: rename to my_priority
+ node_priority_t priority;
+ void destruct_and_deallocate(const execution_data& ed);
+ task* cancel(execution_data& ed) override;
+protected:
+ void finalize(const execution_data& ed);
+private:
+ // To organize task_list
+ graph_task* my_next{ nullptr };
+ small_object_allocator my_allocator;
+ // TODO revamp: elaborate internal interfaces to avoid friends declarations
+ friend class graph_task_list;
+ friend graph_task* prioritize_task(graph& g, graph_task& gt);
+};
+
+struct graph_task_comparator {
+ bool operator()(const graph_task* left, const graph_task* right) {
+ return left->priority < right->priority;
+ }
+};
+
+typedef tbb::concurrent_priority_queue<graph_task*, graph_task_comparator> graph_task_priority_queue_t;
+
+class priority_task_selector : public task {
+public:
+ priority_task_selector(graph_task_priority_queue_t& priority_queue, small_object_allocator& allocator)
+ : my_priority_queue(priority_queue), my_allocator(allocator), my_task() {}
+ task* execute(execution_data& ed) override {
+ next_task();
+ __TBB_ASSERT(my_task, nullptr);
+ task* t_next = my_task->execute(ed);
+ my_allocator.delete_object(this, ed);
+ return t_next;
+ }
+ task* cancel(execution_data& ed) override {
+ if (!my_task) {
+ next_task();
+ }
+ __TBB_ASSERT(my_task, nullptr);
+ task* t_next = my_task->cancel(ed);
+ my_allocator.delete_object(this, ed);
+ return t_next;
+ }
+private:
+ void next_task() {
+ // TODO revamp: hold functors in priority queue instead of real tasks
+ bool result = my_priority_queue.try_pop(my_task);
+ __TBB_ASSERT_EX(result, "Number of critical tasks for scheduler and tasks"
+ " in graph's priority queue mismatched");
+ __TBB_ASSERT(my_task && my_task != SUCCESSFULLY_ENQUEUED,
+ "Incorrect task submitted to graph priority queue");
+ __TBB_ASSERT(my_task->priority != no_priority,
+ "Tasks from graph's priority queue must have priority");
+ }
+
+ graph_task_priority_queue_t& my_priority_queue;
+ small_object_allocator my_allocator;
+ graph_task* my_task;
+};
+
+template <typename Receiver, typename Body> class run_and_put_task;
+template <typename Body> class run_task;
+
+//********************************************************************************
+// graph tasks helpers
+//********************************************************************************
+
+//! The list of graph tasks
+class graph_task_list : no_copy {
+private:
+ graph_task* my_first;
+ graph_task** my_next_ptr;
+public:
+ //! Construct empty list
+ graph_task_list() : my_first(nullptr), my_next_ptr(&my_first) {}
+
+ //! True if list is empty; false otherwise.
+ bool empty() const { return !my_first; }
+
+ //! Push task onto back of list.
+ void push_back(graph_task& task) {
+ task.my_next = nullptr;
+ *my_next_ptr = &task;
+ my_next_ptr = &task.my_next;
+ }
+
+ //! Pop the front task from the list.
+ graph_task& pop_front() {
+ __TBB_ASSERT(!empty(), "attempt to pop item from empty task_list");
+ graph_task* result = my_first;
+ my_first = result->my_next;
+ if (!my_first) {
+ my_next_ptr = &my_first;
+ }
+ return *result;
+ }
+};
+
+//! The graph class
+/** This class serves as a handle to the graph */
+class graph : no_copy, public graph_proxy {
+ friend class graph_node;
+
+ void prepare_task_arena(bool reinit = false) {
+ if (reinit) {
+ __TBB_ASSERT(my_task_arena, "task arena is NULL");
+ my_task_arena->terminate();
+ my_task_arena->initialize(task_arena::attach());
+ }
+ else {
+ __TBB_ASSERT(my_task_arena == NULL, "task arena is not NULL");
+ my_task_arena = new task_arena(task_arena::attach());
+ }
+ if (!my_task_arena->is_active()) // failed to attach
+ my_task_arena->initialize(); // create a new, default-initialized arena
+ __TBB_ASSERT(my_task_arena->is_active(), "task arena is not active");
+ }
+
+public:
+ //! Constructs a graph with isolated task_group_context
+ graph();
+
+ //! Constructs a graph with use_this_context as context
+ explicit graph(task_group_context& use_this_context);
+
+ //! Destroys the graph.
+ /** Calls wait_for_all, then destroys the root task and context. */
+ ~graph();
+
+ //! Used to register that an external entity may still interact with the graph.
+ /** The graph will not return from wait_for_all until a matching number of release_wait calls is
+ made. */
+ void reserve_wait() override;
+
+ //! Deregisters an external entity that may have interacted with the graph.
+    /** The graph will not return from wait_for_all until the number of reserve_wait calls
+        matches the number of release_wait calls. */
+ void release_wait() override;
+
+    //! Wait until the graph is idle and the number of release_wait calls equals the number of
+ //! reserve_wait calls.
+ /** The waiting thread will go off and steal work while it is blocked in the wait_for_all. */
+ void wait_for_all() {
+ cancelled = false;
+ caught_exception = false;
+ try_call([this] {
+ my_task_arena->execute([this] {
+ wait(my_wait_context, *my_context);
+ });
+ cancelled = my_context->is_group_execution_cancelled();
+ }).on_exception([this] {
+ my_context->reset();
+ caught_exception = true;
+ cancelled = true;
+ });
+ // TODO: the "if" condition below is just a work-around to support the concurrent wait
+ // mode. The cancellation and exception mechanisms are still broken in this mode.
+    // Consider using task_group to avoid re-implementing the same functionality.
+ if (!(my_context->traits() & task_group_context::concurrent_wait)) {
+ my_context->reset(); // consistent with behavior in catch()
+ }
+ }
+
+#if TODO_REVAMP
+#error Decide on ref_count() presence.
+ Its only use is in the template<typename T, typename BufferType> void test_resets()
+#endif
+
+#if __TBB_EXTRA_DEBUG
+ unsigned ref_count() const { return my_wait_context.reference_count(); }
+#endif
+
+
+ // TODO revamp: consider adding getter for task_group_context.
+
+ // ITERATORS
+ template<typename C, typename N>
+ friend class graph_iterator;
+
+ // Graph iterator typedefs
+ typedef graph_iterator<graph, graph_node> iterator;
+ typedef graph_iterator<const graph, const graph_node> const_iterator;
+
+ // Graph iterator constructors
+ //! start iterator
+ iterator begin();
+ //! end iterator
+ iterator end();
+ //! start const iterator
+ const_iterator begin() const;
+ //! end const iterator
+ const_iterator end() const;
+ //! start const iterator
+ const_iterator cbegin() const;
+ //! end const iterator
+ const_iterator cend() const;
+
+ // thread-unsafe state reset.
+ void reset(reset_flags f = rf_reset_protocol);
+
+ //! cancels execution of the associated task_group_context
+ void cancel();
+
+ //! return status of graph execution
+ bool is_cancelled() { return cancelled; }
+ bool exception_thrown() { return caught_exception; }
+
+private:
+ wait_context my_wait_context;
+ task_group_context *my_context;
+ bool own_context;
+ bool cancelled;
+ bool caught_exception;
+ bool my_is_active;
+
+ graph_node *my_nodes, *my_nodes_last;
+
+ tbb::spin_mutex nodelist_mutex;
+ void register_node(graph_node *n);
+ void remove_node(graph_node *n);
+
+ task_arena* my_task_arena;
+
+ graph_task_priority_queue_t my_priority_queue;
+
+ friend void activate_graph(graph& g);
+ friend void deactivate_graph(graph& g);
+ friend bool is_graph_active(graph& g);
+ friend graph_task* prioritize_task(graph& g, graph_task& arena_task);
+ friend void spawn_in_graph_arena(graph& g, graph_task& arena_task);
+ friend void enqueue_in_graph_arena(graph &g, graph_task& arena_task);
+
+ friend class task_arena_base;
+
+}; // class graph
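
From the public side, the graph object above is what user code constructs, runs nodes in, and waits on. A minimal usage sketch, assuming oneTBB is installed:

    #include <oneapi/tbb/flow_graph.h>
    #include <iostream>

    int main() {
        tbb::flow::graph g;    // owns a task_arena and a wait_context, as described above
        tbb::flow::continue_node<tbb::flow::continue_msg> hello(g,
            [](const tbb::flow::continue_msg&) {
                std::cout << "hello from the graph\n";
                return tbb::flow::continue_msg();
            });
        hello.try_put(tbb::flow::continue_msg());
        g.wait_for_all();      // blocks until all spawned graph tasks have finished
        std::cout << "cancelled: " << g.is_cancelled()
                  << ", exception: " << g.exception_thrown() << '\n';
    }
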
+
+inline void graph_task::destruct_and_deallocate(const execution_data& ed) {
+ auto allocator = my_allocator;
+ // TODO: investigate if direct call of derived destructor gives any benefits.
+ this->~graph_task();
+ allocator.deallocate(this, ed);
+}
+
+inline void graph_task::finalize(const execution_data& ed) {
+ graph& g = my_graph;
+ destruct_and_deallocate(ed);
+ g.release_wait();
+}
+
+inline task* graph_task::cancel(execution_data& ed) {
+ finalize(ed);
+ return nullptr;
+}
+
+//********************************************************************************
+// end of graph tasks helpers
+//********************************************************************************
+
+
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+class get_graph_helper;
+#endif
+
+//! The base of all graph nodes.
+class graph_node : no_copy {
+ friend class graph;
+ template<typename C, typename N>
+ friend class graph_iterator;
+
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+ friend class get_graph_helper;
+#endif
+
+protected:
+ graph& my_graph;
+ graph& graph_reference() const {
+ // TODO revamp: propagate graph_reference() method to all the reference places.
+ return my_graph;
+ }
+ graph_node* next = nullptr;
+ graph_node* prev = nullptr;
+public:
+ explicit graph_node(graph& g);
+
+ virtual ~graph_node();
+
+protected:
+ // performs the reset on an individual node.
+ virtual void reset_node(reset_flags f = rf_reset_protocol) = 0;
+}; // class graph_node
+
+inline void activate_graph(graph& g) {
+ g.my_is_active = true;
+}
+
+inline void deactivate_graph(graph& g) {
+ g.my_is_active = false;
+}
+
+inline bool is_graph_active(graph& g) {
+ return g.my_is_active;
+}
+
+inline graph_task* prioritize_task(graph& g, graph_task& gt) {
+ if( no_priority == gt.priority )
+ return &gt;
+
+ //! Non-preemptive priority pattern. The original task is submitted as a work item to the
+ //! priority queue, and a new critical task is created to take and execute a work item with
+ //! the highest known priority. The reference counting responsibility is transferred (via
+ //! allocate_continuation) to the new task.
+ task* critical_task = gt.my_allocator.new_object<priority_task_selector>(g.my_priority_queue, gt.my_allocator);
+ __TBB_ASSERT( critical_task, "bad_alloc?" );
+ g.my_priority_queue.push(&gt);
+ using tbb::detail::d1::submit;
+ submit( *critical_task, *g.my_task_arena, *g.my_context, /*as_critical=*/true );
+ return nullptr;
+}
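
prioritize_task is what gives node priorities their non-preemptive effect: prioritized work is routed through the graph's priority queue and executed via a critical selector task. A public-API sketch that opts a node into this path, assuming the oneTBB 2021 constructors that accept a node_priority_t:

    #include <oneapi/tbb/flow_graph.h>
    #include <iostream>

    int main() {
        using namespace tbb::flow;
        graph g;
        // The trailing constructor argument is a node_priority_t; tasks produced by this
        // node take the priority-queue path sketched in prioritize_task() above.
        function_node<int, int> urgent(g, serial,
            [](int v) { std::cout << "urgent " << v << '\n'; return v; },
            node_priority_t(1));
        function_node<int, int> normal(g, serial,
            [](int v) { std::cout << "normal " << v << '\n'; return v; });
        for (int i = 0; i < 3; ++i) { normal.try_put(i); urgent.try_put(i); }
        g.wait_for_all();
    }
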
+
+//! Spawns a task inside graph arena
+inline void spawn_in_graph_arena(graph& g, graph_task& arena_task) {
+ if (is_graph_active(g)) {
+ task* gt = prioritize_task(g, arena_task);
+ if( !gt )
+ return;
+
+ __TBB_ASSERT(g.my_task_arena && g.my_task_arena->is_active(), NULL);
+ submit( *gt, *g.my_task_arena, *g.my_context
+#if __TBB_PREVIEW_CRITICAL_TASKS
+ , /*as_critical=*/false
+#endif
+ );
+ }
+}
+
+// TODO revamp: unify *_in_graph_arena functions
+
+//! Enqueues a task inside graph arena
+inline void enqueue_in_graph_arena(graph &g, graph_task& arena_task) {
+ if (is_graph_active(g)) {
+ __TBB_ASSERT( g.my_task_arena && g.my_task_arena->is_active(), "Is graph's arena initialized and active?" );
+
+ // TODO revamp: decide on the approach that does not postpone critical task
+ if( task* gt = prioritize_task(g, arena_task) )
+ submit( *gt, *g.my_task_arena, *g.my_context, /*as_critical=*/false);
+ }
+}
+
+} // namespace d1
+} // namespace detail
+} // namespace tbb
+
+#endif // __TBB_flow_graph_impl_H
diff --git a/contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_indexer_impl.h b/contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_indexer_impl.h
new file mode 100644
index 0000000000..f4f55a6c7a
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_indexer_impl.h
@@ -0,0 +1,351 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB__flow_graph_indexer_impl_H
+#define __TBB__flow_graph_indexer_impl_H
+
+#ifndef __TBB_flow_graph_H
+#error Do not #include this internal file directly; use public TBB headers instead.
+#endif
+
+// included in namespace tbb::detail::d1
+
+#include "_flow_graph_types_impl.h"
+
+    // The output of the indexer_node is a tbb::flow::tagged_msg of the form
+    // tagged_msg<tag, result>, where the value of tag indicates which input
+    // port produced the result that was put to the successor.
+
+ template<typename IndexerNodeBaseType, typename T, size_t K>
+ graph_task* do_try_put(const T &v, void *p) {
+ typename IndexerNodeBaseType::output_type o(K, v);
+ return reinterpret_cast<IndexerNodeBaseType *>(p)->try_put_task(&o);
+ }
+
+ template<typename TupleTypes,int N>
+ struct indexer_helper {
+ template<typename IndexerNodeBaseType, typename PortTuple>
+ static inline void set_indexer_node_pointer(PortTuple &my_input, IndexerNodeBaseType *p, graph& g) {
+ typedef typename std::tuple_element<N-1, TupleTypes>::type T;
+ graph_task* (*indexer_node_put_task)(const T&, void *) = do_try_put<IndexerNodeBaseType, T, N-1>;
+ std::get<N-1>(my_input).set_up(p, indexer_node_put_task, g);
+ indexer_helper<TupleTypes,N-1>::template set_indexer_node_pointer<IndexerNodeBaseType,PortTuple>(my_input, p, g);
+ }
+ };
+
+ template<typename TupleTypes>
+ struct indexer_helper<TupleTypes,1> {
+ template<typename IndexerNodeBaseType, typename PortTuple>
+ static inline void set_indexer_node_pointer(PortTuple &my_input, IndexerNodeBaseType *p, graph& g) {
+ typedef typename std::tuple_element<0, TupleTypes>::type T;
+ graph_task* (*indexer_node_put_task)(const T&, void *) = do_try_put<IndexerNodeBaseType, T, 0>;
+ std::get<0>(my_input).set_up(p, indexer_node_put_task, g);
+ }
+ };
+
+ template<typename T>
+ class indexer_input_port : public receiver<T> {
+ private:
+ void* my_indexer_ptr;
+ typedef graph_task* (* forward_function_ptr)(T const &, void* );
+ forward_function_ptr my_try_put_task;
+ graph* my_graph;
+ public:
+ void set_up(void* p, forward_function_ptr f, graph& g) {
+ my_indexer_ptr = p;
+ my_try_put_task = f;
+ my_graph = &g;
+ }
+
+ protected:
+ template< typename R, typename B > friend class run_and_put_task;
+ template<typename X, typename Y> friend class broadcast_cache;
+ template<typename X, typename Y> friend class round_robin_cache;
+ graph_task* try_put_task(const T &v) override {
+ return my_try_put_task(v, my_indexer_ptr);
+ }
+
+ graph& graph_reference() const override {
+ return *my_graph;
+ }
+ };
+
+ template<typename InputTuple, typename OutputType, typename StructTypes>
+ class indexer_node_FE {
+ public:
+ static const int N = std::tuple_size<InputTuple>::value;
+ typedef OutputType output_type;
+ typedef InputTuple input_type;
+
+    // Some versions of Intel(R) C++ Compiler fail to generate an implicit constructor for a class that has std::tuple as a member.
+ indexer_node_FE() : my_inputs() {}
+
+ input_type &input_ports() { return my_inputs; }
+ protected:
+ input_type my_inputs;
+ };
+
+ //! indexer_node_base
+ template<typename InputTuple, typename OutputType, typename StructTypes>
+ class indexer_node_base : public graph_node, public indexer_node_FE<InputTuple, OutputType,StructTypes>,
+ public sender<OutputType> {
+ protected:
+ using graph_node::my_graph;
+ public:
+ static const size_t N = std::tuple_size<InputTuple>::value;
+ typedef OutputType output_type;
+ typedef StructTypes tuple_types;
+ typedef typename sender<output_type>::successor_type successor_type;
+ typedef indexer_node_FE<InputTuple, output_type,StructTypes> input_ports_type;
+
+ private:
+ // ----------- Aggregator ------------
+ enum op_type { reg_succ, rem_succ, try__put_task
+ };
+ typedef indexer_node_base<InputTuple,output_type,StructTypes> class_type;
+
+ class indexer_node_base_operation : public aggregated_operation<indexer_node_base_operation> {
+ public:
+ char type;
+ union {
+ output_type const *my_arg;
+ successor_type *my_succ;
+ graph_task* bypass_t;
+ };
+ indexer_node_base_operation(const output_type* e, op_type t) :
+ type(char(t)), my_arg(e) {}
+ indexer_node_base_operation(const successor_type &s, op_type t) : type(char(t)),
+ my_succ(const_cast<successor_type *>(&s)) {}
+ };
+
+ typedef aggregating_functor<class_type, indexer_node_base_operation> handler_type;
+ friend class aggregating_functor<class_type, indexer_node_base_operation>;
+ aggregator<handler_type, indexer_node_base_operation> my_aggregator;
+
+ void handle_operations(indexer_node_base_operation* op_list) {
+ indexer_node_base_operation *current;
+ while(op_list) {
+ current = op_list;
+ op_list = op_list->next;
+ switch(current->type) {
+
+ case reg_succ:
+ my_successors.register_successor(*(current->my_succ));
+ current->status.store( SUCCEEDED, std::memory_order_release);
+ break;
+
+ case rem_succ:
+ my_successors.remove_successor(*(current->my_succ));
+ current->status.store( SUCCEEDED, std::memory_order_release);
+ break;
+ case try__put_task: {
+ current->bypass_t = my_successors.try_put_task(*(current->my_arg));
+                current->status.store( SUCCEEDED, std::memory_order_release); // bypass_t carries the actual return value of try_put_task
+ }
+ break;
+ }
+ }
+ }
+ // ---------- end aggregator -----------
+ public:
+ indexer_node_base(graph& g) : graph_node(g), input_ports_type(), my_successors(this) {
+ indexer_helper<StructTypes,N>::set_indexer_node_pointer(this->my_inputs, this, g);
+ my_aggregator.initialize_handler(handler_type(this));
+ }
+
+ indexer_node_base(const indexer_node_base& other)
+ : graph_node(other.my_graph), input_ports_type(), sender<output_type>(), my_successors(this)
+ {
+ indexer_helper<StructTypes,N>::set_indexer_node_pointer(this->my_inputs, this, other.my_graph);
+ my_aggregator.initialize_handler(handler_type(this));
+ }
+
+ bool register_successor(successor_type &r) override {
+ indexer_node_base_operation op_data(r, reg_succ);
+ my_aggregator.execute(&op_data);
+ return op_data.status == SUCCEEDED;
+ }
+
+ bool remove_successor( successor_type &r) override {
+ indexer_node_base_operation op_data(r, rem_succ);
+ my_aggregator.execute(&op_data);
+ return op_data.status == SUCCEEDED;
+ }
+
+ graph_task* try_put_task(output_type const *v) { // not a virtual method in this class
+ indexer_node_base_operation op_data(v, try__put_task);
+ my_aggregator.execute(&op_data);
+ return op_data.bypass_t;
+ }
+
+ protected:
+ void reset_node(reset_flags f) override {
+ if(f & rf_clear_edges) {
+ my_successors.clear();
+ }
+ }
+
+ private:
+ broadcast_cache<output_type, null_rw_mutex> my_successors;
+ }; //indexer_node_base
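
indexer_node_base is the engine behind the public indexer_node, whose output is a tagged_msg carrying the index of the input port that produced the value. A public-API sketch, assuming oneTBB is installed:

    #include <oneapi/tbb/flow_graph.h>
    #include <iostream>

    int main() {
        using namespace tbb::flow;
        graph g;
        indexer_node<int, float> idx(g);
        using tagged = indexer_node<int, float>::output_type;  // tagged_msg carrying an int or a float
        function_node<tagged> sink(g, serial, [](const tagged& msg) {
            if (msg.tag() == 0)
                std::cout << "from port 0: " << cast_to<int>(msg) << '\n';
            else
                std::cout << "from port 1: " << cast_to<float>(msg) << '\n';
            return continue_msg();
        });
        make_edge(idx, sink);
        input_port<0>(idx).try_put(7);
        input_port<1>(idx).try_put(2.5f);
        g.wait_for_all();
    }
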
+
+
+ template<int N, typename InputTuple> struct input_types;
+
+ template<typename InputTuple>
+ struct input_types<1, InputTuple> {
+ typedef typename std::tuple_element<0, InputTuple>::type first_type;
+ typedef tagged_msg<size_t, first_type > type;
+ };
+
+ template<typename InputTuple>
+ struct input_types<2, InputTuple> {
+ typedef typename std::tuple_element<0, InputTuple>::type first_type;
+ typedef typename std::tuple_element<1, InputTuple>::type second_type;
+ typedef tagged_msg<size_t, first_type, second_type> type;
+ };
+
+ template<typename InputTuple>
+ struct input_types<3, InputTuple> {
+ typedef typename std::tuple_element<0, InputTuple>::type first_type;
+ typedef typename std::tuple_element<1, InputTuple>::type second_type;
+ typedef typename std::tuple_element<2, InputTuple>::type third_type;
+ typedef tagged_msg<size_t, first_type, second_type, third_type> type;
+ };
+
+ template<typename InputTuple>
+ struct input_types<4, InputTuple> {
+ typedef typename std::tuple_element<0, InputTuple>::type first_type;
+ typedef typename std::tuple_element<1, InputTuple>::type second_type;
+ typedef typename std::tuple_element<2, InputTuple>::type third_type;
+ typedef typename std::tuple_element<3, InputTuple>::type fourth_type;
+ typedef tagged_msg<size_t, first_type, second_type, third_type,
+ fourth_type> type;
+ };
+
+ template<typename InputTuple>
+ struct input_types<5, InputTuple> {
+ typedef typename std::tuple_element<0, InputTuple>::type first_type;
+ typedef typename std::tuple_element<1, InputTuple>::type second_type;
+ typedef typename std::tuple_element<2, InputTuple>::type third_type;
+ typedef typename std::tuple_element<3, InputTuple>::type fourth_type;
+ typedef typename std::tuple_element<4, InputTuple>::type fifth_type;
+ typedef tagged_msg<size_t, first_type, second_type, third_type,
+ fourth_type, fifth_type> type;
+ };
+
+ template<typename InputTuple>
+ struct input_types<6, InputTuple> {
+ typedef typename std::tuple_element<0, InputTuple>::type first_type;
+ typedef typename std::tuple_element<1, InputTuple>::type second_type;
+ typedef typename std::tuple_element<2, InputTuple>::type third_type;
+ typedef typename std::tuple_element<3, InputTuple>::type fourth_type;
+ typedef typename std::tuple_element<4, InputTuple>::type fifth_type;
+ typedef typename std::tuple_element<5, InputTuple>::type sixth_type;
+ typedef tagged_msg<size_t, first_type, second_type, third_type,
+ fourth_type, fifth_type, sixth_type> type;
+ };
+
+ template<typename InputTuple>
+ struct input_types<7, InputTuple> {
+ typedef typename std::tuple_element<0, InputTuple>::type first_type;
+ typedef typename std::tuple_element<1, InputTuple>::type second_type;
+ typedef typename std::tuple_element<2, InputTuple>::type third_type;
+ typedef typename std::tuple_element<3, InputTuple>::type fourth_type;
+ typedef typename std::tuple_element<4, InputTuple>::type fifth_type;
+ typedef typename std::tuple_element<5, InputTuple>::type sixth_type;
+ typedef typename std::tuple_element<6, InputTuple>::type seventh_type;
+ typedef tagged_msg<size_t, first_type, second_type, third_type,
+ fourth_type, fifth_type, sixth_type,
+ seventh_type> type;
+ };
+
+
+ template<typename InputTuple>
+ struct input_types<8, InputTuple> {
+ typedef typename std::tuple_element<0, InputTuple>::type first_type;
+ typedef typename std::tuple_element<1, InputTuple>::type second_type;
+ typedef typename std::tuple_element<2, InputTuple>::type third_type;
+ typedef typename std::tuple_element<3, InputTuple>::type fourth_type;
+ typedef typename std::tuple_element<4, InputTuple>::type fifth_type;
+ typedef typename std::tuple_element<5, InputTuple>::type sixth_type;
+ typedef typename std::tuple_element<6, InputTuple>::type seventh_type;
+ typedef typename std::tuple_element<7, InputTuple>::type eighth_type;
+ typedef tagged_msg<size_t, first_type, second_type, third_type,
+ fourth_type, fifth_type, sixth_type,
+ seventh_type, eighth_type> type;
+ };
+
+
+ template<typename InputTuple>
+ struct input_types<9, InputTuple> {
+ typedef typename std::tuple_element<0, InputTuple>::type first_type;
+ typedef typename std::tuple_element<1, InputTuple>::type second_type;
+ typedef typename std::tuple_element<2, InputTuple>::type third_type;
+ typedef typename std::tuple_element<3, InputTuple>::type fourth_type;
+ typedef typename std::tuple_element<4, InputTuple>::type fifth_type;
+ typedef typename std::tuple_element<5, InputTuple>::type sixth_type;
+ typedef typename std::tuple_element<6, InputTuple>::type seventh_type;
+ typedef typename std::tuple_element<7, InputTuple>::type eighth_type;
+ typedef typename std::tuple_element<8, InputTuple>::type nineth_type;
+ typedef tagged_msg<size_t, first_type, second_type, third_type,
+ fourth_type, fifth_type, sixth_type,
+ seventh_type, eighth_type, nineth_type> type;
+ };
+
+ template<typename InputTuple>
+ struct input_types<10, InputTuple> {
+ typedef typename std::tuple_element<0, InputTuple>::type first_type;
+ typedef typename std::tuple_element<1, InputTuple>::type second_type;
+ typedef typename std::tuple_element<2, InputTuple>::type third_type;
+ typedef typename std::tuple_element<3, InputTuple>::type fourth_type;
+ typedef typename std::tuple_element<4, InputTuple>::type fifth_type;
+ typedef typename std::tuple_element<5, InputTuple>::type sixth_type;
+ typedef typename std::tuple_element<6, InputTuple>::type seventh_type;
+ typedef typename std::tuple_element<7, InputTuple>::type eighth_type;
+ typedef typename std::tuple_element<8, InputTuple>::type nineth_type;
+ typedef typename std::tuple_element<9, InputTuple>::type tenth_type;
+ typedef tagged_msg<size_t, first_type, second_type, third_type,
+ fourth_type, fifth_type, sixth_type,
+ seventh_type, eighth_type, nineth_type,
+ tenth_type> type;
+ };
+
+ // type generators
+ template<typename OutputTuple>
+ struct indexer_types : public input_types<std::tuple_size<OutputTuple>::value, OutputTuple> {
+ static const int N = std::tuple_size<OutputTuple>::value;
+ typedef typename input_types<N, OutputTuple>::type output_type;
+ typedef typename wrap_tuple_elements<N,indexer_input_port,OutputTuple>::type input_ports_type;
+ typedef indexer_node_FE<input_ports_type,output_type,OutputTuple> indexer_FE_type;
+ typedef indexer_node_base<input_ports_type, output_type, OutputTuple> indexer_base_type;
+ };
+
+ template<class OutputTuple>
+ class unfolded_indexer_node : public indexer_types<OutputTuple>::indexer_base_type {
+ public:
+ typedef typename indexer_types<OutputTuple>::input_ports_type input_ports_type;
+ typedef OutputTuple tuple_types;
+ typedef typename indexer_types<OutputTuple>::output_type output_type;
+ private:
+ typedef typename indexer_types<OutputTuple>::indexer_base_type base_type;
+ public:
+ unfolded_indexer_node(graph& g) : base_type(g) {}
+ unfolded_indexer_node(const unfolded_indexer_node &other) : base_type(other) {}
+ };
+
+#endif /* __TBB__flow_graph_indexer_impl_H */
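The indexer front-end and base classes above are the machinery behind the public flow-graph indexer_node. Purely as a point of reference, a minimal illustrative sketch of that public interface (the variable names, printed text, and sink body are this editor's choices, not anything defined in the header above) might look like:

    #include <oneapi/tbb/flow_graph.h>
    #include <iostream>

    int main() {
        using namespace oneapi::tbb::flow;
        graph g;

        // Merge an int stream and a float stream into one tagged stream.
        indexer_node<int, float> merge(g);
        using tagged_t = indexer_node<int, float>::output_type;

        function_node<tagged_t> sink(g, unlimited, [](const tagged_t& msg) {
            if (msg.tag() == 0)
                std::cout << "int:   " << cast_to<int>(msg) << '\n';
            else
                std::cout << "float: " << cast_to<float>(msg) << '\n';
            return continue_msg{};
        });

        make_edge(merge, sink);
        input_port<0>(merge).try_put(42);
        input_port<1>(merge).try_put(3.5f);
        g.wait_for_all();
    }

The tag carried by the delivered tagged_msg is the index of the input port the value arrived on, which is exactly what indexer_helper and indexer_input_port arrange above.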
diff --git a/contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_item_buffer_impl.h b/contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_item_buffer_impl.h
new file mode 100644
index 0000000000..4466bf4180
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_item_buffer_impl.h
@@ -0,0 +1,279 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB__flow_graph_item_buffer_impl_H
+#define __TBB__flow_graph_item_buffer_impl_H
+
+#ifndef __TBB_flow_graph_H
+#error Do not #include this internal file directly; use public TBB headers instead.
+#endif
+
+#include "_aligned_space.h"
+
+// in namespace tbb::flow::interfaceX (included in _flow_graph_node_impl.h)
+
+//! Expandable buffer of items.  The possible operations are push, pop,
+//! tests for emptiness, and so forth.  No mutual exclusion is built in.
+//! Objects are constructed into slots in place and destroyed explicitly.  get_my_item gives
+//! a read-only reference to the item in the buffer.  set_my_item may be called
+//! with either an empty or an occupied slot.
+
+template <typename T, typename A=cache_aligned_allocator<T> >
+class item_buffer {
+public:
+ typedef T item_type;
+ enum buffer_item_state { no_item=0, has_item=1, reserved_item=2 };
+protected:
+ typedef size_t size_type;
+ typedef std::pair<item_type, buffer_item_state> aligned_space_item;
+ typedef aligned_space<aligned_space_item> buffer_item_type;
+ typedef typename allocator_traits<A>::template rebind_alloc<buffer_item_type> allocator_type;
+ buffer_item_type *my_array;
+ size_type my_array_size;
+ static const size_type initial_buffer_size = 4;
+ size_type my_head;
+ size_type my_tail;
+
+ bool buffer_empty() const { return my_head == my_tail; }
+
+ aligned_space_item &item(size_type i) {
+ __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->second))%alignment_of<buffer_item_state>::value),NULL);
+ __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->first))%alignment_of<item_type>::value), NULL);
+ return *my_array[i & (my_array_size - 1) ].begin();
+ }
+
+ const aligned_space_item &item(size_type i) const {
+ __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->second))%alignment_of<buffer_item_state>::value), NULL);
+ __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->first))%alignment_of<item_type>::value), NULL);
+ return *my_array[i & (my_array_size-1)].begin();
+ }
+
+ bool my_item_valid(size_type i) const { return (i < my_tail) && (i >= my_head) && (item(i).second != no_item); }
+#if TBB_USE_ASSERT
+ bool my_item_reserved(size_type i) const { return item(i).second == reserved_item; }
+#endif
+
+ // object management in buffer
+ const item_type &get_my_item(size_t i) const {
+ __TBB_ASSERT(my_item_valid(i),"attempt to get invalid item");
+ item_type* itm = const_cast<item_type*>(reinterpret_cast<const item_type*>(&item(i).first));
+ return *itm;
+ }
+
+ // may be called with an empty slot or a slot that has already been constructed into.
+ void set_my_item(size_t i, const item_type &o) {
+ if(item(i).second != no_item) {
+ destroy_item(i);
+ }
+ new(&(item(i).first)) item_type(o);
+ item(i).second = has_item;
+ }
+
+ // destructively-fetch an object from the buffer
+ void fetch_item(size_t i, item_type &o) {
+ __TBB_ASSERT(my_item_valid(i), "Trying to fetch an empty slot");
+ o = get_my_item(i); // could have std::move assign semantics
+ destroy_item(i);
+ }
+
+ // move an existing item from one slot to another. The moved-to slot must be unoccupied,
+    // the moved-from slot must exist and not be reserved.  Afterward, from will be empty and
+    // to will be occupied but not reserved.
+ void move_item(size_t to, size_t from) {
+ __TBB_ASSERT(!my_item_valid(to), "Trying to move to a non-empty slot");
+ __TBB_ASSERT(my_item_valid(from), "Trying to move from an empty slot");
+ set_my_item(to, get_my_item(from)); // could have std::move semantics
+ destroy_item(from);
+
+ }
+
+ // put an item in an empty slot. Return true if successful, else false
+ bool place_item(size_t here, const item_type &me) {
+#if !TBB_DEPRECATED_SEQUENCER_DUPLICATES
+ if(my_item_valid(here)) return false;
+#endif
+ set_my_item(here, me);
+ return true;
+ }
+
+ // could be implemented with std::move semantics
+ void swap_items(size_t i, size_t j) {
+ __TBB_ASSERT(my_item_valid(i) && my_item_valid(j), "attempt to swap invalid item(s)");
+ item_type temp = get_my_item(i);
+ set_my_item(i, get_my_item(j));
+ set_my_item(j, temp);
+ }
+
+ void destroy_item(size_type i) {
+ __TBB_ASSERT(my_item_valid(i), "destruction of invalid item");
+ item(i).first.~item_type();
+ item(i).second = no_item;
+ }
+
+ // returns the front element
+ const item_type& front() const
+ {
+ __TBB_ASSERT(my_item_valid(my_head), "attempt to fetch head non-item");
+ return get_my_item(my_head);
+ }
+
+ // returns the back element
+ const item_type& back() const
+ {
+        __TBB_ASSERT(my_item_valid(my_tail - 1), "attempt to fetch tail non-item");
+ return get_my_item(my_tail - 1);
+ }
+
+ // following methods are for reservation of the front of a buffer.
+ void reserve_item(size_type i) { __TBB_ASSERT(my_item_valid(i) && !my_item_reserved(i), "item cannot be reserved"); item(i).second = reserved_item; }
+ void release_item(size_type i) { __TBB_ASSERT(my_item_reserved(i), "item is not reserved"); item(i).second = has_item; }
+
+ void destroy_front() { destroy_item(my_head); ++my_head; }
+ void destroy_back() { destroy_item(my_tail-1); --my_tail; }
+
+ // we have to be able to test against a new tail value without changing my_tail
+ // grow_array doesn't work if we change my_tail when the old array is too small
+ size_type size(size_t new_tail = 0) { return (new_tail ? new_tail : my_tail) - my_head; }
+ size_type capacity() { return my_array_size; }
+ // sequencer_node does not use this method, so we don't
+ // need a version that passes in the new_tail value.
+ bool buffer_full() { return size() >= capacity(); }
+
+ //! Grows the internal array.
+ void grow_my_array( size_t minimum_size ) {
+ // test that we haven't made the structure inconsistent.
+ __TBB_ASSERT(capacity() >= my_tail - my_head, "total items exceed capacity");
+ size_type new_size = my_array_size ? 2*my_array_size : initial_buffer_size;
+ while( new_size<minimum_size )
+ new_size*=2;
+
+ buffer_item_type* new_array = allocator_type().allocate(new_size);
+
+ // initialize validity to "no"
+ for( size_type i=0; i<new_size; ++i ) { new_array[i].begin()->second = no_item; }
+
+ for( size_type i=my_head; i<my_tail; ++i) {
+ if(my_item_valid(i)) { // sequencer_node may have empty slots
+ // placement-new copy-construct; could be std::move
+ char *new_space = (char *)&(new_array[i&(new_size-1)].begin()->first);
+ (void)new(new_space) item_type(get_my_item(i));
+ new_array[i&(new_size-1)].begin()->second = item(i).second;
+ }
+ }
+
+ clean_up_buffer(/*reset_pointers*/false);
+
+ my_array = new_array;
+ my_array_size = new_size;
+ }
+
+ bool push_back(item_type &v) {
+ if(buffer_full()) {
+ grow_my_array(size() + 1);
+ }
+ set_my_item(my_tail, v);
+ ++my_tail;
+ return true;
+ }
+
+ bool pop_back(item_type &v) {
+ if (!my_item_valid(my_tail-1)) {
+ return false;
+ }
+ v = this->back();
+ destroy_back();
+ return true;
+ }
+
+ bool pop_front(item_type &v) {
+ if(!my_item_valid(my_head)) {
+ return false;
+ }
+ v = this->front();
+ destroy_front();
+ return true;
+ }
+
+ // This is used both for reset and for grow_my_array. In the case of grow_my_array
+ // we want to retain the values of the head and tail.
+ void clean_up_buffer(bool reset_pointers) {
+ if (my_array) {
+ for( size_type i=my_head; i<my_tail; ++i ) {
+ if(my_item_valid(i))
+ destroy_item(i);
+ }
+ allocator_type().deallocate(my_array,my_array_size);
+ }
+ my_array = NULL;
+ if(reset_pointers) {
+ my_head = my_tail = my_array_size = 0;
+ }
+ }
+
+public:
+ //! Constructor
+ item_buffer( ) : my_array(NULL), my_array_size(0),
+ my_head(0), my_tail(0) {
+ grow_my_array(initial_buffer_size);
+ }
+
+ ~item_buffer() {
+ clean_up_buffer(/*reset_pointers*/true);
+ }
+
+ void reset() { clean_up_buffer(/*reset_pointers*/true); grow_my_array(initial_buffer_size); }
+
+};
+
+//! item_buffer with reservable front-end.  NOTE: if reserving, do not
+//! complete the operation with pop_front(); use consume_front().
+//! No synchronization built-in.
+template<typename T, typename A=cache_aligned_allocator<T> >
+class reservable_item_buffer : public item_buffer<T, A> {
+protected:
+ using item_buffer<T, A>::my_item_valid;
+ using item_buffer<T, A>::my_head;
+
+public:
+ reservable_item_buffer() : item_buffer<T, A>(), my_reserved(false) {}
+ void reset() {my_reserved = false; item_buffer<T,A>::reset(); }
+protected:
+
+ bool reserve_front(T &v) {
+ if(my_reserved || !my_item_valid(this->my_head)) return false;
+ my_reserved = true;
+ // reserving the head
+ v = this->front();
+ this->reserve_item(this->my_head);
+ return true;
+ }
+
+ void consume_front() {
+ __TBB_ASSERT(my_reserved, "Attempt to consume a non-reserved item");
+ this->destroy_front();
+ my_reserved = false;
+ }
+
+ void release_front() {
+ __TBB_ASSERT(my_reserved, "Attempt to release a non-reserved item");
+ this->release_item(this->my_head);
+ my_reserved = false;
+ }
+
+ bool my_reserved;
+};
+
+#endif // __TBB__flow_graph_item_buffer_impl_H
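The buffers above keep their capacity a power of two so that a monotonically growing logical index maps to a physical slot with a mask, i & (my_array_size - 1), and grow_my_array() re-homes the live items into the doubled array with the same trick. The following standalone sketch (not TBB code: single-threaded, with std::optional slots instead of the explicit construct/destroy and per-slot state used above) shows just that indexing scheme:

    #include <cassert>
    #include <cstddef>
    #include <optional>
    #include <utility>
    #include <vector>

    // Power-of-two ring indexing: head/tail only grow, slot(i) masks them.
    template <typename T>
    class ring_sketch {
        std::vector<std::optional<T>> slots_;
        std::size_t head_ = 0, tail_ = 0;

        std::size_t slot(std::size_t i) const { return i & (slots_.size() - 1); }

        void grow() {
            std::vector<std::optional<T>> bigger(slots_.size() * 2);
            for (std::size_t i = head_; i < tail_; ++i)
                bigger[i & (bigger.size() - 1)] = std::move(slots_[slot(i)]);
            slots_.swap(bigger);
        }

    public:
        ring_sketch() : slots_(4) {}            // like initial_buffer_size above

        void push_back(T v) {
            if (tail_ - head_ == slots_.size()) grow();
            slots_[slot(tail_++)] = std::move(v);
        }

        bool pop_front(T& out) {
            if (head_ == tail_) return false;
            out = std::move(*slots_[slot(head_)]);
            slots_[slot(head_++)].reset();
            return true;
        }
    };

    int main() {
        ring_sketch<int> q;
        for (int i = 0; i < 10; ++i) q.push_back(i);   // forces growth 4 -> 8 -> 16
        int v = -1;
        bool ok = q.pop_front(v);
        assert(ok && v == 0);
        return 0;
    }

The real item_buffer additionally records a per-slot state (no_item / has_item / reserved_item) so that a sequencer can leave holes in the window and the join ports can reserve the front element without popping it.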
diff --git a/contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_join_impl.h b/contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_join_impl.h
new file mode 100644
index 0000000000..98b357cdbc
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_join_impl.h
@@ -0,0 +1,1706 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB__flow_graph_join_impl_H
+#define __TBB__flow_graph_join_impl_H
+
+#ifndef __TBB_flow_graph_H
+#error Do not #include this internal file directly; use public TBB headers instead.
+#endif
+
+// included into namespace tbb::detail::d1
+
+ struct forwarding_base : no_assign {
+ forwarding_base(graph &g) : graph_ref(g) {}
+ virtual ~forwarding_base() {}
+ graph& graph_ref;
+ };
+
+ struct queueing_forwarding_base : forwarding_base {
+ using forwarding_base::forwarding_base;
+ // decrement_port_count may create a forwarding task. If we cannot handle the task
+ // ourselves, ask decrement_port_count to deal with it.
+ virtual graph_task* decrement_port_count(bool handle_task) = 0;
+ };
+
+ struct reserving_forwarding_base : forwarding_base {
+ using forwarding_base::forwarding_base;
+ // decrement_port_count may create a forwarding task. If we cannot handle the task
+ // ourselves, ask decrement_port_count to deal with it.
+ virtual graph_task* decrement_port_count() = 0;
+ virtual void increment_port_count() = 0;
+ };
+
+ // specialization that lets us keep a copy of the current_key for building results.
+ // KeyType can be a reference type.
+ template<typename KeyType>
+ struct matching_forwarding_base : public forwarding_base {
+ typedef typename std::decay<KeyType>::type current_key_type;
+ matching_forwarding_base(graph &g) : forwarding_base(g) { }
+ virtual graph_task* increment_key_count(current_key_type const & /*t*/) = 0;
+ current_key_type current_key; // so ports can refer to FE's desired items
+ };
+
+ template< int N >
+ struct join_helper {
+
+ template< typename TupleType, typename PortType >
+ static inline void set_join_node_pointer(TupleType &my_input, PortType *port) {
+ std::get<N-1>( my_input ).set_join_node_pointer(port);
+ join_helper<N-1>::set_join_node_pointer( my_input, port );
+ }
+ template< typename TupleType >
+ static inline void consume_reservations( TupleType &my_input ) {
+ std::get<N-1>( my_input ).consume();
+ join_helper<N-1>::consume_reservations( my_input );
+ }
+
+ template< typename TupleType >
+ static inline void release_my_reservation( TupleType &my_input ) {
+ std::get<N-1>( my_input ).release();
+ }
+
+ template <typename TupleType>
+ static inline void release_reservations( TupleType &my_input) {
+ join_helper<N-1>::release_reservations(my_input);
+ release_my_reservation(my_input);
+ }
+
+ template< typename InputTuple, typename OutputTuple >
+ static inline bool reserve( InputTuple &my_input, OutputTuple &out) {
+ if ( !std::get<N-1>( my_input ).reserve( std::get<N-1>( out ) ) ) return false;
+ if ( !join_helper<N-1>::reserve( my_input, out ) ) {
+ release_my_reservation( my_input );
+ return false;
+ }
+ return true;
+ }
+
+ template<typename InputTuple, typename OutputTuple>
+ static inline bool get_my_item( InputTuple &my_input, OutputTuple &out) {
+ bool res = std::get<N-1>(my_input).get_item(std::get<N-1>(out) ); // may fail
+ return join_helper<N-1>::get_my_item(my_input, out) && res; // do get on other inputs before returning
+ }
+
+ template<typename InputTuple, typename OutputTuple>
+ static inline bool get_items(InputTuple &my_input, OutputTuple &out) {
+ return get_my_item(my_input, out);
+ }
+
+ template<typename InputTuple>
+ static inline void reset_my_port(InputTuple &my_input) {
+ join_helper<N-1>::reset_my_port(my_input);
+ std::get<N-1>(my_input).reset_port();
+ }
+
+ template<typename InputTuple>
+ static inline void reset_ports(InputTuple& my_input) {
+ reset_my_port(my_input);
+ }
+
+ template<typename InputTuple, typename KeyFuncTuple>
+ static inline void set_key_functors(InputTuple &my_input, KeyFuncTuple &my_key_funcs) {
+ std::get<N-1>(my_input).set_my_key_func(std::get<N-1>(my_key_funcs));
+ std::get<N-1>(my_key_funcs) = nullptr;
+ join_helper<N-1>::set_key_functors(my_input, my_key_funcs);
+ }
+
+ template< typename KeyFuncTuple>
+ static inline void copy_key_functors(KeyFuncTuple &my_inputs, KeyFuncTuple &other_inputs) {
+ __TBB_ASSERT(
+ std::get<N-1>(other_inputs).get_my_key_func(),
+ "key matching join node should not be instantiated without functors."
+ );
+ std::get<N-1>(my_inputs).set_my_key_func(std::get<N-1>(other_inputs).get_my_key_func()->clone());
+ join_helper<N-1>::copy_key_functors(my_inputs, other_inputs);
+ }
+
+ template<typename InputTuple>
+ static inline void reset_inputs(InputTuple &my_input, reset_flags f) {
+ join_helper<N-1>::reset_inputs(my_input, f);
+ std::get<N-1>(my_input).reset_receiver(f);
+ }
+ }; // join_helper<N>
+
+ template< >
+ struct join_helper<1> {
+
+ template< typename TupleType, typename PortType >
+ static inline void set_join_node_pointer(TupleType &my_input, PortType *port) {
+ std::get<0>( my_input ).set_join_node_pointer(port);
+ }
+
+ template< typename TupleType >
+ static inline void consume_reservations( TupleType &my_input ) {
+ std::get<0>( my_input ).consume();
+ }
+
+ template< typename TupleType >
+ static inline void release_my_reservation( TupleType &my_input ) {
+ std::get<0>( my_input ).release();
+ }
+
+ template<typename TupleType>
+ static inline void release_reservations( TupleType &my_input) {
+ release_my_reservation(my_input);
+ }
+
+ template< typename InputTuple, typename OutputTuple >
+ static inline bool reserve( InputTuple &my_input, OutputTuple &out) {
+ return std::get<0>( my_input ).reserve( std::get<0>( out ) );
+ }
+
+ template<typename InputTuple, typename OutputTuple>
+ static inline bool get_my_item( InputTuple &my_input, OutputTuple &out) {
+ return std::get<0>(my_input).get_item(std::get<0>(out));
+ }
+
+ template<typename InputTuple, typename OutputTuple>
+ static inline bool get_items(InputTuple &my_input, OutputTuple &out) {
+ return get_my_item(my_input, out);
+ }
+
+ template<typename InputTuple>
+ static inline void reset_my_port(InputTuple &my_input) {
+ std::get<0>(my_input).reset_port();
+ }
+
+ template<typename InputTuple>
+ static inline void reset_ports(InputTuple& my_input) {
+ reset_my_port(my_input);
+ }
+
+ template<typename InputTuple, typename KeyFuncTuple>
+ static inline void set_key_functors(InputTuple &my_input, KeyFuncTuple &my_key_funcs) {
+ std::get<0>(my_input).set_my_key_func(std::get<0>(my_key_funcs));
+ std::get<0>(my_key_funcs) = nullptr;
+ }
+
+ template< typename KeyFuncTuple>
+ static inline void copy_key_functors(KeyFuncTuple &my_inputs, KeyFuncTuple &other_inputs) {
+ __TBB_ASSERT(
+ std::get<0>(other_inputs).get_my_key_func(),
+ "key matching join node should not be instantiated without functors."
+ );
+ std::get<0>(my_inputs).set_my_key_func(std::get<0>(other_inputs).get_my_key_func()->clone());
+ }
+ template<typename InputTuple>
+ static inline void reset_inputs(InputTuple &my_input, reset_flags f) {
+ std::get<0>(my_input).reset_receiver(f);
+ }
+ }; // join_helper<1>
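+
+    // join_helper<N> walks the input-port tuple at compile time: each static
+    // method acts on port N-1 and then recurses into join_helper<N-1>, with the
+    // join_helper<1> specialization terminating the recursion at port 0. The
+    // front-ends below use it to apply reserve/release/get/reset uniformly to
+    // every port regardless of the tuple arity.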
+
+ //! The two-phase join port
+ template< typename T >
+ class reserving_port : public receiver<T> {
+ public:
+ typedef T input_type;
+ typedef typename receiver<input_type>::predecessor_type predecessor_type;
+
+ private:
+ // ----------- Aggregator ------------
+ enum op_type { reg_pred, rem_pred, res_item, rel_res, con_res
+ };
+ typedef reserving_port<T> class_type;
+
+ class reserving_port_operation : public aggregated_operation<reserving_port_operation> {
+ public:
+ char type;
+ union {
+ T *my_arg;
+ predecessor_type *my_pred;
+ };
+ reserving_port_operation(const T& e, op_type t) :
+ type(char(t)), my_arg(const_cast<T*>(&e)) {}
+ reserving_port_operation(const predecessor_type &s, op_type t) : type(char(t)),
+ my_pred(const_cast<predecessor_type *>(&s)) {}
+ reserving_port_operation(op_type t) : type(char(t)) {}
+ };
+
+ typedef aggregating_functor<class_type, reserving_port_operation> handler_type;
+ friend class aggregating_functor<class_type, reserving_port_operation>;
+ aggregator<handler_type, reserving_port_operation> my_aggregator;
+
+ void handle_operations(reserving_port_operation* op_list) {
+ reserving_port_operation *current;
+ bool was_missing_predecessors = false;
+ while(op_list) {
+ current = op_list;
+ op_list = op_list->next;
+ switch(current->type) {
+ case reg_pred:
+ was_missing_predecessors = my_predecessors.empty();
+ my_predecessors.add(*(current->my_pred));
+ if ( was_missing_predecessors ) {
+ (void) my_join->decrement_port_count(); // may try to forward
+ }
+ current->status.store( SUCCEEDED, std::memory_order_release);
+ break;
+ case rem_pred:
+ if ( !my_predecessors.empty() ) {
+ my_predecessors.remove(*(current->my_pred));
+ if ( my_predecessors.empty() ) // was the last predecessor
+ my_join->increment_port_count();
+ }
+ // TODO: consider returning failure if there were no predecessors to remove
+ current->status.store( SUCCEEDED, std::memory_order_release );
+ break;
+ case res_item:
+ if ( reserved ) {
+ current->status.store( FAILED, std::memory_order_release);
+ }
+ else if ( my_predecessors.try_reserve( *(current->my_arg) ) ) {
+ reserved = true;
+ current->status.store( SUCCEEDED, std::memory_order_release);
+ } else {
+ if ( my_predecessors.empty() ) {
+ my_join->increment_port_count();
+ }
+ current->status.store( FAILED, std::memory_order_release);
+ }
+ break;
+ case rel_res:
+ reserved = false;
+ my_predecessors.try_release( );
+ current->status.store( SUCCEEDED, std::memory_order_release);
+ break;
+ case con_res:
+ reserved = false;
+ my_predecessors.try_consume( );
+ current->status.store( SUCCEEDED, std::memory_order_release);
+ break;
+ }
+ }
+ }
+
+ protected:
+ template< typename R, typename B > friend class run_and_put_task;
+ template<typename X, typename Y> friend class broadcast_cache;
+ template<typename X, typename Y> friend class round_robin_cache;
+ graph_task* try_put_task( const T & ) override {
+ return nullptr;
+ }
+
+ graph& graph_reference() const override {
+ return my_join->graph_ref;
+ }
+
+ public:
+
+ //! Constructor
+ reserving_port() : my_join(nullptr), my_predecessors(this), reserved(false) {
+ my_aggregator.initialize_handler(handler_type(this));
+ }
+
+ // copy constructor
+ reserving_port(const reserving_port& /* other */) = delete;
+
+ void set_join_node_pointer(reserving_forwarding_base *join) {
+ my_join = join;
+ }
+
+ //! Add a predecessor
+ bool register_predecessor( predecessor_type &src ) override {
+ reserving_port_operation op_data(src, reg_pred);
+ my_aggregator.execute(&op_data);
+ return op_data.status == SUCCEEDED;
+ }
+
+ //! Remove a predecessor
+ bool remove_predecessor( predecessor_type &src ) override {
+ reserving_port_operation op_data(src, rem_pred);
+ my_aggregator.execute(&op_data);
+ return op_data.status == SUCCEEDED;
+ }
+
+ //! Reserve an item from the port
+ bool reserve( T &v ) {
+ reserving_port_operation op_data(v, res_item);
+ my_aggregator.execute(&op_data);
+ return op_data.status == SUCCEEDED;
+ }
+
+ //! Release the port
+ void release( ) {
+ reserving_port_operation op_data(rel_res);
+ my_aggregator.execute(&op_data);
+ }
+
+ //! Complete use of the port
+ void consume( ) {
+ reserving_port_operation op_data(con_res);
+ my_aggregator.execute(&op_data);
+ }
+
+ void reset_receiver( reset_flags f) {
+ if(f & rf_clear_edges) my_predecessors.clear();
+ else
+ my_predecessors.reset();
+ reserved = false;
+ __TBB_ASSERT(!(f&rf_clear_edges) || my_predecessors.empty(), "port edges not removed");
+ }
+
+ private:
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+ friend class get_graph_helper;
+#endif
+
+ reserving_forwarding_base *my_join;
+ reservable_predecessor_cache< T, null_mutex > my_predecessors;
+ bool reserved;
+ }; // reserving_port
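+
+    // reserving_port implements the two-phase protocol used by the reserving
+    // front-end: reserve() pulls an item from a predecessor without consuming
+    // it, and the join node later either calls consume() on every port once the
+    // whole tuple has been forwarded, or release() to hand the reserved items
+    // back to their predecessors when tuple building fails.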
+
+ //! queueing join_port
+ template<typename T>
+ class queueing_port : public receiver<T>, public item_buffer<T> {
+ public:
+ typedef T input_type;
+ typedef typename receiver<input_type>::predecessor_type predecessor_type;
+ typedef queueing_port<T> class_type;
+
+ // ----------- Aggregator ------------
+ private:
+ enum op_type { get__item, res_port, try__put_task
+ };
+
+ class queueing_port_operation : public aggregated_operation<queueing_port_operation> {
+ public:
+ char type;
+ T my_val;
+ T* my_arg;
+ graph_task* bypass_t;
+ // constructor for value parameter
+ queueing_port_operation(const T& e, op_type t) :
+ type(char(t)), my_val(e)
+ , bypass_t(nullptr)
+ {}
+ // constructor for pointer parameter
+ queueing_port_operation(const T* p, op_type t) :
+ type(char(t)), my_arg(const_cast<T*>(p))
+ , bypass_t(nullptr)
+ {}
+ // constructor with no parameter
+ queueing_port_operation(op_type t) : type(char(t))
+ , bypass_t(nullptr)
+ {}
+ };
+
+ typedef aggregating_functor<class_type, queueing_port_operation> handler_type;
+ friend class aggregating_functor<class_type, queueing_port_operation>;
+ aggregator<handler_type, queueing_port_operation> my_aggregator;
+
+ void handle_operations(queueing_port_operation* op_list) {
+ queueing_port_operation *current;
+ bool was_empty;
+ while(op_list) {
+ current = op_list;
+ op_list = op_list->next;
+ switch(current->type) {
+ case try__put_task: {
+ graph_task* rtask = nullptr;
+ was_empty = this->buffer_empty();
+ this->push_back(current->my_val);
+ if (was_empty) rtask = my_join->decrement_port_count(false);
+ else
+ rtask = SUCCESSFULLY_ENQUEUED;
+ current->bypass_t = rtask;
+ current->status.store( SUCCEEDED, std::memory_order_release);
+ }
+ break;
+ case get__item:
+ if(!this->buffer_empty()) {
+ *(current->my_arg) = this->front();
+ current->status.store( SUCCEEDED, std::memory_order_release);
+ }
+ else {
+ current->status.store( FAILED, std::memory_order_release);
+ }
+ break;
+ case res_port:
+ __TBB_ASSERT(this->my_item_valid(this->my_head), "No item to reset");
+ this->destroy_front();
+ if(this->my_item_valid(this->my_head)) {
+ (void)my_join->decrement_port_count(true);
+ }
+ current->status.store( SUCCEEDED, std::memory_order_release);
+ break;
+ }
+ }
+ }
+ // ------------ End Aggregator ---------------
+
+ protected:
+ template< typename R, typename B > friend class run_and_put_task;
+ template<typename X, typename Y> friend class broadcast_cache;
+ template<typename X, typename Y> friend class round_robin_cache;
+ graph_task* try_put_task(const T &v) override {
+ queueing_port_operation op_data(v, try__put_task);
+ my_aggregator.execute(&op_data);
+ __TBB_ASSERT(op_data.status == SUCCEEDED || !op_data.bypass_t, "inconsistent return from aggregator");
+ if(!op_data.bypass_t) return SUCCESSFULLY_ENQUEUED;
+ return op_data.bypass_t;
+ }
+
+ graph& graph_reference() const override {
+ return my_join->graph_ref;
+ }
+
+ public:
+
+ //! Constructor
+ queueing_port() : item_buffer<T>() {
+ my_join = nullptr;
+ my_aggregator.initialize_handler(handler_type(this));
+ }
+
+ //! copy constructor
+ queueing_port(const queueing_port& /* other */) = delete;
+
+ //! record parent for tallying available items
+ void set_join_node_pointer(queueing_forwarding_base *join) {
+ my_join = join;
+ }
+
+ bool get_item( T &v ) {
+ queueing_port_operation op_data(&v, get__item);
+ my_aggregator.execute(&op_data);
+ return op_data.status == SUCCEEDED;
+ }
+
+ // reset_port is called when item is accepted by successor, but
+ // is initiated by join_node.
+ void reset_port() {
+ queueing_port_operation op_data(res_port);
+ my_aggregator.execute(&op_data);
+ return;
+ }
+
+ void reset_receiver(reset_flags) {
+ item_buffer<T>::reset();
+ }
+
+ private:
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+ friend class get_graph_helper;
+#endif
+
+ queueing_forwarding_base *my_join;
+ }; // queueing_port
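+
+    // queueing_port, unlike reserving_port, accepts and buffers every incoming
+    // item immediately (via its item_buffer base). Only the transition from
+    // empty to non-empty notifies the front-end through decrement_port_count();
+    // once every port is non-empty the front-end pops one item per port to
+    // build a tuple, and reset_port() re-arms the count if more items are
+    // already queued.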
+
+#include "_flow_graph_tagged_buffer_impl.h"
+
+ template<typename K>
+ struct count_element {
+ K my_key;
+ size_t my_value;
+ };
+
+ // method to access the key in the counting table
+ // the ref has already been removed from K
+ template< typename K >
+ struct key_to_count_functor {
+ typedef count_element<K> table_item_type;
+ const K& operator()(const table_item_type& v) { return v.my_key; }
+ };
+
+ // the ports can have only one template parameter. We wrap the types needed in
+ // a traits type
+ template< class TraitsType >
+ class key_matching_port :
+ public receiver<typename TraitsType::T>,
+ public hash_buffer< typename TraitsType::K, typename TraitsType::T, typename TraitsType::TtoK,
+ typename TraitsType::KHash > {
+ public:
+ typedef TraitsType traits;
+ typedef key_matching_port<traits> class_type;
+ typedef typename TraitsType::T input_type;
+ typedef typename TraitsType::K key_type;
+ typedef typename std::decay<key_type>::type noref_key_type;
+ typedef typename receiver<input_type>::predecessor_type predecessor_type;
+ typedef typename TraitsType::TtoK type_to_key_func_type;
+ typedef typename TraitsType::KHash hash_compare_type;
+ typedef hash_buffer< key_type, input_type, type_to_key_func_type, hash_compare_type > buffer_type;
+
+ private:
+// ----------- Aggregator ------------
+ private:
+ enum op_type { try__put, get__item, res_port
+ };
+
+ class key_matching_port_operation : public aggregated_operation<key_matching_port_operation> {
+ public:
+ char type;
+ input_type my_val;
+ input_type *my_arg;
+ // constructor for value parameter
+ key_matching_port_operation(const input_type& e, op_type t) :
+ type(char(t)), my_val(e) {}
+ // constructor for pointer parameter
+ key_matching_port_operation(const input_type* p, op_type t) :
+ type(char(t)), my_arg(const_cast<input_type*>(p)) {}
+ // constructor with no parameter
+ key_matching_port_operation(op_type t) : type(char(t)) {}
+ };
+
+ typedef aggregating_functor<class_type, key_matching_port_operation> handler_type;
+ friend class aggregating_functor<class_type, key_matching_port_operation>;
+ aggregator<handler_type, key_matching_port_operation> my_aggregator;
+
+ void handle_operations(key_matching_port_operation* op_list) {
+ key_matching_port_operation *current;
+ while(op_list) {
+ current = op_list;
+ op_list = op_list->next;
+ switch(current->type) {
+ case try__put: {
+ bool was_inserted = this->insert_with_key(current->my_val);
+ // return failure if a duplicate insertion occurs
+ current->status.store( was_inserted ? SUCCEEDED : FAILED, std::memory_order_release);
+ }
+ break;
+ case get__item:
+ // use current_key from FE for item
+ if(!this->find_with_key(my_join->current_key, *(current->my_arg))) {
+ __TBB_ASSERT(false, "Failed to find item corresponding to current_key.");
+ }
+ current->status.store( SUCCEEDED, std::memory_order_release);
+ break;
+ case res_port:
+ // use current_key from FE for item
+ this->delete_with_key(my_join->current_key);
+ current->status.store( SUCCEEDED, std::memory_order_release);
+ break;
+ }
+ }
+ }
+// ------------ End Aggregator ---------------
+ protected:
+ template< typename R, typename B > friend class run_and_put_task;
+ template<typename X, typename Y> friend class broadcast_cache;
+ template<typename X, typename Y> friend class round_robin_cache;
+ graph_task* try_put_task(const input_type& v) override {
+ key_matching_port_operation op_data(v, try__put);
+ graph_task* rtask = nullptr;
+ my_aggregator.execute(&op_data);
+ if(op_data.status == SUCCEEDED) {
+ rtask = my_join->increment_key_count((*(this->get_key_func()))(v)); // may spawn
+ // rtask has to reflect the return status of the try_put
+ if(!rtask) rtask = SUCCESSFULLY_ENQUEUED;
+ }
+ return rtask;
+ }
+
+ graph& graph_reference() const override {
+ return my_join->graph_ref;
+ }
+
+ public:
+
+ key_matching_port() : receiver<input_type>(), buffer_type() {
+ my_join = nullptr;
+ my_aggregator.initialize_handler(handler_type(this));
+ }
+
+ // copy constructor
+ key_matching_port(const key_matching_port& /*other*/) = delete;
+#if __INTEL_COMPILER <= 2021
+ // Suppress superfluous diagnostic about virtual keyword absence in a destructor of an inherited
+        // class while the parent class has the virtual keyword for the destructor.
+ virtual
+#endif
+ ~key_matching_port() { }
+
+ void set_join_node_pointer(forwarding_base *join) {
+ my_join = dynamic_cast<matching_forwarding_base<key_type>*>(join);
+ }
+
+ void set_my_key_func(type_to_key_func_type *f) { this->set_key_func(f); }
+
+ type_to_key_func_type* get_my_key_func() { return this->get_key_func(); }
+
+ bool get_item( input_type &v ) {
+ // aggregator uses current_key from FE for Key
+ key_matching_port_operation op_data(&v, get__item);
+ my_aggregator.execute(&op_data);
+ return op_data.status == SUCCEEDED;
+ }
+
+ // reset_port is called when item is accepted by successor, but
+ // is initiated by join_node.
+ void reset_port() {
+ key_matching_port_operation op_data(res_port);
+ my_aggregator.execute(&op_data);
+ return;
+ }
+
+ void reset_receiver(reset_flags ) {
+ buffer_type::reset();
+ }
+
+ private:
+ // my_join forwarding base used to count number of inputs that
+ // received key.
+ matching_forwarding_base<key_type> *my_join;
+ }; // key_matching_port
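+
+    // key_matching_port buffers arriving items in a hash_buffer keyed by the
+    // user-supplied key function. Each successful insertion notifies the
+    // front-end via increment_key_count(); the front-end keeps a per-key
+    // arrival count and, once all N ports have seen a given key, pulls the
+    // matching items back out of the ports (get_item/reset_port) to assemble
+    // the output tuple.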
+
+ using namespace graph_policy_namespace;
+
+ template<typename JP, typename InputTuple, typename OutputTuple>
+ class join_node_base;
+
+ //! join_node_FE : implements input port policy
+ template<typename JP, typename InputTuple, typename OutputTuple>
+ class join_node_FE;
+
+ template<typename InputTuple, typename OutputTuple>
+ class join_node_FE<reserving, InputTuple, OutputTuple> : public reserving_forwarding_base {
+ public:
+ static const int N = std::tuple_size<OutputTuple>::value;
+ typedef OutputTuple output_type;
+ typedef InputTuple input_type;
+ typedef join_node_base<reserving, InputTuple, OutputTuple> base_node_type; // for forwarding
+
+ join_node_FE(graph &g) : reserving_forwarding_base(g), my_node(nullptr) {
+ ports_with_no_inputs = N;
+ join_helper<N>::set_join_node_pointer(my_inputs, this);
+ }
+
+ join_node_FE(const join_node_FE& other) : reserving_forwarding_base((other.reserving_forwarding_base::graph_ref)), my_node(nullptr) {
+ ports_with_no_inputs = N;
+ join_helper<N>::set_join_node_pointer(my_inputs, this);
+ }
+
+ void set_my_node(base_node_type *new_my_node) { my_node = new_my_node; }
+
+ void increment_port_count() override {
+ ++ports_with_no_inputs;
+ }
+
+ // if all input_ports have predecessors, spawn forward to try and consume tuples
+ graph_task* decrement_port_count() override {
+ if(ports_with_no_inputs.fetch_sub(1) == 1) {
+ if(is_graph_active(this->graph_ref)) {
+ small_object_allocator allocator{};
+ typedef forward_task_bypass<base_node_type> task_type;
+ graph_task* t = allocator.new_object<task_type>(graph_ref, allocator, *my_node);
+ graph_ref.reserve_wait();
+ spawn_in_graph_arena(this->graph_ref, *t);
+ }
+ }
+ return nullptr;
+ }
+
+ input_type &input_ports() { return my_inputs; }
+
+ protected:
+
+ void reset( reset_flags f) {
+ // called outside of parallel contexts
+ ports_with_no_inputs = N;
+ join_helper<N>::reset_inputs(my_inputs, f);
+ }
+
+ // all methods on input ports should be called under mutual exclusion from join_node_base.
+
+ bool tuple_build_may_succeed() {
+ return !ports_with_no_inputs;
+ }
+
+ bool try_to_make_tuple(output_type &out) {
+ if(ports_with_no_inputs) return false;
+ return join_helper<N>::reserve(my_inputs, out);
+ }
+
+ void tuple_accepted() {
+ join_helper<N>::consume_reservations(my_inputs);
+ }
+ void tuple_rejected() {
+ join_helper<N>::release_reservations(my_inputs);
+ }
+
+ input_type my_inputs;
+ base_node_type *my_node;
+ std::atomic<std::size_t> ports_with_no_inputs;
+ }; // join_node_FE<reserving, ... >
+
+ template<typename InputTuple, typename OutputTuple>
+ class join_node_FE<queueing, InputTuple, OutputTuple> : public queueing_forwarding_base {
+ public:
+ static const int N = std::tuple_size<OutputTuple>::value;
+ typedef OutputTuple output_type;
+ typedef InputTuple input_type;
+ typedef join_node_base<queueing, InputTuple, OutputTuple> base_node_type; // for forwarding
+
+ join_node_FE(graph &g) : queueing_forwarding_base(g), my_node(nullptr) {
+ ports_with_no_items = N;
+ join_helper<N>::set_join_node_pointer(my_inputs, this);
+ }
+
+ join_node_FE(const join_node_FE& other) : queueing_forwarding_base((other.queueing_forwarding_base::graph_ref)), my_node(nullptr) {
+ ports_with_no_items = N;
+ join_helper<N>::set_join_node_pointer(my_inputs, this);
+ }
+
+ // needed for forwarding
+ void set_my_node(base_node_type *new_my_node) { my_node = new_my_node; }
+
+ void reset_port_count() {
+ ports_with_no_items = N;
+ }
+
+ // if all input_ports have items, spawn forward to try and consume tuples
+ graph_task* decrement_port_count(bool handle_task) override
+ {
+ if(ports_with_no_items.fetch_sub(1) == 1) {
+ if(is_graph_active(this->graph_ref)) {
+ small_object_allocator allocator{};
+ typedef forward_task_bypass<base_node_type> task_type;
+ graph_task* t = allocator.new_object<task_type>(graph_ref, allocator, *my_node);
+ graph_ref.reserve_wait();
+ if( !handle_task )
+ return t;
+ spawn_in_graph_arena(this->graph_ref, *t);
+ }
+ }
+ return nullptr;
+ }
+
+ input_type &input_ports() { return my_inputs; }
+
+ protected:
+
+ void reset( reset_flags f) {
+ reset_port_count();
+ join_helper<N>::reset_inputs(my_inputs, f );
+ }
+
+ // all methods on input ports should be called under mutual exclusion from join_node_base.
+
+ bool tuple_build_may_succeed() {
+ return !ports_with_no_items;
+ }
+
+ bool try_to_make_tuple(output_type &out) {
+ if(ports_with_no_items) return false;
+ return join_helper<N>::get_items(my_inputs, out);
+ }
+
+ void tuple_accepted() {
+ reset_port_count();
+ join_helper<N>::reset_ports(my_inputs);
+ }
+ void tuple_rejected() {
+ // nothing to do.
+ }
+
+ input_type my_inputs;
+ base_node_type *my_node;
+ std::atomic<std::size_t> ports_with_no_items;
+ }; // join_node_FE<queueing, ...>
+
+ // key_matching join front-end.
+ template<typename InputTuple, typename OutputTuple, typename K, typename KHash>
+ class join_node_FE<key_matching<K,KHash>, InputTuple, OutputTuple> : public matching_forwarding_base<K>,
+ // buffer of key value counts
+ public hash_buffer< // typedefed below to key_to_count_buffer_type
+ typename std::decay<K>::type&, // force ref type on K
+ count_element<typename std::decay<K>::type>,
+ type_to_key_function_body<
+ count_element<typename std::decay<K>::type>,
+ typename std::decay<K>::type& >,
+ KHash >,
+ // buffer of output items
+ public item_buffer<OutputTuple> {
+ public:
+ static const int N = std::tuple_size<OutputTuple>::value;
+ typedef OutputTuple output_type;
+ typedef InputTuple input_type;
+ typedef K key_type;
+ typedef typename std::decay<key_type>::type unref_key_type;
+ typedef KHash key_hash_compare;
+ // must use K without ref.
+ typedef count_element<unref_key_type> count_element_type;
+ // method that lets us refer to the key of this type.
+ typedef key_to_count_functor<unref_key_type> key_to_count_func;
+ typedef type_to_key_function_body< count_element_type, unref_key_type&> TtoK_function_body_type;
+ typedef type_to_key_function_body_leaf<count_element_type, unref_key_type&, key_to_count_func> TtoK_function_body_leaf_type;
+ // this is the type of the special table that keeps track of the number of discrete
+ // elements corresponding to each key that we've seen.
+ typedef hash_buffer< unref_key_type&, count_element_type, TtoK_function_body_type, key_hash_compare >
+ key_to_count_buffer_type;
+ typedef item_buffer<output_type> output_buffer_type;
+ typedef join_node_base<key_matching<key_type,key_hash_compare>, InputTuple, OutputTuple> base_node_type; // for forwarding
+ typedef matching_forwarding_base<key_type> forwarding_base_type;
+
+// ----------- Aggregator ------------
+    // the aggregator is only needed to serialize access to the hash table
+    // and the output_buffer_type base class.
+ private:
+ enum op_type { res_count, inc_count, may_succeed, try_make };
+ typedef join_node_FE<key_matching<key_type,key_hash_compare>, InputTuple, OutputTuple> class_type;
+
+ class key_matching_FE_operation : public aggregated_operation<key_matching_FE_operation> {
+ public:
+ char type;
+ unref_key_type my_val;
+ output_type* my_output;
+ graph_task* bypass_t;
+ // constructor for value parameter
+ key_matching_FE_operation(const unref_key_type& e , op_type t) : type(char(t)), my_val(e),
+ my_output(nullptr), bypass_t(nullptr) {}
+ key_matching_FE_operation(output_type *p, op_type t) : type(char(t)), my_output(p), bypass_t(nullptr) {}
+ // constructor with no parameter
+ key_matching_FE_operation(op_type t) : type(char(t)), my_output(nullptr), bypass_t(nullptr) {}
+ };
+
+ typedef aggregating_functor<class_type, key_matching_FE_operation> handler_type;
+ friend class aggregating_functor<class_type, key_matching_FE_operation>;
+ aggregator<handler_type, key_matching_FE_operation> my_aggregator;
+
+ // called from aggregator, so serialized
+        // returns a task pointer if a task would have been enqueued but we asked that
+ // it be returned. Otherwise returns nullptr.
+ graph_task* fill_output_buffer(unref_key_type &t) {
+ output_type l_out;
+ graph_task* rtask = nullptr;
+ bool do_fwd = this->buffer_empty() && is_graph_active(this->graph_ref);
+ this->current_key = t;
+ this->delete_with_key(this->current_key); // remove the key
+ if(join_helper<N>::get_items(my_inputs, l_out)) { // <== call back
+ this->push_back(l_out);
+ if(do_fwd) { // we enqueue if receiving an item from predecessor, not if successor asks for item
+ small_object_allocator allocator{};
+ typedef forward_task_bypass<base_node_type> task_type;
+ rtask = allocator.new_object<task_type>(this->graph_ref, allocator, *my_node);
+ this->graph_ref.reserve_wait();
+ do_fwd = false;
+ }
+ // retire the input values
+ join_helper<N>::reset_ports(my_inputs); // <== call back
+ }
+ else {
+ __TBB_ASSERT(false, "should have had something to push");
+ }
+ return rtask;
+ }
+
+ void handle_operations(key_matching_FE_operation* op_list) {
+ key_matching_FE_operation *current;
+ while(op_list) {
+ current = op_list;
+ op_list = op_list->next;
+ switch(current->type) {
+ case res_count: // called from BE
+ {
+ this->destroy_front();
+ current->status.store( SUCCEEDED, std::memory_order_release);
+ }
+ break;
+ case inc_count: { // called from input ports
+ count_element_type *p = 0;
+ unref_key_type &t = current->my_val;
+ if(!(this->find_ref_with_key(t,p))) {
+ count_element_type ev;
+ ev.my_key = t;
+ ev.my_value = 0;
+ this->insert_with_key(ev);
+ bool found = this->find_ref_with_key(t, p);
+ __TBB_ASSERT_EX(found, "should find key after inserting it");
+ }
+ if(++(p->my_value) == size_t(N)) {
+ current->bypass_t = fill_output_buffer(t);
+ }
+ }
+ current->status.store( SUCCEEDED, std::memory_order_release);
+ break;
+ case may_succeed: // called from BE
+ current->status.store( this->buffer_empty() ? FAILED : SUCCEEDED, std::memory_order_release);
+ break;
+ case try_make: // called from BE
+ if(this->buffer_empty()) {
+ current->status.store( FAILED, std::memory_order_release);
+ }
+ else {
+ *(current->my_output) = this->front();
+ current->status.store( SUCCEEDED, std::memory_order_release);
+ }
+ break;
+ }
+ }
+ }
+// ------------ End Aggregator ---------------
+
+ public:
+ template<typename FunctionTuple>
+ join_node_FE(graph &g, FunctionTuple &TtoK_funcs) : forwarding_base_type(g), my_node(nullptr) {
+ join_helper<N>::set_join_node_pointer(my_inputs, this);
+ join_helper<N>::set_key_functors(my_inputs, TtoK_funcs);
+ my_aggregator.initialize_handler(handler_type(this));
+ TtoK_function_body_type *cfb = new TtoK_function_body_leaf_type(key_to_count_func());
+ this->set_key_func(cfb);
+ }
+
+ join_node_FE(const join_node_FE& other) : forwarding_base_type((other.forwarding_base_type::graph_ref)), key_to_count_buffer_type(),
+ output_buffer_type() {
+ my_node = nullptr;
+ join_helper<N>::set_join_node_pointer(my_inputs, this);
+ join_helper<N>::copy_key_functors(my_inputs, const_cast<input_type &>(other.my_inputs));
+ my_aggregator.initialize_handler(handler_type(this));
+ TtoK_function_body_type *cfb = new TtoK_function_body_leaf_type(key_to_count_func());
+ this->set_key_func(cfb);
+ }
+
+ // needed for forwarding
+ void set_my_node(base_node_type *new_my_node) { my_node = new_my_node; }
+
+ void reset_port_count() { // called from BE
+ key_matching_FE_operation op_data(res_count);
+ my_aggregator.execute(&op_data);
+ return;
+ }
+
+ // if all input_ports have items, spawn forward to try and consume tuples
+ // return a task if we are asked and did create one.
+ graph_task *increment_key_count(unref_key_type const & t) override { // called from input_ports
+ key_matching_FE_operation op_data(t, inc_count);
+ my_aggregator.execute(&op_data);
+ return op_data.bypass_t;
+ }
+
+ input_type &input_ports() { return my_inputs; }
+
+ protected:
+
+ void reset( reset_flags f ) {
+ // called outside of parallel contexts
+ join_helper<N>::reset_inputs(my_inputs, f);
+
+ key_to_count_buffer_type::reset();
+ output_buffer_type::reset();
+ }
+
+ // all methods on input ports should be called under mutual exclusion from join_node_base.
+
+ bool tuple_build_may_succeed() { // called from back-end
+ key_matching_FE_operation op_data(may_succeed);
+ my_aggregator.execute(&op_data);
+ return op_data.status == SUCCEEDED;
+ }
+
+ // cannot lock while calling back to input_ports. current_key will only be set
+ // and reset under the aggregator, so it will remain consistent.
+ bool try_to_make_tuple(output_type &out) {
+ key_matching_FE_operation op_data(&out,try_make);
+ my_aggregator.execute(&op_data);
+ return op_data.status == SUCCEEDED;
+ }
+
+ void tuple_accepted() {
+ reset_port_count(); // reset current_key after ports reset.
+ }
+
+ void tuple_rejected() {
+ // nothing to do.
+ }
+
+ input_type my_inputs; // input ports
+ base_node_type *my_node;
+ }; // join_node_FE<key_matching<K,KHash>, InputTuple, OutputTuple>
+
+ //! join_node_base
+ template<typename JP, typename InputTuple, typename OutputTuple>
+ class join_node_base : public graph_node, public join_node_FE<JP, InputTuple, OutputTuple>,
+ public sender<OutputTuple> {
+ protected:
+ using graph_node::my_graph;
+ public:
+ typedef OutputTuple output_type;
+
+ typedef typename sender<output_type>::successor_type successor_type;
+ typedef join_node_FE<JP, InputTuple, OutputTuple> input_ports_type;
+ using input_ports_type::tuple_build_may_succeed;
+ using input_ports_type::try_to_make_tuple;
+ using input_ports_type::tuple_accepted;
+ using input_ports_type::tuple_rejected;
+
+ private:
+ // ----------- Aggregator ------------
+ enum op_type { reg_succ, rem_succ, try__get, do_fwrd, do_fwrd_bypass
+ };
+ typedef join_node_base<JP,InputTuple,OutputTuple> class_type;
+
+ class join_node_base_operation : public aggregated_operation<join_node_base_operation> {
+ public:
+ char type;
+ union {
+ output_type *my_arg;
+ successor_type *my_succ;
+ };
+ graph_task* bypass_t;
+ join_node_base_operation(const output_type& e, op_type t) : type(char(t)),
+ my_arg(const_cast<output_type*>(&e)), bypass_t(nullptr) {}
+ join_node_base_operation(const successor_type &s, op_type t) : type(char(t)),
+ my_succ(const_cast<successor_type *>(&s)), bypass_t(nullptr) {}
+ join_node_base_operation(op_type t) : type(char(t)), bypass_t(nullptr) {}
+ };
+
+ typedef aggregating_functor<class_type, join_node_base_operation> handler_type;
+ friend class aggregating_functor<class_type, join_node_base_operation>;
+ bool forwarder_busy;
+ aggregator<handler_type, join_node_base_operation> my_aggregator;
+
+ void handle_operations(join_node_base_operation* op_list) {
+ join_node_base_operation *current;
+ while(op_list) {
+ current = op_list;
+ op_list = op_list->next;
+ switch(current->type) {
+ case reg_succ: {
+ my_successors.register_successor(*(current->my_succ));
+ if(tuple_build_may_succeed() && !forwarder_busy && is_graph_active(my_graph)) {
+ small_object_allocator allocator{};
+ typedef forward_task_bypass< join_node_base<JP, InputTuple, OutputTuple> > task_type;
+ graph_task* t = allocator.new_object<task_type>(my_graph, allocator, *this);
+ my_graph.reserve_wait();
+ spawn_in_graph_arena(my_graph, *t);
+ forwarder_busy = true;
+ }
+ current->status.store( SUCCEEDED, std::memory_order_release);
+ }
+ break;
+ case rem_succ:
+ my_successors.remove_successor(*(current->my_succ));
+ current->status.store( SUCCEEDED, std::memory_order_release);
+ break;
+ case try__get:
+ if(tuple_build_may_succeed()) {
+ if(try_to_make_tuple(*(current->my_arg))) {
+ tuple_accepted();
+ current->status.store( SUCCEEDED, std::memory_order_release);
+ }
+ else current->status.store( FAILED, std::memory_order_release);
+ }
+ else current->status.store( FAILED, std::memory_order_release);
+ break;
+ case do_fwrd_bypass: {
+ bool build_succeeded;
+ graph_task *last_task = nullptr;
+ output_type out;
+ // forwarding must be exclusive, because try_to_make_tuple and tuple_accepted
+ // are separate locked methods in the FE. We could conceivably fetch the front
+ // of the FE queue, then be swapped out, have someone else consume the FE's
+ // object, then come back, forward, and then try to remove it from the queue
+ // again. Without reservation of the FE, the methods accessing it must be locked.
+ // We could remember the keys of the objects we forwarded, and then remove
+ // them from the input ports after forwarding is complete?
+ if(tuple_build_may_succeed()) { // checks output queue of FE
+ do {
+ build_succeeded = try_to_make_tuple(out); // fetch front_end of queue
+ if(build_succeeded) {
+ graph_task *new_task = my_successors.try_put_task(out);
+ last_task = combine_tasks(my_graph, last_task, new_task);
+ if(new_task) {
+ tuple_accepted();
+ }
+ else {
+ tuple_rejected();
+ build_succeeded = false;
+ }
+ }
+ } while(build_succeeded);
+ }
+ current->bypass_t = last_task;
+ current->status.store( SUCCEEDED, std::memory_order_release);
+ forwarder_busy = false;
+ }
+ break;
+ }
+ }
+ }
+ // ---------- end aggregator -----------
+ public:
+ join_node_base(graph &g)
+ : graph_node(g), input_ports_type(g), forwarder_busy(false), my_successors(this)
+ {
+ input_ports_type::set_my_node(this);
+ my_aggregator.initialize_handler(handler_type(this));
+ }
+
+ join_node_base(const join_node_base& other) :
+ graph_node(other.graph_node::my_graph), input_ports_type(other),
+ sender<OutputTuple>(), forwarder_busy(false), my_successors(this)
+ {
+ input_ports_type::set_my_node(this);
+ my_aggregator.initialize_handler(handler_type(this));
+ }
+
+ template<typename FunctionTuple>
+ join_node_base(graph &g, FunctionTuple f)
+ : graph_node(g), input_ports_type(g, f), forwarder_busy(false), my_successors(this)
+ {
+ input_ports_type::set_my_node(this);
+ my_aggregator.initialize_handler(handler_type(this));
+ }
+
+ bool register_successor(successor_type &r) override {
+ join_node_base_operation op_data(r, reg_succ);
+ my_aggregator.execute(&op_data);
+ return op_data.status == SUCCEEDED;
+ }
+
+ bool remove_successor( successor_type &r) override {
+ join_node_base_operation op_data(r, rem_succ);
+ my_aggregator.execute(&op_data);
+ return op_data.status == SUCCEEDED;
+ }
+
+ bool try_get( output_type &v) override {
+ join_node_base_operation op_data(v, try__get);
+ my_aggregator.execute(&op_data);
+ return op_data.status == SUCCEEDED;
+ }
+
+ protected:
+ void reset_node(reset_flags f) override {
+ input_ports_type::reset(f);
+ if(f & rf_clear_edges) my_successors.clear();
+ }
+
+ private:
+ broadcast_cache<output_type, null_rw_mutex> my_successors;
+
+ friend class forward_task_bypass< join_node_base<JP, InputTuple, OutputTuple> >;
+ graph_task *forward_task() {
+ join_node_base_operation op_data(do_fwrd_bypass);
+ my_aggregator.execute(&op_data);
+ return op_data.bypass_t;
+ }
+
+ }; // join_node_base
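+ // A minimal sketch of the public interface that join_node_base implements above
+ // (register_successor, try_get and the forwarding task). It assumes only the public
+ // oneapi::tbb::flow API; the node names and values are illustrative:
+ //
+ //   #include <oneapi/tbb/flow_graph.h>
+ //   #include <tuple>
+ //
+ //   int main() {
+ //       using namespace oneapi::tbb::flow;
+ //       graph g;
+ //       buffer_node<int>   b1(g);   // reserving joins pull from reservable predecessors
+ //       buffer_node<float> b2(g);
+ //       join_node<std::tuple<int, float>, reserving> j(g);
+ //       make_edge(b1, input_port<0>(j));
+ //       make_edge(b2, input_port<1>(j));
+ //       queue_node<std::tuple<int, float>> out(g);
+ //       make_edge(j, out);               // goes through join_node_base::register_successor
+ //       b1.try_put(7);
+ //       b2.try_put(1.5f);
+ //       g.wait_for_all();
+ //       std::tuple<int, float> t;
+ //       bool ok = out.try_get(t);        // expected: the pair (7, 1.5f)
+ //       (void)ok;
+ //   }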
+
+ // join base class type generator
+ template<int N, template<class> class PT, typename OutputTuple, typename JP>
+ struct join_base {
+ typedef join_node_base<JP, typename wrap_tuple_elements<N,PT,OutputTuple>::type, OutputTuple> type;
+ };
+
+ template<int N, typename OutputTuple, typename K, typename KHash>
+ struct join_base<N, key_matching_port, OutputTuple, key_matching<K,KHash> > {
+ typedef key_matching<K, KHash> key_traits_type;
+ typedef K key_type;
+ typedef KHash key_hash_compare;
+ typedef join_node_base< key_traits_type,
+ // ports type
+ typename wrap_key_tuple_elements<N,key_matching_port,key_traits_type,OutputTuple>::type,
+ OutputTuple > type;
+ };
+
+    //! unfolded_join_node : passes input_ports_type to join_node_base. The input port type is
+    //  built with wrap_tuple_elements over the OutputTuple. The class template PT is the port type
+    //  (reserving_port, queueing_port, or key_matching_port) and must correspond to the join policy JP.
+
+ template<int N, template<class> class PT, typename OutputTuple, typename JP>
+ class unfolded_join_node : public join_base<N,PT,OutputTuple,JP>::type {
+ public:
+ typedef typename wrap_tuple_elements<N, PT, OutputTuple>::type input_ports_type;
+ typedef OutputTuple output_type;
+ private:
+ typedef join_node_base<JP, input_ports_type, output_type > base_type;
+ public:
+ unfolded_join_node(graph &g) : base_type(g) {}
+ unfolded_join_node(const unfolded_join_node &other) : base_type(other) {}
+ };
+
+#if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING
+ template <typename K, typename T>
+ struct key_from_message_body {
+ K operator()(const T& t) const {
+ return key_from_message<K>(t);
+ }
+ };
+ // Adds const to reference type
+ template <typename K, typename T>
+ struct key_from_message_body<K&,T> {
+ const K& operator()(const T& t) const {
+ return key_from_message<const K&>(t);
+ }
+ };
+#endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */
+ // key_matching unfolded_join_node. This must be a separate specialization because the constructors
+ // differ.
+
+ template<typename OutputTuple, typename K, typename KHash>
+ class unfolded_join_node<2,key_matching_port,OutputTuple,key_matching<K,KHash> > : public
+ join_base<2,key_matching_port,OutputTuple,key_matching<K,KHash> >::type {
+ typedef typename std::tuple_element<0, OutputTuple>::type T0;
+ typedef typename std::tuple_element<1, OutputTuple>::type T1;
+ public:
+ typedef typename wrap_key_tuple_elements<2,key_matching_port,key_matching<K,KHash>,OutputTuple>::type input_ports_type;
+ typedef OutputTuple output_type;
+ private:
+ typedef join_node_base<key_matching<K,KHash>, input_ports_type, output_type > base_type;
+ typedef type_to_key_function_body<T0, K> *f0_p;
+ typedef type_to_key_function_body<T1, K> *f1_p;
+ typedef std::tuple< f0_p, f1_p > func_initializer_type;
+ public:
+#if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING
+ unfolded_join_node(graph &g) : base_type(g,
+ func_initializer_type(
+ new type_to_key_function_body_leaf<T0, K, key_from_message_body<K,T0> >(key_from_message_body<K,T0>()),
+ new type_to_key_function_body_leaf<T1, K, key_from_message_body<K,T1> >(key_from_message_body<K,T1>())
+ ) ) {
+ }
+#endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */
+ template<typename Body0, typename Body1>
+ unfolded_join_node(graph &g, Body0 body0, Body1 body1) : base_type(g,
+ func_initializer_type(
+ new type_to_key_function_body_leaf<T0, K, Body0>(body0),
+ new type_to_key_function_body_leaf<T1, K, Body1>(body1)
+ ) ) {
+ static_assert(std::tuple_size<OutputTuple>::value == 2, "wrong number of body initializers");
+ }
+ unfolded_join_node(const unfolded_join_node &other) : base_type(other) {}
+ };
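+ // A sketch of the user-facing construction this specialization supports: the two key-extractor
+ // bodies handed to a key_matching join_node end up wrapped in type_to_key_function_body_leaf
+ // objects exactly as above. Only the public oneapi::tbb::flow API is assumed; Left/Right are
+ // hypothetical user types:
+ //
+ //   #include <oneapi/tbb/flow_graph.h>
+ //   #include <string>
+ //   #include <tuple>
+ //
+ //   struct Left  { int id; std::string text; };
+ //   struct Right { int id; double value; };
+ //
+ //   int main() {
+ //       using namespace oneapi::tbb::flow;
+ //       graph g;
+ //       join_node<std::tuple<Left, Right>, key_matching<int>> j(g,
+ //           [](const Left&  l) { return l.id; },     // body0: key for port 0
+ //           [](const Right& r) { return r.id; });    // body1: key for port 1
+ //       input_port<0>(j).try_put(Left{1, "a"});
+ //       input_port<1>(j).try_put(Right{1, 3.14});
+ //       g.wait_for_all();
+ //       std::tuple<Left, Right> matched;
+ //       bool ok = j.try_get(matched);    // expected: both items carried key 1, so a pair matched
+ //       (void)ok;
+ //   }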
+
+ template<typename OutputTuple, typename K, typename KHash>
+ class unfolded_join_node<3,key_matching_port,OutputTuple,key_matching<K,KHash> > : public
+ join_base<3,key_matching_port,OutputTuple,key_matching<K,KHash> >::type {
+ typedef typename std::tuple_element<0, OutputTuple>::type T0;
+ typedef typename std::tuple_element<1, OutputTuple>::type T1;
+ typedef typename std::tuple_element<2, OutputTuple>::type T2;
+ public:
+ typedef typename wrap_key_tuple_elements<3,key_matching_port,key_matching<K,KHash>,OutputTuple>::type input_ports_type;
+ typedef OutputTuple output_type;
+ private:
+ typedef join_node_base<key_matching<K,KHash>, input_ports_type, output_type > base_type;
+ typedef type_to_key_function_body<T0, K> *f0_p;
+ typedef type_to_key_function_body<T1, K> *f1_p;
+ typedef type_to_key_function_body<T2, K> *f2_p;
+ typedef std::tuple< f0_p, f1_p, f2_p > func_initializer_type;
+ public:
+#if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING
+ unfolded_join_node(graph &g) : base_type(g,
+ func_initializer_type(
+ new type_to_key_function_body_leaf<T0, K, key_from_message_body<K,T0> >(key_from_message_body<K,T0>()),
+ new type_to_key_function_body_leaf<T1, K, key_from_message_body<K,T1> >(key_from_message_body<K,T1>()),
+ new type_to_key_function_body_leaf<T2, K, key_from_message_body<K,T2> >(key_from_message_body<K,T2>())
+ ) ) {
+ }
+#endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */
+ template<typename Body0, typename Body1, typename Body2>
+ unfolded_join_node(graph &g, Body0 body0, Body1 body1, Body2 body2) : base_type(g,
+ func_initializer_type(
+ new type_to_key_function_body_leaf<T0, K, Body0>(body0),
+ new type_to_key_function_body_leaf<T1, K, Body1>(body1),
+ new type_to_key_function_body_leaf<T2, K, Body2>(body2)
+ ) ) {
+ static_assert(std::tuple_size<OutputTuple>::value == 3, "wrong number of body initializers");
+ }
+ unfolded_join_node(const unfolded_join_node &other) : base_type(other) {}
+ };
+
+ template<typename OutputTuple, typename K, typename KHash>
+ class unfolded_join_node<4,key_matching_port,OutputTuple,key_matching<K,KHash> > : public
+ join_base<4,key_matching_port,OutputTuple,key_matching<K,KHash> >::type {
+ typedef typename std::tuple_element<0, OutputTuple>::type T0;
+ typedef typename std::tuple_element<1, OutputTuple>::type T1;
+ typedef typename std::tuple_element<2, OutputTuple>::type T2;
+ typedef typename std::tuple_element<3, OutputTuple>::type T3;
+ public:
+ typedef typename wrap_key_tuple_elements<4,key_matching_port,key_matching<K,KHash>,OutputTuple>::type input_ports_type;
+ typedef OutputTuple output_type;
+ private:
+ typedef join_node_base<key_matching<K,KHash>, input_ports_type, output_type > base_type;
+ typedef type_to_key_function_body<T0, K> *f0_p;
+ typedef type_to_key_function_body<T1, K> *f1_p;
+ typedef type_to_key_function_body<T2, K> *f2_p;
+ typedef type_to_key_function_body<T3, K> *f3_p;
+ typedef std::tuple< f0_p, f1_p, f2_p, f3_p > func_initializer_type;
+ public:
+#if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING
+ unfolded_join_node(graph &g) : base_type(g,
+ func_initializer_type(
+ new type_to_key_function_body_leaf<T0, K, key_from_message_body<K,T0> >(key_from_message_body<K,T0>()),
+ new type_to_key_function_body_leaf<T1, K, key_from_message_body<K,T1> >(key_from_message_body<K,T1>()),
+ new type_to_key_function_body_leaf<T2, K, key_from_message_body<K,T2> >(key_from_message_body<K,T2>()),
+ new type_to_key_function_body_leaf<T3, K, key_from_message_body<K,T3> >(key_from_message_body<K,T3>())
+ ) ) {
+ }
+#endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */
+ template<typename Body0, typename Body1, typename Body2, typename Body3>
+ unfolded_join_node(graph &g, Body0 body0, Body1 body1, Body2 body2, Body3 body3) : base_type(g,
+ func_initializer_type(
+ new type_to_key_function_body_leaf<T0, K, Body0>(body0),
+ new type_to_key_function_body_leaf<T1, K, Body1>(body1),
+ new type_to_key_function_body_leaf<T2, K, Body2>(body2),
+ new type_to_key_function_body_leaf<T3, K, Body3>(body3)
+ ) ) {
+ static_assert(std::tuple_size<OutputTuple>::value == 4, "wrong number of body initializers");
+ }
+ unfolded_join_node(const unfolded_join_node &other) : base_type(other) {}
+ };
+
+ template<typename OutputTuple, typename K, typename KHash>
+ class unfolded_join_node<5,key_matching_port,OutputTuple,key_matching<K,KHash> > : public
+ join_base<5,key_matching_port,OutputTuple,key_matching<K,KHash> >::type {
+ typedef typename std::tuple_element<0, OutputTuple>::type T0;
+ typedef typename std::tuple_element<1, OutputTuple>::type T1;
+ typedef typename std::tuple_element<2, OutputTuple>::type T2;
+ typedef typename std::tuple_element<3, OutputTuple>::type T3;
+ typedef typename std::tuple_element<4, OutputTuple>::type T4;
+ public:
+ typedef typename wrap_key_tuple_elements<5,key_matching_port,key_matching<K,KHash>,OutputTuple>::type input_ports_type;
+ typedef OutputTuple output_type;
+ private:
+ typedef join_node_base<key_matching<K,KHash> , input_ports_type, output_type > base_type;
+ typedef type_to_key_function_body<T0, K> *f0_p;
+ typedef type_to_key_function_body<T1, K> *f1_p;
+ typedef type_to_key_function_body<T2, K> *f2_p;
+ typedef type_to_key_function_body<T3, K> *f3_p;
+ typedef type_to_key_function_body<T4, K> *f4_p;
+ typedef std::tuple< f0_p, f1_p, f2_p, f3_p, f4_p > func_initializer_type;
+ public:
+#if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING
+ unfolded_join_node(graph &g) : base_type(g,
+ func_initializer_type(
+ new type_to_key_function_body_leaf<T0, K, key_from_message_body<K,T0> >(key_from_message_body<K,T0>()),
+ new type_to_key_function_body_leaf<T1, K, key_from_message_body<K,T1> >(key_from_message_body<K,T1>()),
+ new type_to_key_function_body_leaf<T2, K, key_from_message_body<K,T2> >(key_from_message_body<K,T2>()),
+ new type_to_key_function_body_leaf<T3, K, key_from_message_body<K,T3> >(key_from_message_body<K,T3>()),
+ new type_to_key_function_body_leaf<T4, K, key_from_message_body<K,T4> >(key_from_message_body<K,T4>())
+ ) ) {
+ }
+#endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */
+ template<typename Body0, typename Body1, typename Body2, typename Body3, typename Body4>
+ unfolded_join_node(graph &g, Body0 body0, Body1 body1, Body2 body2, Body3 body3, Body4 body4) : base_type(g,
+ func_initializer_type(
+ new type_to_key_function_body_leaf<T0, K, Body0>(body0),
+ new type_to_key_function_body_leaf<T1, K, Body1>(body1),
+ new type_to_key_function_body_leaf<T2, K, Body2>(body2),
+ new type_to_key_function_body_leaf<T3, K, Body3>(body3),
+ new type_to_key_function_body_leaf<T4, K, Body4>(body4)
+ ) ) {
+ static_assert(std::tuple_size<OutputTuple>::value == 5, "wrong number of body initializers");
+ }
+ unfolded_join_node(const unfolded_join_node &other) : base_type(other) {}
+ };
+
+#if __TBB_VARIADIC_MAX >= 6
+ template<typename OutputTuple, typename K, typename KHash>
+ class unfolded_join_node<6,key_matching_port,OutputTuple,key_matching<K,KHash> > : public
+ join_base<6,key_matching_port,OutputTuple,key_matching<K,KHash> >::type {
+ typedef typename std::tuple_element<0, OutputTuple>::type T0;
+ typedef typename std::tuple_element<1, OutputTuple>::type T1;
+ typedef typename std::tuple_element<2, OutputTuple>::type T2;
+ typedef typename std::tuple_element<3, OutputTuple>::type T3;
+ typedef typename std::tuple_element<4, OutputTuple>::type T4;
+ typedef typename std::tuple_element<5, OutputTuple>::type T5;
+ public:
+ typedef typename wrap_key_tuple_elements<6,key_matching_port,key_matching<K,KHash>,OutputTuple>::type input_ports_type;
+ typedef OutputTuple output_type;
+ private:
+ typedef join_node_base<key_matching<K,KHash> , input_ports_type, output_type > base_type;
+ typedef type_to_key_function_body<T0, K> *f0_p;
+ typedef type_to_key_function_body<T1, K> *f1_p;
+ typedef type_to_key_function_body<T2, K> *f2_p;
+ typedef type_to_key_function_body<T3, K> *f3_p;
+ typedef type_to_key_function_body<T4, K> *f4_p;
+ typedef type_to_key_function_body<T5, K> *f5_p;
+ typedef std::tuple< f0_p, f1_p, f2_p, f3_p, f4_p, f5_p > func_initializer_type;
+ public:
+#if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING
+ unfolded_join_node(graph &g) : base_type(g,
+ func_initializer_type(
+ new type_to_key_function_body_leaf<T0, K, key_from_message_body<K,T0> >(key_from_message_body<K,T0>()),
+ new type_to_key_function_body_leaf<T1, K, key_from_message_body<K,T1> >(key_from_message_body<K,T1>()),
+ new type_to_key_function_body_leaf<T2, K, key_from_message_body<K,T2> >(key_from_message_body<K,T2>()),
+ new type_to_key_function_body_leaf<T3, K, key_from_message_body<K,T3> >(key_from_message_body<K,T3>()),
+ new type_to_key_function_body_leaf<T4, K, key_from_message_body<K,T4> >(key_from_message_body<K,T4>()),
+ new type_to_key_function_body_leaf<T5, K, key_from_message_body<K,T5> >(key_from_message_body<K,T5>())
+ ) ) {
+ }
+#endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */
+ template<typename Body0, typename Body1, typename Body2, typename Body3, typename Body4, typename Body5>
+ unfolded_join_node(graph &g, Body0 body0, Body1 body1, Body2 body2, Body3 body3, Body4 body4, Body5 body5)
+ : base_type(g, func_initializer_type(
+ new type_to_key_function_body_leaf<T0, K, Body0>(body0),
+ new type_to_key_function_body_leaf<T1, K, Body1>(body1),
+ new type_to_key_function_body_leaf<T2, K, Body2>(body2),
+ new type_to_key_function_body_leaf<T3, K, Body3>(body3),
+ new type_to_key_function_body_leaf<T4, K, Body4>(body4),
+ new type_to_key_function_body_leaf<T5, K, Body5>(body5)
+ ) ) {
+ static_assert(std::tuple_size<OutputTuple>::value == 6, "wrong number of body initializers");
+ }
+ unfolded_join_node(const unfolded_join_node &other) : base_type(other) {}
+ };
+#endif
+
+#if __TBB_VARIADIC_MAX >= 7
+ template<typename OutputTuple, typename K, typename KHash>
+ class unfolded_join_node<7,key_matching_port,OutputTuple,key_matching<K,KHash> > : public
+ join_base<7,key_matching_port,OutputTuple,key_matching<K,KHash> >::type {
+ typedef typename std::tuple_element<0, OutputTuple>::type T0;
+ typedef typename std::tuple_element<1, OutputTuple>::type T1;
+ typedef typename std::tuple_element<2, OutputTuple>::type T2;
+ typedef typename std::tuple_element<3, OutputTuple>::type T3;
+ typedef typename std::tuple_element<4, OutputTuple>::type T4;
+ typedef typename std::tuple_element<5, OutputTuple>::type T5;
+ typedef typename std::tuple_element<6, OutputTuple>::type T6;
+ public:
+ typedef typename wrap_key_tuple_elements<7,key_matching_port,key_matching<K,KHash>,OutputTuple>::type input_ports_type;
+ typedef OutputTuple output_type;
+ private:
+ typedef join_node_base<key_matching<K,KHash> , input_ports_type, output_type > base_type;
+ typedef type_to_key_function_body<T0, K> *f0_p;
+ typedef type_to_key_function_body<T1, K> *f1_p;
+ typedef type_to_key_function_body<T2, K> *f2_p;
+ typedef type_to_key_function_body<T3, K> *f3_p;
+ typedef type_to_key_function_body<T4, K> *f4_p;
+ typedef type_to_key_function_body<T5, K> *f5_p;
+ typedef type_to_key_function_body<T6, K> *f6_p;
+ typedef std::tuple< f0_p, f1_p, f2_p, f3_p, f4_p, f5_p, f6_p > func_initializer_type;
+ public:
+#if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING
+ unfolded_join_node(graph &g) : base_type(g,
+ func_initializer_type(
+ new type_to_key_function_body_leaf<T0, K, key_from_message_body<K,T0> >(key_from_message_body<K,T0>()),
+ new type_to_key_function_body_leaf<T1, K, key_from_message_body<K,T1> >(key_from_message_body<K,T1>()),
+ new type_to_key_function_body_leaf<T2, K, key_from_message_body<K,T2> >(key_from_message_body<K,T2>()),
+ new type_to_key_function_body_leaf<T3, K, key_from_message_body<K,T3> >(key_from_message_body<K,T3>()),
+ new type_to_key_function_body_leaf<T4, K, key_from_message_body<K,T4> >(key_from_message_body<K,T4>()),
+ new type_to_key_function_body_leaf<T5, K, key_from_message_body<K,T5> >(key_from_message_body<K,T5>()),
+ new type_to_key_function_body_leaf<T6, K, key_from_message_body<K,T6> >(key_from_message_body<K,T6>())
+ ) ) {
+ }
+#endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */
+ template<typename Body0, typename Body1, typename Body2, typename Body3, typename Body4,
+ typename Body5, typename Body6>
+ unfolded_join_node(graph &g, Body0 body0, Body1 body1, Body2 body2, Body3 body3, Body4 body4,
+ Body5 body5, Body6 body6) : base_type(g, func_initializer_type(
+ new type_to_key_function_body_leaf<T0, K, Body0>(body0),
+ new type_to_key_function_body_leaf<T1, K, Body1>(body1),
+ new type_to_key_function_body_leaf<T2, K, Body2>(body2),
+ new type_to_key_function_body_leaf<T3, K, Body3>(body3),
+ new type_to_key_function_body_leaf<T4, K, Body4>(body4),
+ new type_to_key_function_body_leaf<T5, K, Body5>(body5),
+ new type_to_key_function_body_leaf<T6, K, Body6>(body6)
+ ) ) {
+ static_assert(std::tuple_size<OutputTuple>::value == 7, "wrong number of body initializers");
+ }
+ unfolded_join_node(const unfolded_join_node &other) : base_type(other) {}
+ };
+#endif
+
+#if __TBB_VARIADIC_MAX >= 8
+ template<typename OutputTuple, typename K, typename KHash>
+ class unfolded_join_node<8,key_matching_port,OutputTuple,key_matching<K,KHash> > : public
+ join_base<8,key_matching_port,OutputTuple,key_matching<K,KHash> >::type {
+ typedef typename std::tuple_element<0, OutputTuple>::type T0;
+ typedef typename std::tuple_element<1, OutputTuple>::type T1;
+ typedef typename std::tuple_element<2, OutputTuple>::type T2;
+ typedef typename std::tuple_element<3, OutputTuple>::type T3;
+ typedef typename std::tuple_element<4, OutputTuple>::type T4;
+ typedef typename std::tuple_element<5, OutputTuple>::type T5;
+ typedef typename std::tuple_element<6, OutputTuple>::type T6;
+ typedef typename std::tuple_element<7, OutputTuple>::type T7;
+ public:
+ typedef typename wrap_key_tuple_elements<8,key_matching_port,key_matching<K,KHash>,OutputTuple>::type input_ports_type;
+ typedef OutputTuple output_type;
+ private:
+ typedef join_node_base<key_matching<K,KHash> , input_ports_type, output_type > base_type;
+ typedef type_to_key_function_body<T0, K> *f0_p;
+ typedef type_to_key_function_body<T1, K> *f1_p;
+ typedef type_to_key_function_body<T2, K> *f2_p;
+ typedef type_to_key_function_body<T3, K> *f3_p;
+ typedef type_to_key_function_body<T4, K> *f4_p;
+ typedef type_to_key_function_body<T5, K> *f5_p;
+ typedef type_to_key_function_body<T6, K> *f6_p;
+ typedef type_to_key_function_body<T7, K> *f7_p;
+ typedef std::tuple< f0_p, f1_p, f2_p, f3_p, f4_p, f5_p, f6_p, f7_p > func_initializer_type;
+ public:
+#if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING
+ unfolded_join_node(graph &g) : base_type(g,
+ func_initializer_type(
+ new type_to_key_function_body_leaf<T0, K, key_from_message_body<K,T0> >(key_from_message_body<K,T0>()),
+ new type_to_key_function_body_leaf<T1, K, key_from_message_body<K,T1> >(key_from_message_body<K,T1>()),
+ new type_to_key_function_body_leaf<T2, K, key_from_message_body<K,T2> >(key_from_message_body<K,T2>()),
+ new type_to_key_function_body_leaf<T3, K, key_from_message_body<K,T3> >(key_from_message_body<K,T3>()),
+ new type_to_key_function_body_leaf<T4, K, key_from_message_body<K,T4> >(key_from_message_body<K,T4>()),
+ new type_to_key_function_body_leaf<T5, K, key_from_message_body<K,T5> >(key_from_message_body<K,T5>()),
+ new type_to_key_function_body_leaf<T6, K, key_from_message_body<K,T6> >(key_from_message_body<K,T6>()),
+ new type_to_key_function_body_leaf<T7, K, key_from_message_body<K,T7> >(key_from_message_body<K,T7>())
+ ) ) {
+ }
+#endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */
+ template<typename Body0, typename Body1, typename Body2, typename Body3, typename Body4,
+ typename Body5, typename Body6, typename Body7>
+ unfolded_join_node(graph &g, Body0 body0, Body1 body1, Body2 body2, Body3 body3, Body4 body4,
+ Body5 body5, Body6 body6, Body7 body7) : base_type(g, func_initializer_type(
+ new type_to_key_function_body_leaf<T0, K, Body0>(body0),
+ new type_to_key_function_body_leaf<T1, K, Body1>(body1),
+ new type_to_key_function_body_leaf<T2, K, Body2>(body2),
+ new type_to_key_function_body_leaf<T3, K, Body3>(body3),
+ new type_to_key_function_body_leaf<T4, K, Body4>(body4),
+ new type_to_key_function_body_leaf<T5, K, Body5>(body5),
+ new type_to_key_function_body_leaf<T6, K, Body6>(body6),
+ new type_to_key_function_body_leaf<T7, K, Body7>(body7)
+ ) ) {
+ static_assert(std::tuple_size<OutputTuple>::value == 8, "wrong number of body initializers");
+ }
+ unfolded_join_node(const unfolded_join_node &other) : base_type(other) {}
+ };
+#endif
+
+#if __TBB_VARIADIC_MAX >= 9
+ template<typename OutputTuple, typename K, typename KHash>
+ class unfolded_join_node<9,key_matching_port,OutputTuple,key_matching<K,KHash> > : public
+ join_base<9,key_matching_port,OutputTuple,key_matching<K,KHash> >::type {
+ typedef typename std::tuple_element<0, OutputTuple>::type T0;
+ typedef typename std::tuple_element<1, OutputTuple>::type T1;
+ typedef typename std::tuple_element<2, OutputTuple>::type T2;
+ typedef typename std::tuple_element<3, OutputTuple>::type T3;
+ typedef typename std::tuple_element<4, OutputTuple>::type T4;
+ typedef typename std::tuple_element<5, OutputTuple>::type T5;
+ typedef typename std::tuple_element<6, OutputTuple>::type T6;
+ typedef typename std::tuple_element<7, OutputTuple>::type T7;
+ typedef typename std::tuple_element<8, OutputTuple>::type T8;
+ public:
+ typedef typename wrap_key_tuple_elements<9,key_matching_port,key_matching<K,KHash>,OutputTuple>::type input_ports_type;
+ typedef OutputTuple output_type;
+ private:
+ typedef join_node_base<key_matching<K,KHash> , input_ports_type, output_type > base_type;
+ typedef type_to_key_function_body<T0, K> *f0_p;
+ typedef type_to_key_function_body<T1, K> *f1_p;
+ typedef type_to_key_function_body<T2, K> *f2_p;
+ typedef type_to_key_function_body<T3, K> *f3_p;
+ typedef type_to_key_function_body<T4, K> *f4_p;
+ typedef type_to_key_function_body<T5, K> *f5_p;
+ typedef type_to_key_function_body<T6, K> *f6_p;
+ typedef type_to_key_function_body<T7, K> *f7_p;
+ typedef type_to_key_function_body<T8, K> *f8_p;
+ typedef std::tuple< f0_p, f1_p, f2_p, f3_p, f4_p, f5_p, f6_p, f7_p, f8_p > func_initializer_type;
+ public:
+#if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING
+ unfolded_join_node(graph &g) : base_type(g,
+ func_initializer_type(
+ new type_to_key_function_body_leaf<T0, K, key_from_message_body<K,T0> >(key_from_message_body<K,T0>()),
+ new type_to_key_function_body_leaf<T1, K, key_from_message_body<K,T1> >(key_from_message_body<K,T1>()),
+ new type_to_key_function_body_leaf<T2, K, key_from_message_body<K,T2> >(key_from_message_body<K,T2>()),
+ new type_to_key_function_body_leaf<T3, K, key_from_message_body<K,T3> >(key_from_message_body<K,T3>()),
+ new type_to_key_function_body_leaf<T4, K, key_from_message_body<K,T4> >(key_from_message_body<K,T4>()),
+ new type_to_key_function_body_leaf<T5, K, key_from_message_body<K,T5> >(key_from_message_body<K,T5>()),
+ new type_to_key_function_body_leaf<T6, K, key_from_message_body<K,T6> >(key_from_message_body<K,T6>()),
+ new type_to_key_function_body_leaf<T7, K, key_from_message_body<K,T7> >(key_from_message_body<K,T7>()),
+ new type_to_key_function_body_leaf<T8, K, key_from_message_body<K,T8> >(key_from_message_body<K,T8>())
+ ) ) {
+ }
+#endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */
+ template<typename Body0, typename Body1, typename Body2, typename Body3, typename Body4,
+ typename Body5, typename Body6, typename Body7, typename Body8>
+ unfolded_join_node(graph &g, Body0 body0, Body1 body1, Body2 body2, Body3 body3, Body4 body4,
+ Body5 body5, Body6 body6, Body7 body7, Body8 body8) : base_type(g, func_initializer_type(
+ new type_to_key_function_body_leaf<T0, K, Body0>(body0),
+ new type_to_key_function_body_leaf<T1, K, Body1>(body1),
+ new type_to_key_function_body_leaf<T2, K, Body2>(body2),
+ new type_to_key_function_body_leaf<T3, K, Body3>(body3),
+ new type_to_key_function_body_leaf<T4, K, Body4>(body4),
+ new type_to_key_function_body_leaf<T5, K, Body5>(body5),
+ new type_to_key_function_body_leaf<T6, K, Body6>(body6),
+ new type_to_key_function_body_leaf<T7, K, Body7>(body7),
+ new type_to_key_function_body_leaf<T8, K, Body8>(body8)
+ ) ) {
+ static_assert(std::tuple_size<OutputTuple>::value == 9, "wrong number of body initializers");
+ }
+ unfolded_join_node(const unfolded_join_node &other) : base_type(other) {}
+ };
+#endif
+
+#if __TBB_VARIADIC_MAX >= 10
+ template<typename OutputTuple, typename K, typename KHash>
+ class unfolded_join_node<10,key_matching_port,OutputTuple,key_matching<K,KHash> > : public
+ join_base<10,key_matching_port,OutputTuple,key_matching<K,KHash> >::type {
+ typedef typename std::tuple_element<0, OutputTuple>::type T0;
+ typedef typename std::tuple_element<1, OutputTuple>::type T1;
+ typedef typename std::tuple_element<2, OutputTuple>::type T2;
+ typedef typename std::tuple_element<3, OutputTuple>::type T3;
+ typedef typename std::tuple_element<4, OutputTuple>::type T4;
+ typedef typename std::tuple_element<5, OutputTuple>::type T5;
+ typedef typename std::tuple_element<6, OutputTuple>::type T6;
+ typedef typename std::tuple_element<7, OutputTuple>::type T7;
+ typedef typename std::tuple_element<8, OutputTuple>::type T8;
+ typedef typename std::tuple_element<9, OutputTuple>::type T9;
+ public:
+ typedef typename wrap_key_tuple_elements<10,key_matching_port,key_matching<K,KHash>,OutputTuple>::type input_ports_type;
+ typedef OutputTuple output_type;
+ private:
+ typedef join_node_base<key_matching<K,KHash> , input_ports_type, output_type > base_type;
+ typedef type_to_key_function_body<T0, K> *f0_p;
+ typedef type_to_key_function_body<T1, K> *f1_p;
+ typedef type_to_key_function_body<T2, K> *f2_p;
+ typedef type_to_key_function_body<T3, K> *f3_p;
+ typedef type_to_key_function_body<T4, K> *f4_p;
+ typedef type_to_key_function_body<T5, K> *f5_p;
+ typedef type_to_key_function_body<T6, K> *f6_p;
+ typedef type_to_key_function_body<T7, K> *f7_p;
+ typedef type_to_key_function_body<T8, K> *f8_p;
+ typedef type_to_key_function_body<T9, K> *f9_p;
+ typedef std::tuple< f0_p, f1_p, f2_p, f3_p, f4_p, f5_p, f6_p, f7_p, f8_p, f9_p > func_initializer_type;
+ public:
+#if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING
+ unfolded_join_node(graph &g) : base_type(g,
+ func_initializer_type(
+ new type_to_key_function_body_leaf<T0, K, key_from_message_body<K,T0> >(key_from_message_body<K,T0>()),
+ new type_to_key_function_body_leaf<T1, K, key_from_message_body<K,T1> >(key_from_message_body<K,T1>()),
+ new type_to_key_function_body_leaf<T2, K, key_from_message_body<K,T2> >(key_from_message_body<K,T2>()),
+ new type_to_key_function_body_leaf<T3, K, key_from_message_body<K,T3> >(key_from_message_body<K,T3>()),
+ new type_to_key_function_body_leaf<T4, K, key_from_message_body<K,T4> >(key_from_message_body<K,T4>()),
+ new type_to_key_function_body_leaf<T5, K, key_from_message_body<K,T5> >(key_from_message_body<K,T5>()),
+ new type_to_key_function_body_leaf<T6, K, key_from_message_body<K,T6> >(key_from_message_body<K,T6>()),
+ new type_to_key_function_body_leaf<T7, K, key_from_message_body<K,T7> >(key_from_message_body<K,T7>()),
+ new type_to_key_function_body_leaf<T8, K, key_from_message_body<K,T8> >(key_from_message_body<K,T8>()),
+ new type_to_key_function_body_leaf<T9, K, key_from_message_body<K,T9> >(key_from_message_body<K,T9>())
+ ) ) {
+ }
+#endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */
+ template<typename Body0, typename Body1, typename Body2, typename Body3, typename Body4,
+ typename Body5, typename Body6, typename Body7, typename Body8, typename Body9>
+ unfolded_join_node(graph &g, Body0 body0, Body1 body1, Body2 body2, Body3 body3, Body4 body4,
+ Body5 body5, Body6 body6, Body7 body7, Body8 body8, Body9 body9) : base_type(g, func_initializer_type(
+ new type_to_key_function_body_leaf<T0, K, Body0>(body0),
+ new type_to_key_function_body_leaf<T1, K, Body1>(body1),
+ new type_to_key_function_body_leaf<T2, K, Body2>(body2),
+ new type_to_key_function_body_leaf<T3, K, Body3>(body3),
+ new type_to_key_function_body_leaf<T4, K, Body4>(body4),
+ new type_to_key_function_body_leaf<T5, K, Body5>(body5),
+ new type_to_key_function_body_leaf<T6, K, Body6>(body6),
+ new type_to_key_function_body_leaf<T7, K, Body7>(body7),
+ new type_to_key_function_body_leaf<T8, K, Body8>(body8),
+ new type_to_key_function_body_leaf<T9, K, Body9>(body9)
+ ) ) {
+ static_assert(std::tuple_size<OutputTuple>::value == 10, "wrong number of body initializers");
+ }
+ unfolded_join_node(const unfolded_join_node &other) : base_type(other) {}
+ };
+#endif
+
+ //! templated function to refer to input ports of the join node
+ template<size_t N, typename JNT>
+ typename std::tuple_element<N, typename JNT::input_ports_type>::type &input_port(JNT &jn) {
+ return std::get<N>(jn.input_ports());
+ }
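+ // For example (public API, illustrative only), edges are wired to individual join_node
+ // inputs through this helper:
+ //
+ //   using namespace oneapi::tbb::flow;
+ //   graph g;
+ //   broadcast_node<int>   ints(g);
+ //   broadcast_node<float> floats(g);
+ //   join_node<std::tuple<int, float>> j(g);        // queueing policy by default
+ //   make_edge(ints,   input_port<0>(j));
+ //   make_edge(floats, input_port<1>(j));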
+
+#endif // __TBB__flow_graph_join_impl_H
diff --git a/contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_node_impl.h b/contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_node_impl.h
new file mode 100644
index 0000000000..aca465d088
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_node_impl.h
@@ -0,0 +1,769 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB__flow_graph_node_impl_H
+#define __TBB__flow_graph_node_impl_H
+
+#ifndef __TBB_flow_graph_H
+#error Do not #include this internal file directly; use public TBB headers instead.
+#endif
+
+#include "_flow_graph_item_buffer_impl.h"
+
+template< typename T, typename A >
+class function_input_queue : public item_buffer<T,A> {
+public:
+ bool empty() const {
+ return this->buffer_empty();
+ }
+
+ const T& front() const {
+ return this->item_buffer<T, A>::front();
+ }
+
+ void pop() {
+ this->destroy_front();
+ }
+
+ bool push( T& t ) {
+ return this->push_back( t );
+ }
+};
+
+//! Input and scheduling for a function node that takes a type Input as input
+// The only upward reference into the derived ImplType is apply_body_impl_bypass, which
+// implements the body call and any handling of the result.
+template< typename Input, typename Policy, typename A, typename ImplType >
+class function_input_base : public receiver<Input>, no_assign {
+ enum op_type {reg_pred, rem_pred, try_fwd, tryput_bypass, app_body_bypass, occupy_concurrency
+ };
+ typedef function_input_base<Input, Policy, A, ImplType> class_type;
+
+public:
+
+ //! The input type of this receiver
+ typedef Input input_type;
+ typedef typename receiver<input_type>::predecessor_type predecessor_type;
+ typedef predecessor_cache<input_type, null_mutex > predecessor_cache_type;
+ typedef function_input_queue<input_type, A> input_queue_type;
+ typedef typename allocator_traits<A>::template rebind_alloc<input_queue_type> allocator_type;
+ static_assert(!has_policy<queueing, Policy>::value || !has_policy<rejecting, Policy>::value, "");
+
+ //! Constructor for function_input_base
+ function_input_base( graph &g, size_t max_concurrency, node_priority_t a_priority )
+ : my_graph_ref(g), my_max_concurrency(max_concurrency)
+ , my_concurrency(0), my_priority(a_priority)
+ , my_queue(!has_policy<rejecting, Policy>::value ? new input_queue_type() : NULL)
+ , my_predecessors(this)
+ , forwarder_busy(false)
+ {
+ my_aggregator.initialize_handler(handler_type(this));
+ }
+
+ //! Copy constructor
+ function_input_base( const function_input_base& src )
+ : function_input_base(src.my_graph_ref, src.my_max_concurrency, src.my_priority) {}
+
+ //! Destructor
+ // The queue is allocated by the constructor for {multi}function_node.
+ // TODO: pass the graph_buffer_policy to the base so it can allocate the queue instead.
+ // This would be an interface-breaking change.
+ virtual ~function_input_base() {
+ if ( my_queue ) delete my_queue;
+ }
+
+ graph_task* try_put_task( const input_type& t) override {
+ return try_put_task_impl(t, has_policy<lightweight, Policy>());
+ }
+
+ //! Adds src to the list of cached predecessors.
+ bool register_predecessor( predecessor_type &src ) override {
+ operation_type op_data(reg_pred);
+ op_data.r = &src;
+ my_aggregator.execute(&op_data);
+ return true;
+ }
+
+ //! Removes src from the list of cached predecessors.
+ bool remove_predecessor( predecessor_type &src ) override {
+ operation_type op_data(rem_pred);
+ op_data.r = &src;
+ my_aggregator.execute(&op_data);
+ return true;
+ }
+
+protected:
+
+ void reset_function_input_base( reset_flags f) {
+ my_concurrency = 0;
+ if(my_queue) {
+ my_queue->reset();
+ }
+ reset_receiver(f);
+ forwarder_busy = false;
+ }
+
+ graph& my_graph_ref;
+ const size_t my_max_concurrency;
+ size_t my_concurrency;
+ node_priority_t my_priority;
+ input_queue_type *my_queue;
+ predecessor_cache<input_type, null_mutex > my_predecessors;
+
+ void reset_receiver( reset_flags f) {
+ if( f & rf_clear_edges) my_predecessors.clear();
+ else
+ my_predecessors.reset();
+ __TBB_ASSERT(!(f & rf_clear_edges) || my_predecessors.empty(), "function_input_base reset failed");
+ }
+
+ graph& graph_reference() const override {
+ return my_graph_ref;
+ }
+
+ graph_task* try_get_postponed_task(const input_type& i) {
+ operation_type op_data(i, app_body_bypass); // tries to pop an item or get_item
+ my_aggregator.execute(&op_data);
+ return op_data.bypass_t;
+ }
+
+private:
+
+ friend class apply_body_task_bypass< class_type, input_type >;
+ friend class forward_task_bypass< class_type >;
+
+ class operation_type : public aggregated_operation< operation_type > {
+ public:
+ char type;
+ union {
+ input_type *elem;
+ predecessor_type *r;
+ };
+ graph_task* bypass_t;
+ operation_type(const input_type& e, op_type t) :
+ type(char(t)), elem(const_cast<input_type*>(&e)) {}
+ operation_type(op_type t) : type(char(t)), r(NULL) {}
+ };
+
+ bool forwarder_busy;
+ typedef aggregating_functor<class_type, operation_type> handler_type;
+ friend class aggregating_functor<class_type, operation_type>;
+ aggregator< handler_type, operation_type > my_aggregator;
+
+ graph_task* perform_queued_requests() {
+ graph_task* new_task = NULL;
+ if(my_queue) {
+ if(!my_queue->empty()) {
+ ++my_concurrency;
+ new_task = create_body_task(my_queue->front());
+
+ my_queue->pop();
+ }
+ }
+ else {
+ input_type i;
+ if(my_predecessors.get_item(i)) {
+ ++my_concurrency;
+ new_task = create_body_task(i);
+ }
+ }
+ return new_task;
+ }
+ void handle_operations(operation_type *op_list) {
+ operation_type* tmp;
+ while (op_list) {
+ tmp = op_list;
+ op_list = op_list->next;
+ switch (tmp->type) {
+ case reg_pred:
+ my_predecessors.add(*(tmp->r));
+ tmp->status.store(SUCCEEDED, std::memory_order_release);
+ if (!forwarder_busy) {
+ forwarder_busy = true;
+ spawn_forward_task();
+ }
+ break;
+ case rem_pred:
+ my_predecessors.remove(*(tmp->r));
+ tmp->status.store(SUCCEEDED, std::memory_order_release);
+ break;
+ case app_body_bypass: {
+ tmp->bypass_t = NULL;
+ __TBB_ASSERT(my_max_concurrency != 0, NULL);
+ --my_concurrency;
+ if(my_concurrency<my_max_concurrency)
+ tmp->bypass_t = perform_queued_requests();
+ tmp->status.store(SUCCEEDED, std::memory_order_release);
+ }
+ break;
+ case tryput_bypass: internal_try_put_task(tmp); break;
+ case try_fwd: internal_forward(tmp); break;
+ case occupy_concurrency:
+ if (my_concurrency < my_max_concurrency) {
+ ++my_concurrency;
+ tmp->status.store(SUCCEEDED, std::memory_order_release);
+ } else {
+ tmp->status.store(FAILED, std::memory_order_release);
+ }
+ break;
+ }
+ }
+ }
+
+ //! Put to the node, but return the task instead of enqueueing it
+ void internal_try_put_task(operation_type *op) {
+ __TBB_ASSERT(my_max_concurrency != 0, NULL);
+ if (my_concurrency < my_max_concurrency) {
+ ++my_concurrency;
+ graph_task * new_task = create_body_task(*(op->elem));
+ op->bypass_t = new_task;
+ op->status.store(SUCCEEDED, std::memory_order_release);
+ } else if ( my_queue && my_queue->push(*(op->elem)) ) {
+ op->bypass_t = SUCCESSFULLY_ENQUEUED;
+ op->status.store(SUCCEEDED, std::memory_order_release);
+ } else {
+ op->bypass_t = NULL;
+ op->status.store(FAILED, std::memory_order_release);
+ }
+ }
+
+ //! Creates tasks for postponed messages if available and if concurrency allows
+ void internal_forward(operation_type *op) {
+ op->bypass_t = NULL;
+ if (my_concurrency < my_max_concurrency)
+ op->bypass_t = perform_queued_requests();
+ if(op->bypass_t)
+ op->status.store(SUCCEEDED, std::memory_order_release);
+ else {
+ forwarder_busy = false;
+ op->status.store(FAILED, std::memory_order_release);
+ }
+ }
+
+ graph_task* internal_try_put_bypass( const input_type& t ) {
+ operation_type op_data(t, tryput_bypass);
+ my_aggregator.execute(&op_data);
+ if( op_data.status == SUCCEEDED ) {
+ return op_data.bypass_t;
+ }
+ return NULL;
+ }
+
+ graph_task* try_put_task_impl( const input_type& t, /*lightweight=*/std::true_type ) {
+ if( my_max_concurrency == 0 ) {
+ return apply_body_bypass(t);
+ } else {
+ operation_type check_op(t, occupy_concurrency);
+ my_aggregator.execute(&check_op);
+ if( check_op.status == SUCCEEDED ) {
+ return apply_body_bypass(t);
+ }
+ return internal_try_put_bypass(t);
+ }
+ }
+
+ graph_task* try_put_task_impl( const input_type& t, /*lightweight=*/std::false_type ) {
+ if( my_max_concurrency == 0 ) {
+ return create_body_task(t);
+ } else {
+ return internal_try_put_bypass(t);
+ }
+ }
+
+ //! Applies the body to the provided input
+ // then decides if more work is available
+ graph_task* apply_body_bypass( const input_type &i ) {
+ return static_cast<ImplType *>(this)->apply_body_impl_bypass(i);
+ }
+
+ //! allocates a task to apply a body
+ graph_task* create_body_task( const input_type &input ) {
+ if (!is_graph_active(my_graph_ref)) {
+ return nullptr;
+ }
+ // TODO revamp: extract helper for common graph task allocation part
+ small_object_allocator allocator{};
+ typedef apply_body_task_bypass<class_type, input_type> task_type;
+ graph_task* t = allocator.new_object<task_type>( my_graph_ref, allocator, *this, input, my_priority );
+ graph_reference().reserve_wait();
+ return t;
+ }
+
+ //! This is executed by an enqueued task, the "forwarder"
+ graph_task* forward_task() {
+ operation_type op_data(try_fwd);
+ graph_task* rval = NULL;
+ do {
+ op_data.status = WAIT;
+ my_aggregator.execute(&op_data);
+ if(op_data.status == SUCCEEDED) {
+ graph_task* ttask = op_data.bypass_t;
+ __TBB_ASSERT( ttask && ttask != SUCCESSFULLY_ENQUEUED, NULL );
+ rval = combine_tasks(my_graph_ref, rval, ttask);
+ }
+ } while (op_data.status == SUCCEEDED);
+ return rval;
+ }
+
+ inline graph_task* create_forward_task() {
+ if (!is_graph_active(my_graph_ref)) {
+ return nullptr;
+ }
+ small_object_allocator allocator{};
+ typedef forward_task_bypass<class_type> task_type;
+ graph_task* t = allocator.new_object<task_type>( graph_reference(), allocator, *this, my_priority );
+ graph_reference().reserve_wait();
+ return t;
+ }
+
+ //! Spawns a task that calls forward()
+ inline void spawn_forward_task() {
+ graph_task* tp = create_forward_task();
+ if(tp) {
+ spawn_in_graph_arena(graph_reference(), *tp);
+ }
+ }
+
+ node_priority_t priority() const override { return my_priority; }
+}; // function_input_base
+
+//! Implements methods for a function node that takes a type Input as input and sends
+// a type Output to its successors.
+template< typename Input, typename Output, typename Policy, typename A>
+class function_input : public function_input_base<Input, Policy, A, function_input<Input,Output,Policy,A> > {
+public:
+ typedef Input input_type;
+ typedef Output output_type;
+ typedef function_body<input_type, output_type> function_body_type;
+ typedef function_input<Input, Output, Policy,A> my_class;
+ typedef function_input_base<Input, Policy, A, my_class> base_type;
+ typedef function_input_queue<input_type, A> input_queue_type;
+
+ // constructor
+ template<typename Body>
+ function_input(
+ graph &g, size_t max_concurrency, Body& body, node_priority_t a_priority )
+ : base_type(g, max_concurrency, a_priority)
+ , my_body( new function_body_leaf< input_type, output_type, Body>(body) )
+ , my_init_body( new function_body_leaf< input_type, output_type, Body>(body) ) {
+ }
+
+ //! Copy constructor
+ function_input( const function_input& src ) :
+ base_type(src),
+ my_body( src.my_init_body->clone() ),
+ my_init_body(src.my_init_body->clone() ) {
+ }
+#if __INTEL_COMPILER <= 2021
+    // Suppress a superfluous diagnostic about the missing virtual keyword on the destructor of an
+    // inherited class whose parent class declares the destructor virtual.
+ virtual
+#endif
+ ~function_input() {
+ delete my_body;
+ delete my_init_body;
+ }
+
+ template< typename Body >
+ Body copy_function_object() {
+ function_body_type &body_ref = *this->my_body;
+ return dynamic_cast< function_body_leaf<input_type, output_type, Body> & >(body_ref).get_body();
+ }
+
+ output_type apply_body_impl( const input_type& i) {
+        // There is an extra copy needed to capture the
+ // body execution without the try_put
+ fgt_begin_body( my_body );
+ output_type v = (*my_body)(i);
+ fgt_end_body( my_body );
+ return v;
+ }
+
+ //TODO: consider moving into the base class
+ graph_task* apply_body_impl_bypass( const input_type &i) {
+ output_type v = apply_body_impl(i);
+ graph_task* postponed_task = NULL;
+ if( base_type::my_max_concurrency != 0 ) {
+ postponed_task = base_type::try_get_postponed_task(i);
+ __TBB_ASSERT( !postponed_task || postponed_task != SUCCESSFULLY_ENQUEUED, NULL );
+ }
+ if( postponed_task ) {
+ // make the task available for other workers since we do not know successors'
+ // execution policy
+ spawn_in_graph_arena(base_type::graph_reference(), *postponed_task);
+ }
+ graph_task* successor_task = successors().try_put_task(v);
+#if _MSC_VER && !__INTEL_COMPILER
+#pragma warning (push)
+#pragma warning (disable: 4127) /* suppress conditional expression is constant */
+#endif
+ if(has_policy<lightweight, Policy>::value) {
+#if _MSC_VER && !__INTEL_COMPILER
+#pragma warning (pop)
+#endif
+ if(!successor_task) {
+ // Return confirmative status since current
+ // node's body has been executed anyway
+ successor_task = SUCCESSFULLY_ENQUEUED;
+ }
+ }
+ return successor_task;
+ }
+
+protected:
+
+ void reset_function_input(reset_flags f) {
+ base_type::reset_function_input_base(f);
+ if(f & rf_reset_bodies) {
+ function_body_type *tmp = my_init_body->clone();
+ delete my_body;
+ my_body = tmp;
+ }
+ }
+
+ function_body_type *my_body;
+ function_body_type *my_init_body;
+ virtual broadcast_cache<output_type > &successors() = 0;
+
+}; // function_input
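+// function_input is the engine behind function_node's concurrency throttling, input queueing and
+// lightweight in-place execution. A minimal sketch using only the public API (node names and
+// bodies are illustrative):
+//
+//   #include <oneapi/tbb/flow_graph.h>
+//   #include <iostream>
+//
+//   int main() {
+//       using namespace oneapi::tbb::flow;
+//       graph g;
+//       // 'serial' caps my_concurrency at 1; extra inputs wait in the function_input_queue.
+//       function_node<int, int> square(g, serial, [](int v) { return v * v; });
+//       // A lightweight body may run in the caller's context (see try_put_task_impl above).
+//       function_node<int, int, lightweight> print(g, serial,
+//           [](int v) { std::cout << v << '\n'; return v; });
+//       make_edge(square, print);
+//       for (int i = 0; i < 4; ++i) square.try_put(i);
+//       g.wait_for_all();
+//   }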
+
+
+// helper templates to clear the successor edges of the output ports of a multifunction_node
+template<int N> struct clear_element {
+ template<typename P> static void clear_this(P &p) {
+ (void)std::get<N-1>(p).successors().clear();
+ clear_element<N-1>::clear_this(p);
+ }
+#if TBB_USE_ASSERT
+ template<typename P> static bool this_empty(P &p) {
+ if(std::get<N-1>(p).successors().empty())
+ return clear_element<N-1>::this_empty(p);
+ return false;
+ }
+#endif
+};
+
+template<> struct clear_element<1> {
+ template<typename P> static void clear_this(P &p) {
+ (void)std::get<0>(p).successors().clear();
+ }
+#if TBB_USE_ASSERT
+ template<typename P> static bool this_empty(P &p) {
+ return std::get<0>(p).successors().empty();
+ }
+#endif
+};
+
+template <typename OutputTuple>
+struct init_output_ports {
+ template <typename... Args>
+ static OutputTuple call(graph& g, const std::tuple<Args...>&) {
+ return OutputTuple(Args(g)...);
+ }
+}; // struct init_output_ports
+
+//! Implements methods for a function node that takes a type Input as input
+// and has a tuple of output ports specified.
+template< typename Input, typename OutputPortSet, typename Policy, typename A>
+class multifunction_input : public function_input_base<Input, Policy, A, multifunction_input<Input,OutputPortSet,Policy,A> > {
+public:
+ static const int N = std::tuple_size<OutputPortSet>::value;
+ typedef Input input_type;
+ typedef OutputPortSet output_ports_type;
+ typedef multifunction_body<input_type, output_ports_type> multifunction_body_type;
+ typedef multifunction_input<Input, OutputPortSet, Policy, A> my_class;
+ typedef function_input_base<Input, Policy, A, my_class> base_type;
+ typedef function_input_queue<input_type, A> input_queue_type;
+
+ // constructor
+ template<typename Body>
+ multifunction_input(graph &g, size_t max_concurrency,Body& body, node_priority_t a_priority )
+ : base_type(g, max_concurrency, a_priority)
+ , my_body( new multifunction_body_leaf<input_type, output_ports_type, Body>(body) )
+ , my_init_body( new multifunction_body_leaf<input_type, output_ports_type, Body>(body) )
+ , my_output_ports(init_output_ports<output_ports_type>::call(g, my_output_ports)){
+ }
+
+ //! Copy constructor
+ multifunction_input( const multifunction_input& src ) :
+ base_type(src),
+ my_body( src.my_init_body->clone() ),
+ my_init_body(src.my_init_body->clone() ),
+ my_output_ports( init_output_ports<output_ports_type>::call(src.my_graph_ref, my_output_ports) ) {
+ }
+
+ ~multifunction_input() {
+ delete my_body;
+ delete my_init_body;
+ }
+
+ template< typename Body >
+ Body copy_function_object() {
+ multifunction_body_type &body_ref = *this->my_body;
+ return *static_cast<Body*>(dynamic_cast< multifunction_body_leaf<input_type, output_ports_type, Body> & >(body_ref).get_body_ptr());
+ }
+
+    // For multifunction nodes there is no single successor as such, so we just tell
+    // the task we were successful.
+ //TODO: consider moving common parts with implementation in function_input into separate function
+ graph_task* apply_body_impl_bypass( const input_type &i ) {
+ fgt_begin_body( my_body );
+ (*my_body)(i, my_output_ports);
+ fgt_end_body( my_body );
+ graph_task* ttask = NULL;
+ if(base_type::my_max_concurrency != 0) {
+ ttask = base_type::try_get_postponed_task(i);
+ }
+ return ttask ? ttask : SUCCESSFULLY_ENQUEUED;
+ }
+
+ output_ports_type &output_ports(){ return my_output_ports; }
+
+protected:
+
+ void reset(reset_flags f) {
+ base_type::reset_function_input_base(f);
+ if(f & rf_clear_edges)clear_element<N>::clear_this(my_output_ports);
+ if(f & rf_reset_bodies) {
+ multifunction_body_type* tmp = my_init_body->clone();
+ delete my_body;
+ my_body = tmp;
+ }
+ __TBB_ASSERT(!(f & rf_clear_edges) || clear_element<N>::this_empty(my_output_ports), "multifunction_node reset failed");
+ }
+
+ multifunction_body_type *my_body;
+ multifunction_body_type *my_init_body;
+ output_ports_type my_output_ports;
+
+}; // multifunction_input
+
+// template to refer to an output port of a multifunction_node
+template<size_t N, typename MOP>
+typename std::tuple_element<N, typename MOP::output_ports_type>::type &output_port(MOP &op) {
+ return std::get<N>(op.output_ports());
+}
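+// multifunction_input together with output_port() backs multifunction_node, whose body may put to
+// any subset of its output ports. A sketch using only the public API (the even/odd routing is
+// illustrative):
+//
+//   #include <oneapi/tbb/flow_graph.h>
+//   #include <tuple>
+//
+//   int main() {
+//       using namespace oneapi::tbb::flow;
+//       using splitter_t = multifunction_node<int, std::tuple<int, int>>;
+//       graph g;
+//       splitter_t splitter(g, unlimited,
+//           [](const int& v, splitter_t::output_ports_type& ports) {
+//               if (v % 2 == 0) std::get<0>(ports).try_put(v);   // evens on port 0
+//               else            std::get<1>(ports).try_put(v);   // odds on port 1
+//           });
+//       queue_node<int> evens(g), odds(g);
+//       make_edge(output_port<0>(splitter), evens);
+//       make_edge(output_port<1>(splitter), odds);
+//       for (int i = 0; i < 6; ++i) splitter.try_put(i);
+//       g.wait_for_all();
+//   }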
+
+inline void check_task_and_spawn(graph& g, graph_task* t) {
+ if (t && t != SUCCESSFULLY_ENQUEUED) {
+ spawn_in_graph_arena(g, *t);
+ }
+}
+
+// helper structs for split_node
+template<int N>
+struct emit_element {
+ template<typename T, typename P>
+ static graph_task* emit_this(graph& g, const T &t, P &p) {
+ // TODO: consider to collect all the tasks in task_list and spawn them all at once
+ graph_task* last_task = std::get<N-1>(p).try_put_task(std::get<N-1>(t));
+ check_task_and_spawn(g, last_task);
+ return emit_element<N-1>::emit_this(g,t,p);
+ }
+};
+
+template<>
+struct emit_element<1> {
+ template<typename T, typename P>
+ static graph_task* emit_this(graph& g, const T &t, P &p) {
+ graph_task* last_task = std::get<0>(p).try_put_task(std::get<0>(t));
+ check_task_and_spawn(g, last_task);
+ return SUCCESSFULLY_ENQUEUED;
+ }
+};
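+// emit_element is what split_node uses to broadcast each tuple field on its own output port.
+// A sketch using only the public API:
+//
+//   #include <oneapi/tbb/flow_graph.h>
+//   #include <string>
+//   #include <tuple>
+//
+//   int main() {
+//       using namespace oneapi::tbb::flow;
+//       graph g;
+//       split_node<std::tuple<int, std::string>> s(g);
+//       queue_node<int> numbers(g);
+//       queue_node<std::string> names(g);
+//       make_edge(output_port<0>(s), numbers);   // receives the int element
+//       make_edge(output_port<1>(s), names);     // receives the string element
+//       s.try_put(std::make_tuple(42, std::string("answer")));
+//       g.wait_for_all();
+//   }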
+
+//! Implements methods for an executable node that takes continue_msg as input
+template< typename Output, typename Policy>
+class continue_input : public continue_receiver {
+public:
+
+ //! The input type of this receiver
+ typedef continue_msg input_type;
+
+ //! The output type of this receiver
+ typedef Output output_type;
+ typedef function_body<input_type, output_type> function_body_type;
+ typedef continue_input<output_type, Policy> class_type;
+
+ template< typename Body >
+ continue_input( graph &g, Body& body, node_priority_t a_priority )
+ : continue_receiver(/*number_of_predecessors=*/0, a_priority)
+ , my_graph_ref(g)
+ , my_body( new function_body_leaf< input_type, output_type, Body>(body) )
+ , my_init_body( new function_body_leaf< input_type, output_type, Body>(body) )
+ { }
+
+ template< typename Body >
+ continue_input( graph &g, int number_of_predecessors,
+ Body& body, node_priority_t a_priority )
+ : continue_receiver( number_of_predecessors, a_priority )
+ , my_graph_ref(g)
+ , my_body( new function_body_leaf< input_type, output_type, Body>(body) )
+ , my_init_body( new function_body_leaf< input_type, output_type, Body>(body) )
+ { }
+
+ continue_input( const continue_input& src ) : continue_receiver(src),
+ my_graph_ref(src.my_graph_ref),
+ my_body( src.my_init_body->clone() ),
+ my_init_body( src.my_init_body->clone() ) {}
+
+ ~continue_input() {
+ delete my_body;
+ delete my_init_body;
+ }
+
+ template< typename Body >
+ Body copy_function_object() {
+ function_body_type &body_ref = *my_body;
+ return dynamic_cast< function_body_leaf<input_type, output_type, Body> & >(body_ref).get_body();
+ }
+
+ void reset_receiver( reset_flags f) override {
+ continue_receiver::reset_receiver(f);
+ if(f & rf_reset_bodies) {
+ function_body_type *tmp = my_init_body->clone();
+ delete my_body;
+ my_body = tmp;
+ }
+ }
+
+protected:
+
+ graph& my_graph_ref;
+ function_body_type *my_body;
+ function_body_type *my_init_body;
+
+ virtual broadcast_cache<output_type > &successors() = 0;
+
+ friend class apply_body_task_bypass< class_type, continue_msg >;
+
+ //! Applies the body to the provided input
+ graph_task* apply_body_bypass( input_type ) {
+        // There is an extra copy needed to capture the
+ // body execution without the try_put
+ fgt_begin_body( my_body );
+ output_type v = (*my_body)( continue_msg() );
+ fgt_end_body( my_body );
+ return successors().try_put_task( v );
+ }
+
+ graph_task* execute() override {
+ if(!is_graph_active(my_graph_ref)) {
+ return NULL;
+ }
+#if _MSC_VER && !__INTEL_COMPILER
+#pragma warning (push)
+#pragma warning (disable: 4127) /* suppress conditional expression is constant */
+#endif
+ if(has_policy<lightweight, Policy>::value) {
+#if _MSC_VER && !__INTEL_COMPILER
+#pragma warning (pop)
+#endif
+ return apply_body_bypass( continue_msg() );
+ }
+ else {
+ small_object_allocator allocator{};
+ typedef apply_body_task_bypass<class_type, continue_msg> task_type;
+ graph_task* t = allocator.new_object<task_type>( graph_reference(), allocator, *this, continue_msg(), my_priority );
+ graph_reference().reserve_wait();
+ return t;
+ }
+ }
+
+ graph& graph_reference() const override {
+ return my_graph_ref;
+ }
+}; // continue_input
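+// continue_input is the receiver side of continue_node: it counts continue_msg signals from its
+// predecessors and runs the body once all of them have arrived. A sketch of the corresponding
+// public API (the a/b/c dependency chain is illustrative):
+//
+//   #include <oneapi/tbb/flow_graph.h>
+//   #include <iostream>
+//
+//   int main() {
+//       using namespace oneapi::tbb::flow;
+//       graph g;
+//       broadcast_node<continue_msg> start(g);
+//       continue_node<continue_msg> a(g, [](const continue_msg&) { std::cout << "a\n"; });
+//       continue_node<continue_msg> b(g, [](const continue_msg&) { std::cout << "b\n"; });
+//       continue_node<continue_msg> c(g, [](const continue_msg&) { std::cout << "c\n"; });
+//       make_edge(start, a);
+//       make_edge(start, b);
+//       make_edge(a, c);                 // c fires only after both a and b have signalled it
+//       make_edge(b, c);
+//       start.try_put(continue_msg());
+//       g.wait_for_all();
+//   }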
+
+//! Implements methods for both executable and function nodes that put Output to their successors
+template< typename Output >
+class function_output : public sender<Output> {
+public:
+
+ template<int N> friend struct clear_element;
+ typedef Output output_type;
+ typedef typename sender<output_type>::successor_type successor_type;
+ typedef broadcast_cache<output_type> broadcast_cache_type;
+
+ function_output(graph& g) : my_successors(this), my_graph_ref(g) {}
+ function_output(const function_output& other) = delete;
+
+ //! Adds a new successor to this node
+ bool register_successor( successor_type &r ) override {
+ successors().register_successor( r );
+ return true;
+ }
+
+ //! Removes a successor from this node
+ bool remove_successor( successor_type &r ) override {
+ successors().remove_successor( r );
+ return true;
+ }
+
+ broadcast_cache_type &successors() { return my_successors; }
+
+ graph& graph_reference() const { return my_graph_ref; }
+protected:
+ broadcast_cache_type my_successors;
+ graph& my_graph_ref;
+}; // function_output
+
+template< typename Output >
+class multifunction_output : public function_output<Output> {
+public:
+ typedef Output output_type;
+ typedef function_output<output_type> base_type;
+ using base_type::my_successors;
+
+ multifunction_output(graph& g) : base_type(g) {}
+ multifunction_output(const multifunction_output& other) : base_type(other.my_graph_ref) {}
+
+ bool try_put(const output_type &i) {
+ graph_task *res = try_put_task(i);
+ if( !res ) return false;
+ if( res != SUCCESSFULLY_ENQUEUED ) {
+ // wrapping in task_arena::execute() is not needed since the method is called from
+ // inside task::execute()
+ spawn_in_graph_arena(graph_reference(), *res);
+ }
+ return true;
+ }
+
+ using base_type::graph_reference;
+
+protected:
+
+ graph_task* try_put_task(const output_type &i) {
+ return my_successors.try_put_task(i);
+ }
+
+ template <int N> friend struct emit_element;
+
+}; // multifunction_output
+
+// composite_node
+template<typename CompositeType>
+void add_nodes_impl(CompositeType*, bool) {}
+
+template< typename CompositeType, typename NodeType1, typename... NodeTypes >
+void add_nodes_impl(CompositeType *c_node, bool visible, const NodeType1& n1, const NodeTypes&... n) {
+ void *addr = const_cast<NodeType1 *>(&n1);
+
+ fgt_alias_port(c_node, addr, visible);
+ add_nodes_impl(c_node, visible, n...);
+}
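+// add_nodes_impl is reached through composite_node::add_nodes()/add_visible_nodes(), which expose
+// a composite's inner nodes to tracing tools. A sketch of the usual pattern, assuming the public
+// composite_node API (the adder example is illustrative):
+//
+//   #include <oneapi/tbb/flow_graph.h>
+//   #include <tuple>
+//
+//   using namespace oneapi::tbb::flow;
+//
+//   class adder : public composite_node<std::tuple<int, int>, std::tuple<int>> {
+//       typedef composite_node<std::tuple<int, int>, std::tuple<int>> base_type;
+//       join_node<std::tuple<int, int>, queueing> j;
+//       function_node<std::tuple<int, int>, int> f;
+//   public:
+//       adder(graph& g) : base_type(g), j(g),
+//           f(g, unlimited, [](const std::tuple<int, int>& t) {
+//               return std::get<0>(t) + std::get<1>(t);
+//           }) {
+//           make_edge(j, f);
+//           base_type::set_external_ports(
+//               base_type::input_ports_type(input_port<0>(j), input_port<1>(j)),
+//               base_type::output_ports_type(f));
+//           base_type::add_visible_nodes(j, f);   // routed through add_nodes_impl
+//       }
+//   };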
+
+#endif // __TBB__flow_graph_node_impl_H
diff --git a/contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_node_set_impl.h b/contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_node_set_impl.h
new file mode 100644
index 0000000000..ce867121f9
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_node_set_impl.h
@@ -0,0 +1,265 @@
+/*
+ Copyright (c) 2020-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_flow_graph_node_set_impl_H
+#define __TBB_flow_graph_node_set_impl_H
+
+#ifndef __TBB_flow_graph_H
+#error Do not #include this internal file directly; use public TBB headers instead.
+#endif
+
+// Included in namespace tbb::detail::d1 (in flow_graph.h)
+
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+// Visual Studio 2019 reports an error while calling predecessor_selector::get and successor_selector::get
+// Seems like the well-formed expression in trailing decltype is treated as ill-formed
+// TODO: investigate problems with decltype in trailing return types or find the cross-platform solution
+#define __TBB_MSVC_DISABLE_TRAILING_DECLTYPE (_MSC_VER >= 1900)
+
+namespace order {
+struct undefined {};
+struct following {};
+struct preceding {};
+}
+
+class get_graph_helper {
+public:
+ // TODO: consider making graph_reference() public and consistent interface to get a reference to the graph
+ // and remove get_graph_helper
+ template <typename T>
+ static graph& get(const T& object) {
+ return get_impl(object, std::is_base_of<graph_node, T>());
+ }
+
+private:
+ // Get graph from the object of type derived from graph_node
+ template <typename T>
+ static graph& get_impl(const T& object, std::true_type) {
+ return static_cast<const graph_node*>(&object)->my_graph;
+ }
+
+ template <typename T>
+ static graph& get_impl(const T& object, std::false_type) {
+ return object.graph_reference();
+ }
+};
+
+template<typename Order, typename... Nodes>
+struct node_set {
+ typedef Order order_type;
+
+ std::tuple<Nodes&...> nodes;
+ node_set(Nodes&... ns) : nodes(ns...) {}
+
+ template <typename... Nodes2>
+ node_set(const node_set<order::undefined, Nodes2...>& set) : nodes(set.nodes) {}
+
+ graph& graph_reference() const {
+ return get_graph_helper::get(std::get<0>(nodes));
+ }
+};
+
+namespace alias_helpers {
+template <typename T> using output_type = typename T::output_type;
+template <typename T> using output_ports_type = typename T::output_ports_type;
+template <typename T> using input_type = typename T::input_type;
+template <typename T> using input_ports_type = typename T::input_ports_type;
+} // namespace alias_helpers
+
+template <typename T>
+using has_output_type = supports<T, alias_helpers::output_type>;
+
+template <typename T>
+using has_input_type = supports<T, alias_helpers::input_type>;
+
+template <typename T>
+using has_input_ports_type = supports<T, alias_helpers::input_ports_type>;
+
+template <typename T>
+using has_output_ports_type = supports<T, alias_helpers::output_ports_type>;
+
+template<typename T>
+struct is_sender : std::is_base_of<sender<typename T::output_type>, T> {};
+
+template<typename T>
+struct is_receiver : std::is_base_of<receiver<typename T::input_type>, T> {};
+
+template <typename Node>
+struct is_async_node : std::false_type {};
+
+template <typename... Args>
+struct is_async_node<async_node<Args...>> : std::true_type {};
+
+template<typename FirstPredecessor, typename... Predecessors>
+node_set<order::following, FirstPredecessor, Predecessors...>
+follows(FirstPredecessor& first_predecessor, Predecessors&... predecessors) {
+ static_assert((conjunction<has_output_type<FirstPredecessor>,
+ has_output_type<Predecessors>...>::value),
+                  "Not all of the node's predecessors have an output_type typedef");
+ static_assert((conjunction<is_sender<FirstPredecessor>, is_sender<Predecessors>...>::value),
+ "Not all node's predecessors are senders");
+ return node_set<order::following, FirstPredecessor, Predecessors...>(first_predecessor, predecessors...);
+}
+
+template<typename... Predecessors>
+node_set<order::following, Predecessors...>
+follows(node_set<order::undefined, Predecessors...>& predecessors_set) {
+ static_assert((conjunction<has_output_type<Predecessors>...>::value),
+                  "Not all nodes in the set have an output_type typedef");
+ static_assert((conjunction<is_sender<Predecessors>...>::value),
+ "Not all nodes in the set are senders");
+ return node_set<order::following, Predecessors...>(predecessors_set);
+}
+
+template<typename FirstSuccessor, typename... Successors>
+node_set<order::preceding, FirstSuccessor, Successors...>
+precedes(FirstSuccessor& first_successor, Successors&... successors) {
+ static_assert((conjunction<has_input_type<FirstSuccessor>,
+ has_input_type<Successors>...>::value),
+                  "Not all of the node's successors have an input_type typedef");
+ static_assert((conjunction<is_receiver<FirstSuccessor>, is_receiver<Successors>...>::value),
+ "Not all node's successors are receivers");
+ return node_set<order::preceding, FirstSuccessor, Successors...>(first_successor, successors...);
+}
+
+template<typename... Successors>
+node_set<order::preceding, Successors...>
+precedes(node_set<order::undefined, Successors...>& successors_set) {
+ static_assert((conjunction<has_input_type<Successors>...>::value),
+                  "Not all nodes in the set have an input_type typedef");
+ static_assert((conjunction<is_receiver<Successors>...>::value),
+ "Not all nodes in the set are receivers");
+ return node_set<order::preceding, Successors...>(successors_set);
+}
+
+template <typename Node, typename... Nodes>
+node_set<order::undefined, Node, Nodes...>
+make_node_set(Node& first_node, Nodes&... nodes) {
+ return node_set<order::undefined, Node, Nodes...>(first_node, nodes...);
+}
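+
+// Illustrative usage sketch (editorial note, not part of the upstream header).
+// Assuming the preview feature is enabled, the builders above let a node be
+// constructed with its edges declared up front, e.g.:
+//
+//     tbb::flow::graph g;
+//     tbb::flow::broadcast_node<int> b1(g), b2(g);
+//     tbb::flow::function_node f(tbb::flow::follows(b1, b2), tbb::flow::unlimited,
+//                                [](int x) { return x + 1; });   // makes edges b1->f and b2->f
+//
+//     auto set = tbb::flow::make_node_set(b1, b2);               // order::undefined set
+//     tbb::flow::buffer_node<int> buf(tbb::flow::precedes(b1));  // makes edge buf->b1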
+
+template<size_t I>
+class successor_selector {
+ template <typename NodeType>
+ static auto get_impl(NodeType& node, std::true_type) -> decltype(input_port<I>(node)) {
+ return input_port<I>(node);
+ }
+
+ template <typename NodeType>
+ static NodeType& get_impl(NodeType& node, std::false_type) { return node; }
+
+public:
+ template <typename NodeType>
+#if __TBB_MSVC_DISABLE_TRAILING_DECLTYPE
+ static auto& get(NodeType& node)
+#else
+ static auto get(NodeType& node) -> decltype(get_impl(node, has_input_ports_type<NodeType>()))
+#endif
+ {
+ return get_impl(node, has_input_ports_type<NodeType>());
+ }
+};
+
+template<size_t I>
+class predecessor_selector {
+ template <typename NodeType>
+ static auto internal_get(NodeType& node, std::true_type) -> decltype(output_port<I>(node)) {
+ return output_port<I>(node);
+ }
+
+ template <typename NodeType>
+ static NodeType& internal_get(NodeType& node, std::false_type) { return node;}
+
+ template <typename NodeType>
+#if __TBB_MSVC_DISABLE_TRAILING_DECLTYPE
+ static auto& get_impl(NodeType& node, std::false_type)
+#else
+ static auto get_impl(NodeType& node, std::false_type) -> decltype(internal_get(node, has_output_ports_type<NodeType>()))
+#endif
+ {
+ return internal_get(node, has_output_ports_type<NodeType>());
+ }
+
+ template <typename AsyncNode>
+ static AsyncNode& get_impl(AsyncNode& node, std::true_type) { return node; }
+
+public:
+ template <typename NodeType>
+#if __TBB_MSVC_DISABLE_TRAILING_DECLTYPE
+ static auto& get(NodeType& node)
+#else
+ static auto get(NodeType& node) -> decltype(get_impl(node, is_async_node<NodeType>()))
+#endif
+ {
+ return get_impl(node, is_async_node<NodeType>());
+ }
+};
+
+template<size_t I>
+class make_edges_helper {
+public:
+ template<typename PredecessorsTuple, typename NodeType>
+ static void connect_predecessors(PredecessorsTuple& predecessors, NodeType& node) {
+ make_edge(std::get<I>(predecessors), successor_selector<I>::get(node));
+ make_edges_helper<I - 1>::connect_predecessors(predecessors, node);
+ }
+
+ template<typename SuccessorsTuple, typename NodeType>
+ static void connect_successors(NodeType& node, SuccessorsTuple& successors) {
+ make_edge(predecessor_selector<I>::get(node), std::get<I>(successors));
+ make_edges_helper<I - 1>::connect_successors(node, successors);
+ }
+};
+
+template<>
+struct make_edges_helper<0> {
+ template<typename PredecessorsTuple, typename NodeType>
+ static void connect_predecessors(PredecessorsTuple& predecessors, NodeType& node) {
+ make_edge(std::get<0>(predecessors), successor_selector<0>::get(node));
+ }
+
+ template<typename SuccessorsTuple, typename NodeType>
+ static void connect_successors(NodeType& node, SuccessorsTuple& successors) {
+ make_edge(predecessor_selector<0>::get(node), std::get<0>(successors));
+ }
+};
+
+// TODO: consider adding an overload for making edges between node sets
+template<typename NodeType, typename OrderFlagType, typename... Args>
+void make_edges(const node_set<OrderFlagType, Args...>& s, NodeType& node) {
+ const std::size_t SetSize = std::tuple_size<decltype(s.nodes)>::value;
+ make_edges_helper<SetSize - 1>::connect_predecessors(s.nodes, node);
+}
+
+template <typename NodeType, typename OrderFlagType, typename... Args>
+void make_edges(NodeType& node, const node_set<OrderFlagType, Args...>& s) {
+ const std::size_t SetSize = std::tuple_size<decltype(s.nodes)>::value;
+ make_edges_helper<SetSize - 1>::connect_successors(node, s.nodes);
+}
+
+template <typename NodeType, typename... Nodes>
+void make_edges_in_order(const node_set<order::following, Nodes...>& ns, NodeType& node) {
+ make_edges(ns, node);
+}
+
+template <typename NodeType, typename... Nodes>
+void make_edges_in_order(const node_set<order::preceding, Nodes...>& ns, NodeType& node) {
+ make_edges(node, ns);
+}
+
+#endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+
+#endif // __TBB_flow_graph_node_set_impl_H
diff --git a/contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_nodes_deduction.h b/contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_nodes_deduction.h
new file mode 100644
index 0000000000..8c20993795
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_nodes_deduction.h
@@ -0,0 +1,277 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_flow_graph_nodes_deduction_H
+#define __TBB_flow_graph_nodes_deduction_H
+
+#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+template <typename Input, typename Output>
+struct declare_body_types {
+ using input_type = Input;
+ using output_type = Output;
+};
+
+struct NoInputBody {};
+
+template <typename Output>
+struct declare_body_types<NoInputBody, Output> {
+ using output_type = Output;
+};
+
+template <typename T> struct body_types;
+
+template <typename T, typename Input, typename Output>
+struct body_types<Output (T::*)(const Input&) const> : declare_body_types<Input, Output> {};
+
+template <typename T, typename Input, typename Output>
+struct body_types<Output (T::*)(const Input&)> : declare_body_types<Input, Output> {};
+
+template <typename T, typename Input, typename Output>
+struct body_types<Output (T::*)(Input&) const> : declare_body_types<Input, Output> {};
+
+template <typename T, typename Input, typename Output>
+struct body_types<Output (T::*)(Input&)> : declare_body_types<Input, Output> {};
+
+template <typename T, typename Output>
+struct body_types<Output (T::*)(flow_control&) const> : declare_body_types<NoInputBody, Output> {};
+
+template <typename T, typename Output>
+struct body_types<Output (T::*)(flow_control&)> : declare_body_types<NoInputBody, Output> {};
+
+template <typename Input, typename Output>
+struct body_types<Output (*)(Input&)> : declare_body_types<Input, Output> {};
+
+template <typename Input, typename Output>
+struct body_types<Output (*)(const Input&)> : declare_body_types<Input, Output> {};
+
+template <typename Output>
+struct body_types<Output (*)(flow_control&)> : declare_body_types<NoInputBody, Output> {};
+
+template <typename Body>
+using input_t = typename body_types<Body>::input_type;
+
+template <typename Body>
+using output_t = typename body_types<Body>::output_type;
+
+template <typename T, typename Input, typename Output>
+auto decide_on_operator_overload(Output (T::*name)(const Input&) const)->decltype(name);
+
+template <typename T, typename Input, typename Output>
+auto decide_on_operator_overload(Output (T::*name)(const Input&))->decltype(name);
+
+template <typename T, typename Input, typename Output>
+auto decide_on_operator_overload(Output (T::*name)(Input&) const)->decltype(name);
+
+template <typename T, typename Input, typename Output>
+auto decide_on_operator_overload(Output (T::*name)(Input&))->decltype(name);
+
+template <typename Input, typename Output>
+auto decide_on_operator_overload(Output (*name)(const Input&))->decltype(name);
+
+template <typename Input, typename Output>
+auto decide_on_operator_overload(Output (*name)(Input&))->decltype(name);
+
+template <typename Body>
+decltype(decide_on_operator_overload(&Body::operator())) decide_on_callable_type(int);
+
+template <typename Body>
+decltype(decide_on_operator_overload(std::declval<Body>())) decide_on_callable_type(...);
+
+// Deduction guides for Flow Graph nodes
+
+template <typename GraphOrSet, typename Body>
+input_node(GraphOrSet&&, Body)
+->input_node<output_t<decltype(decide_on_callable_type<Body>(0))>>;
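+
+// Illustrative sketch (editorial note, not part of the upstream header): the guide
+// above deduces the template argument of input_node from the body's return type,
+// e.g. for a hypothetical counting source in a graph g:
+//
+//     tbb::flow::input_node src(g, [i = 0](tbb::flow_control& fc) mutable {
+//         if (i >= 10) { fc.stop(); return 0; }
+//         return i++;
+//     });   // deduced as input_node<int>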
+
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+
+template <typename NodeSet>
+struct decide_on_set;
+
+template <typename Node, typename... Nodes>
+struct decide_on_set<node_set<order::following, Node, Nodes...>> {
+ using type = typename Node::output_type;
+};
+
+template <typename Node, typename... Nodes>
+struct decide_on_set<node_set<order::preceding, Node, Nodes...>> {
+ using type = typename Node::input_type;
+};
+
+template <typename NodeSet>
+using decide_on_set_t = typename decide_on_set<std::decay_t<NodeSet>>::type;
+
+template <typename NodeSet>
+broadcast_node(const NodeSet&)
+->broadcast_node<decide_on_set_t<NodeSet>>;
+
+template <typename NodeSet>
+buffer_node(const NodeSet&)
+->buffer_node<decide_on_set_t<NodeSet>>;
+
+template <typename NodeSet>
+queue_node(const NodeSet&)
+->queue_node<decide_on_set_t<NodeSet>>;
+#endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+
+template <typename GraphOrProxy, typename Sequencer>
+sequencer_node(GraphOrProxy&&, Sequencer)
+->sequencer_node<input_t<decltype(decide_on_callable_type<Sequencer>(0))>>;
+
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+template <typename NodeSet, typename Compare>
+priority_queue_node(const NodeSet&, const Compare&)
+->priority_queue_node<decide_on_set_t<NodeSet>, Compare>;
+
+template <typename NodeSet>
+priority_queue_node(const NodeSet&)
+->priority_queue_node<decide_on_set_t<NodeSet>, std::less<decide_on_set_t<NodeSet>>>;
+#endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+
+template <typename Key>
+struct join_key {
+ using type = Key;
+};
+
+template <typename T>
+struct join_key<const T&> {
+ using type = T&;
+};
+
+template <typename Key>
+using join_key_t = typename join_key<Key>::type;
+
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+template <typename Policy, typename... Predecessors>
+join_node(const node_set<order::following, Predecessors...>&, Policy)
+->join_node<std::tuple<typename Predecessors::output_type...>,
+ Policy>;
+
+template <typename Policy, typename Successor, typename... Successors>
+join_node(const node_set<order::preceding, Successor, Successors...>&, Policy)
+->join_node<typename Successor::input_type, Policy>;
+
+template <typename... Predecessors>
+join_node(const node_set<order::following, Predecessors...>)
+->join_node<std::tuple<typename Predecessors::output_type...>,
+ queueing>;
+
+template <typename Successor, typename... Successors>
+join_node(const node_set<order::preceding, Successor, Successors...>)
+->join_node<typename Successor::input_type, queueing>;
+#endif
+
+template <typename GraphOrProxy, typename Body, typename... Bodies>
+join_node(GraphOrProxy&&, Body, Bodies...)
+->join_node<std::tuple<input_t<decltype(decide_on_callable_type<Body>(0))>,
+ input_t<decltype(decide_on_callable_type<Bodies>(0))>...>,
+ key_matching<join_key_t<output_t<decltype(decide_on_callable_type<Body>(0))>>>>;
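+
+// Illustrative sketch (editorial note, not part of the upstream header): the guide
+// above deduces a key-matching join from the key-extraction bodies, e.g. assuming a
+// graph g and two hypothetical message types A and B that each carry an `int id` field:
+//
+//     tbb::flow::join_node jn(g, [](const A& a) { return a.id; },
+//                                [](const B& b) { return b.id; });
+//     // deduced as join_node<std::tuple<A, B>, key_matching<int>>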
+
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+template <typename... Predecessors>
+indexer_node(const node_set<order::following, Predecessors...>&)
+->indexer_node<typename Predecessors::output_type...>;
+#endif
+
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+template <typename NodeSet>
+limiter_node(const NodeSet&, size_t)
+->limiter_node<decide_on_set_t<NodeSet>>;
+
+template <typename Predecessor, typename... Predecessors>
+split_node(const node_set<order::following, Predecessor, Predecessors...>&)
+->split_node<typename Predecessor::output_type>;
+
+template <typename... Successors>
+split_node(const node_set<order::preceding, Successors...>&)
+->split_node<std::tuple<typename Successors::input_type...>>;
+
+#endif
+
+template <typename GraphOrSet, typename Body, typename Policy>
+function_node(GraphOrSet&&,
+ size_t, Body,
+ Policy, node_priority_t = no_priority)
+->function_node<input_t<decltype(decide_on_callable_type<Body>(0))>,
+ output_t<decltype(decide_on_callable_type<Body>(0))>,
+ Policy>;
+
+template <typename GraphOrSet, typename Body>
+function_node(GraphOrSet&&, size_t,
+ Body, node_priority_t = no_priority)
+->function_node<input_t<decltype(decide_on_callable_type<Body>(0))>,
+ output_t<decltype(decide_on_callable_type<Body>(0))>,
+ queueing>;
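+
+// Illustrative sketch (editorial note, not part of the upstream header): with the
+// guides above, both message types of a function_node are deduced from the body,
+// e.g. assuming a graph g:
+//
+//     tbb::flow::function_node square(g, tbb::flow::unlimited,
+//                                     [](int x) { return x * x; });
+//     // deduced as function_node<int, int, queueing>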
+
+template <typename Output>
+struct continue_output {
+ using type = Output;
+};
+
+template <>
+struct continue_output<void> {
+ using type = continue_msg;
+};
+
+template <typename T>
+using continue_output_t = typename continue_output<T>::type;
+
+template <typename GraphOrSet, typename Body, typename Policy>
+continue_node(GraphOrSet&&, Body,
+ Policy, node_priority_t = no_priority)
+->continue_node<continue_output_t<std::invoke_result_t<Body, continue_msg>>,
+ Policy>;
+
+template <typename GraphOrSet, typename Body, typename Policy>
+continue_node(GraphOrSet&&,
+ int, Body,
+ Policy, node_priority_t = no_priority)
+->continue_node<continue_output_t<std::invoke_result_t<Body, continue_msg>>,
+ Policy>;
+
+template <typename GraphOrSet, typename Body>
+continue_node(GraphOrSet&&,
+ Body, node_priority_t = no_priority)
+->continue_node<continue_output_t<std::invoke_result_t<Body, continue_msg>>, Policy<void>>;
+
+template <typename GraphOrSet, typename Body>
+continue_node(GraphOrSet&&, int,
+ Body, node_priority_t = no_priority)
+->continue_node<continue_output_t<std::invoke_result_t<Body, continue_msg>>,
+ Policy<void>>;
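+
+// Illustrative sketch (editorial note, not part of the upstream header): a body that
+// returns void maps to continue_msg via continue_output_t, e.g. assuming a graph g:
+//
+//     tbb::flow::continue_node done(g, [](tbb::flow::continue_msg) { /* side effect */ });
+//     // deduced as continue_node<continue_msg> with the default policy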
+
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+
+template <typename NodeSet>
+overwrite_node(const NodeSet&)
+->overwrite_node<decide_on_set_t<NodeSet>>;
+
+template <typename NodeSet>
+write_once_node(const NodeSet&)
+->write_once_node<decide_on_set_t<NodeSet>>;
+#endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+} // namespace d1
+} // namespace detail
+} // namespace tbb
+
+#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
+
+#endif // __TBB_flow_graph_nodes_deduction_H
diff --git a/contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_tagged_buffer_impl.h b/contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_tagged_buffer_impl.h
new file mode 100644
index 0000000000..0c4580a199
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_tagged_buffer_impl.h
@@ -0,0 +1,256 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+// A hash table buffer that can expand and supports as many deletions as additions.
+// It is list-based, with the list elements held in an array (for destruction
+// management), and uses multiplicative hashing (like ets). No synchronization is built in.
+//
+
+#ifndef __TBB__flow_graph_hash_buffer_impl_H
+#define __TBB__flow_graph_hash_buffer_impl_H
+
+#ifndef __TBB_flow_graph_H
+#error Do not #include this internal file directly; use public TBB headers instead.
+#endif
+
+// included in namespace tbb::detail::d1 (in flow_graph.h)
+
+// elements in the table form a simple list; we need a pointer to the next element to
+// traverse the chain
+template<typename ValueType>
+struct buffer_element_type {
+ // the second parameter below is void * because we can't forward-declare the type
+ // itself, so we just reinterpret_cast below.
+ typedef typename aligned_pair<ValueType, void *>::type type;
+};
+
+template
+ <
+ typename Key, // type of key within ValueType
+ typename ValueType,
+ typename ValueToKey, // abstract method that returns "const Key" or "const Key&" given ValueType
+ typename HashCompare, // has hash and equal
+ typename Allocator=tbb::cache_aligned_allocator< typename aligned_pair<ValueType, void *>::type >
+ >
+class hash_buffer : public HashCompare {
+public:
+ static const size_t INITIAL_SIZE = 8; // initial size of the hash pointer table
+ typedef ValueType value_type;
+ typedef typename buffer_element_type< value_type >::type element_type;
+ typedef value_type *pointer_type;
+ typedef element_type *list_array_type; // array we manage manually
+ typedef list_array_type *pointer_array_type;
+ typedef typename std::allocator_traits<Allocator>::template rebind_alloc<list_array_type> pointer_array_allocator_type;
+ typedef typename std::allocator_traits<Allocator>::template rebind_alloc<element_type> elements_array_allocator;
+ typedef typename std::decay<Key>::type Knoref;
+
+private:
+ ValueToKey *my_key;
+ size_t my_size;
+ size_t nelements;
+ pointer_array_type pointer_array; // pointer_array[my_size]
+ list_array_type elements_array; // elements_array[my_size / 2]
+ element_type* free_list;
+
+ size_t mask() { return my_size - 1; }
+
+ void set_up_free_list( element_type **p_free_list, list_array_type la, size_t sz) {
+ for(size_t i=0; i < sz - 1; ++i ) { // construct free list
+ la[i].second = &(la[i+1]);
+ }
+ la[sz-1].second = NULL;
+ *p_free_list = (element_type *)&(la[0]);
+ }
+
+ // cleanup for exceptions
+ struct DoCleanup {
+ pointer_array_type *my_pa;
+ list_array_type *my_elements;
+ size_t my_size;
+
+ DoCleanup(pointer_array_type &pa, list_array_type &my_els, size_t sz) :
+ my_pa(&pa), my_elements(&my_els), my_size(sz) { }
+ ~DoCleanup() {
+ if(my_pa) {
+ size_t dont_care = 0;
+ internal_free_buffer(*my_pa, *my_elements, my_size, dont_care);
+ }
+ }
+ };
+
+ // exception-safety requires we do all the potentially-throwing operations first
+ void grow_array() {
+ size_t new_size = my_size*2;
+ size_t new_nelements = nelements; // internal_free_buffer zeroes this
+ list_array_type new_elements_array = NULL;
+ pointer_array_type new_pointer_array = NULL;
+ list_array_type new_free_list = NULL;
+ {
+ DoCleanup my_cleanup(new_pointer_array, new_elements_array, new_size);
+ new_elements_array = elements_array_allocator().allocate(my_size);
+ new_pointer_array = pointer_array_allocator_type().allocate(new_size);
+ for(size_t i=0; i < new_size; ++i) new_pointer_array[i] = NULL;
+ set_up_free_list(&new_free_list, new_elements_array, my_size );
+
+ for(size_t i=0; i < my_size; ++i) {
+ for( element_type* op = pointer_array[i]; op; op = (element_type *)(op->second)) {
+ value_type *ov = reinterpret_cast<value_type *>(&(op->first));
+ // could have std::move semantics
+ internal_insert_with_key(new_pointer_array, new_size, new_free_list, *ov);
+ }
+ }
+ my_cleanup.my_pa = NULL;
+ my_cleanup.my_elements = NULL;
+ }
+
+ internal_free_buffer(pointer_array, elements_array, my_size, nelements);
+ free_list = new_free_list;
+ pointer_array = new_pointer_array;
+ elements_array = new_elements_array;
+ my_size = new_size;
+ nelements = new_nelements;
+ }
+
+    // v could be perfectly forwarded if std::move semantics were implemented.
+    // We use this method to move elements in grow_array, so it cannot rely on class fields.
+ void internal_insert_with_key( element_type **p_pointer_array, size_t p_sz, list_array_type &p_free_list,
+ const value_type &v) {
+ size_t l_mask = p_sz-1;
+ __TBB_ASSERT(my_key, "Error: value-to-key functor not provided");
+ size_t h = this->hash((*my_key)(v)) & l_mask;
+ __TBB_ASSERT(p_free_list, "Error: free list not set up.");
+ element_type* my_elem = p_free_list; p_free_list = (element_type *)(p_free_list->second);
+ (void) new(&(my_elem->first)) value_type(v);
+ my_elem->second = p_pointer_array[h];
+ p_pointer_array[h] = my_elem;
+ }
+
+ void internal_initialize_buffer() {
+ pointer_array = pointer_array_allocator_type().allocate(my_size);
+ for(size_t i = 0; i < my_size; ++i) pointer_array[i] = NULL;
+ elements_array = elements_array_allocator().allocate(my_size / 2);
+ set_up_free_list(&free_list, elements_array, my_size / 2);
+ }
+
+    // made static so an enclosed class can use it to properly dispose of the internals
+ static void internal_free_buffer( pointer_array_type &pa, list_array_type &el, size_t &sz, size_t &ne ) {
+ if(pa) {
+ for(size_t i = 0; i < sz; ++i ) {
+ element_type *p_next;
+ for( element_type *p = pa[i]; p; p = p_next) {
+ p_next = (element_type *)p->second;
+ // TODO revamp: make sure type casting is correct.
+ void* ptr = (void*)(p->first);
+#if _MSC_VER && _MSC_VER <= 1900 && !__INTEL_COMPILER
+ suppress_unused_warning(ptr);
+#endif
+ ((value_type*)ptr)->~value_type();
+ }
+ }
+ pointer_array_allocator_type().deallocate(pa, sz);
+ pa = NULL;
+ }
+        // Tested separately: if the allocation of pa throws, el may already be allocated,
+        // but no elements will have been constructed.
+ if(el) {
+ elements_array_allocator().deallocate(el, sz / 2);
+ el = NULL;
+ }
+ sz = INITIAL_SIZE;
+ ne = 0;
+ }
+
+public:
+ hash_buffer() : my_key(NULL), my_size(INITIAL_SIZE), nelements(0) {
+ internal_initialize_buffer();
+ }
+
+ ~hash_buffer() {
+ internal_free_buffer(pointer_array, elements_array, my_size, nelements);
+ if(my_key) delete my_key;
+ }
+ hash_buffer(const hash_buffer&) = delete;
+ hash_buffer& operator=(const hash_buffer&) = delete;
+
+ void reset() {
+ internal_free_buffer(pointer_array, elements_array, my_size, nelements);
+ internal_initialize_buffer();
+ }
+
+    // Takes ownership of a functor object allocated with new.
+    // This method is only used internally, so it cannot be misused by the user.
+ void set_key_func(ValueToKey *vtk) { my_key = vtk; }
+ // pointer is used to clone()
+ ValueToKey* get_key_func() { return my_key; }
+
+ bool insert_with_key(const value_type &v) {
+ pointer_type p = NULL;
+ __TBB_ASSERT(my_key, "Error: value-to-key functor not provided");
+ if(find_ref_with_key((*my_key)(v), p)) {
+ p->~value_type();
+ (void) new(p) value_type(v); // copy-construct into the space
+ return false;
+ }
+ ++nelements;
+ if(nelements*2 > my_size) grow_array();
+ internal_insert_with_key(pointer_array, my_size, free_list, v);
+ return true;
+ }
+
+    // returns true and sets v to the found array element; otherwise returns false.
+ bool find_ref_with_key(const Knoref& k, pointer_type &v) {
+ size_t i = this->hash(k) & mask();
+ for(element_type* p = pointer_array[i]; p; p = (element_type *)(p->second)) {
+ pointer_type pv = reinterpret_cast<pointer_type>(&(p->first));
+ __TBB_ASSERT(my_key, "Error: value-to-key functor not provided");
+ if(this->equal((*my_key)(*pv), k)) {
+ v = pv;
+ return true;
+ }
+ }
+ return false;
+ }
+
+ bool find_with_key( const Knoref& k, value_type &v) {
+ value_type *p;
+ if(find_ref_with_key(k, p)) {
+ v = *p;
+ return true;
+ }
+ else
+ return false;
+ }
+
+ void delete_with_key(const Knoref& k) {
+ size_t h = this->hash(k) & mask();
+ element_type* prev = NULL;
+ for(element_type* p = pointer_array[h]; p; prev = p, p = (element_type *)(p->second)) {
+ value_type *vp = reinterpret_cast<value_type *>(&(p->first));
+ __TBB_ASSERT(my_key, "Error: value-to-key functor not provided");
+ if(this->equal((*my_key)(*vp), k)) {
+ vp->~value_type();
+ if(prev) prev->second = p->second;
+ else pointer_array[h] = (element_type *)(p->second);
+ p->second = free_list;
+ free_list = p;
+ --nelements;
+ return;
+ }
+ }
+ __TBB_ASSERT(false, "key not found for delete");
+ }
+};
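+
+// Illustrative usage sketch (editorial note, not part of the upstream header).
+// hash_buffer is an internal detail of key-matching join ports; conceptually it is
+// used like this, with a hypothetical key-extraction functor handed over via new:
+//
+//     struct get_first {
+//         const int& operator()(const std::pair<int, std::string>& v) const { return v.first; }
+//     };
+//     hash_buffer<int, std::pair<int, std::string>, get_first, tbb_hash_compare<int>> buf;
+//     buf.set_key_func(new get_first);          // the buffer takes ownership
+//     buf.insert_with_key({1, "one"});          // hashed by key 1
+//     std::pair<int, std::string> out;
+//     bool found = buf.find_with_key(1, out);   // true, out == {1, "one"}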
+#endif // __TBB__flow_graph_hash_buffer_impl_H
diff --git a/contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_trace_impl.h b/contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_trace_impl.h
new file mode 100644
index 0000000000..d8256ca8a2
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_trace_impl.h
@@ -0,0 +1,364 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef _FGT_GRAPH_TRACE_IMPL_H
+#define _FGT_GRAPH_TRACE_IMPL_H
+
+#include "../profiling.h"
+#if (_MSC_VER >= 1900)
+ #include <intrin.h>
+#endif
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+template< typename T > class sender;
+template< typename T > class receiver;
+
+#if TBB_USE_PROFILING_TOOLS
+ #if __TBB_FLOW_TRACE_CODEPTR
+ #if (_MSC_VER >= 1900)
+ #define CODEPTR() (_ReturnAddress())
+ #elif __TBB_GCC_VERSION >= 40800
+ #define CODEPTR() ( __builtin_return_address(0))
+ #else
+ #define CODEPTR() NULL
+ #endif
+ #else
+ #define CODEPTR() NULL
+ #endif /* __TBB_FLOW_TRACE_CODEPTR */
+
+static inline void fgt_alias_port(void *node, void *p, bool visible) {
+ if(visible)
+ itt_relation_add( ITT_DOMAIN_FLOW, node, FLOW_NODE, __itt_relation_is_parent_of, p, FLOW_NODE );
+ else
+ itt_relation_add( ITT_DOMAIN_FLOW, p, FLOW_NODE, __itt_relation_is_child_of, node, FLOW_NODE );
+}
+
+static inline void fgt_composite ( void* codeptr, void *node, void *graph ) {
+ itt_make_task_group( ITT_DOMAIN_FLOW, node, FLOW_NODE, graph, FLOW_GRAPH, FLOW_COMPOSITE_NODE );
+ suppress_unused_warning( codeptr );
+#if __TBB_FLOW_TRACE_CODEPTR
+ if (codeptr != NULL) {
+ register_node_addr(ITT_DOMAIN_FLOW, node, FLOW_NODE, CODE_ADDRESS, &codeptr);
+ }
+#endif
+}
+
+static inline void fgt_internal_alias_input_port( void *node, void *p, string_resource_index name_index ) {
+ itt_make_task_group( ITT_DOMAIN_FLOW, p, FLOW_INPUT_PORT, node, FLOW_NODE, name_index );
+ itt_relation_add( ITT_DOMAIN_FLOW, node, FLOW_NODE, __itt_relation_is_parent_of, p, FLOW_INPUT_PORT );
+}
+
+static inline void fgt_internal_alias_output_port( void *node, void *p, string_resource_index name_index ) {
+ itt_make_task_group( ITT_DOMAIN_FLOW, p, FLOW_OUTPUT_PORT, node, FLOW_NODE, name_index );
+ itt_relation_add( ITT_DOMAIN_FLOW, node, FLOW_NODE, __itt_relation_is_parent_of, p, FLOW_OUTPUT_PORT );
+}
+
+template<typename InputType>
+void alias_input_port(void *node, receiver<InputType>* port, string_resource_index name_index) {
+ // TODO: Make fgt_internal_alias_input_port a function template?
+ fgt_internal_alias_input_port( node, port, name_index);
+}
+
+template < typename PortsTuple, int N >
+struct fgt_internal_input_alias_helper {
+ static void alias_port( void *node, PortsTuple &ports ) {
+ alias_input_port( node, &(std::get<N-1>(ports)), static_cast<string_resource_index>(FLOW_INPUT_PORT_0 + N - 1) );
+ fgt_internal_input_alias_helper<PortsTuple, N-1>::alias_port( node, ports );
+ }
+};
+
+template < typename PortsTuple >
+struct fgt_internal_input_alias_helper<PortsTuple, 0> {
+ static void alias_port( void * /* node */, PortsTuple & /* ports */ ) { }
+};
+
+template<typename OutputType>
+void alias_output_port(void *node, sender<OutputType>* port, string_resource_index name_index) {
+ // TODO: Make fgt_internal_alias_output_port a function template?
+ fgt_internal_alias_output_port( node, static_cast<void *>(port), name_index);
+}
+
+template < typename PortsTuple, int N >
+struct fgt_internal_output_alias_helper {
+ static void alias_port( void *node, PortsTuple &ports ) {
+ alias_output_port( node, &(std::get<N-1>(ports)), static_cast<string_resource_index>(FLOW_OUTPUT_PORT_0 + N - 1) );
+ fgt_internal_output_alias_helper<PortsTuple, N-1>::alias_port( node, ports );
+ }
+};
+
+template < typename PortsTuple >
+struct fgt_internal_output_alias_helper<PortsTuple, 0> {
+ static void alias_port( void * /*node*/, PortsTuple &/*ports*/ ) {
+ }
+};
+
+static inline void fgt_internal_create_input_port( void *node, void *p, string_resource_index name_index ) {
+ itt_make_task_group( ITT_DOMAIN_FLOW, p, FLOW_INPUT_PORT, node, FLOW_NODE, name_index );
+}
+
+static inline void fgt_internal_create_output_port( void* codeptr, void *node, void *p, string_resource_index name_index ) {
+ itt_make_task_group(ITT_DOMAIN_FLOW, p, FLOW_OUTPUT_PORT, node, FLOW_NODE, name_index);
+ suppress_unused_warning( codeptr );
+#if __TBB_FLOW_TRACE_CODEPTR
+ if (codeptr != NULL) {
+ register_node_addr(ITT_DOMAIN_FLOW, node, FLOW_NODE, CODE_ADDRESS, &codeptr);
+ }
+#endif
+}
+
+template<typename InputType>
+void register_input_port(void *node, receiver<InputType>* port, string_resource_index name_index) {
+ // TODO: Make fgt_internal_create_input_port a function template?
+ fgt_internal_create_input_port(node, static_cast<void*>(port), name_index);
+}
+
+template < typename PortsTuple, int N >
+struct fgt_internal_input_helper {
+ static void register_port( void *node, PortsTuple &ports ) {
+ register_input_port( node, &(std::get<N-1>(ports)), static_cast<string_resource_index>(FLOW_INPUT_PORT_0 + N - 1) );
+ fgt_internal_input_helper<PortsTuple, N-1>::register_port( node, ports );
+ }
+};
+
+template < typename PortsTuple >
+struct fgt_internal_input_helper<PortsTuple, 1> {
+ static void register_port( void *node, PortsTuple &ports ) {
+ register_input_port( node, &(std::get<0>(ports)), FLOW_INPUT_PORT_0 );
+ }
+};
+
+template<typename OutputType>
+void register_output_port(void* codeptr, void *node, sender<OutputType>* port, string_resource_index name_index) {
+ // TODO: Make fgt_internal_create_output_port a function template?
+ fgt_internal_create_output_port( codeptr, node, static_cast<void *>(port), name_index);
+}
+
+template < typename PortsTuple, int N >
+struct fgt_internal_output_helper {
+ static void register_port( void* codeptr, void *node, PortsTuple &ports ) {
+ register_output_port( codeptr, node, &(std::get<N-1>(ports)), static_cast<string_resource_index>(FLOW_OUTPUT_PORT_0 + N - 1) );
+ fgt_internal_output_helper<PortsTuple, N-1>::register_port( codeptr, node, ports );
+ }
+};
+
+template < typename PortsTuple >
+struct fgt_internal_output_helper<PortsTuple,1> {
+ static void register_port( void* codeptr, void *node, PortsTuple &ports ) {
+ register_output_port( codeptr, node, &(std::get<0>(ports)), FLOW_OUTPUT_PORT_0 );
+ }
+};
+
+template< typename NodeType >
+void fgt_multioutput_node_desc( const NodeType *node, const char *desc ) {
+ void *addr = (void *)( static_cast< receiver< typename NodeType::input_type > * >(const_cast< NodeType *>(node)) );
+ itt_metadata_str_add( ITT_DOMAIN_FLOW, addr, FLOW_NODE, FLOW_OBJECT_NAME, desc );
+}
+
+template< typename NodeType >
+void fgt_multiinput_multioutput_node_desc( const NodeType *node, const char *desc ) {
+ void *addr = const_cast<NodeType *>(node);
+ itt_metadata_str_add( ITT_DOMAIN_FLOW, addr, FLOW_NODE, FLOW_OBJECT_NAME, desc );
+}
+
+template< typename NodeType >
+static inline void fgt_node_desc( const NodeType *node, const char *desc ) {
+ void *addr = (void *)( static_cast< sender< typename NodeType::output_type > * >(const_cast< NodeType *>(node)) );
+ itt_metadata_str_add( ITT_DOMAIN_FLOW, addr, FLOW_NODE, FLOW_OBJECT_NAME, desc );
+}
+
+static inline void fgt_graph_desc( const void *g, const char *desc ) {
+ void *addr = const_cast< void *>(g);
+ itt_metadata_str_add( ITT_DOMAIN_FLOW, addr, FLOW_GRAPH, FLOW_OBJECT_NAME, desc );
+}
+
+static inline void fgt_body( void *node, void *body ) {
+ itt_relation_add( ITT_DOMAIN_FLOW, body, FLOW_BODY, __itt_relation_is_child_of, node, FLOW_NODE );
+}
+
+template< int N, typename PortsTuple >
+static inline void fgt_multioutput_node(void* codeptr, string_resource_index t, void *g, void *input_port, PortsTuple &ports ) {
+ itt_make_task_group( ITT_DOMAIN_FLOW, input_port, FLOW_NODE, g, FLOW_GRAPH, t );
+ fgt_internal_create_input_port( input_port, input_port, FLOW_INPUT_PORT_0 );
+ fgt_internal_output_helper<PortsTuple, N>::register_port(codeptr, input_port, ports );
+}
+
+template< int N, typename PortsTuple >
+static inline void fgt_multioutput_node_with_body( void* codeptr, string_resource_index t, void *g, void *input_port, PortsTuple &ports, void *body ) {
+ itt_make_task_group( ITT_DOMAIN_FLOW, input_port, FLOW_NODE, g, FLOW_GRAPH, t );
+ fgt_internal_create_input_port( input_port, input_port, FLOW_INPUT_PORT_0 );
+ fgt_internal_output_helper<PortsTuple, N>::register_port( codeptr, input_port, ports );
+ fgt_body( input_port, body );
+}
+
+template< int N, typename PortsTuple >
+static inline void fgt_multiinput_node( void* codeptr, string_resource_index t, void *g, PortsTuple &ports, void *output_port) {
+ itt_make_task_group( ITT_DOMAIN_FLOW, output_port, FLOW_NODE, g, FLOW_GRAPH, t );
+ fgt_internal_create_output_port( codeptr, output_port, output_port, FLOW_OUTPUT_PORT_0 );
+ fgt_internal_input_helper<PortsTuple, N>::register_port( output_port, ports );
+}
+
+static inline void fgt_multiinput_multioutput_node( void* codeptr, string_resource_index t, void *n, void *g ) {
+ itt_make_task_group( ITT_DOMAIN_FLOW, n, FLOW_NODE, g, FLOW_GRAPH, t );
+ suppress_unused_warning( codeptr );
+#if __TBB_FLOW_TRACE_CODEPTR
+ if (codeptr != NULL) {
+ register_node_addr(ITT_DOMAIN_FLOW, n, FLOW_NODE, CODE_ADDRESS, &codeptr);
+ }
+#endif
+}
+
+static inline void fgt_node( void* codeptr, string_resource_index t, void *g, void *output_port ) {
+ itt_make_task_group( ITT_DOMAIN_FLOW, output_port, FLOW_NODE, g, FLOW_GRAPH, t );
+ fgt_internal_create_output_port( codeptr, output_port, output_port, FLOW_OUTPUT_PORT_0 );
+}
+
+static void fgt_node_with_body( void* codeptr, string_resource_index t, void *g, void *output_port, void *body ) {
+ itt_make_task_group( ITT_DOMAIN_FLOW, output_port, FLOW_NODE, g, FLOW_GRAPH, t );
+ fgt_internal_create_output_port(codeptr, output_port, output_port, FLOW_OUTPUT_PORT_0 );
+ fgt_body( output_port, body );
+}
+
+static inline void fgt_node( void* codeptr, string_resource_index t, void *g, void *input_port, void *output_port ) {
+ fgt_node( codeptr, t, g, output_port );
+ fgt_internal_create_input_port( output_port, input_port, FLOW_INPUT_PORT_0 );
+}
+
+static inline void fgt_node_with_body( void* codeptr, string_resource_index t, void *g, void *input_port, void *output_port, void *body ) {
+ fgt_node_with_body( codeptr, t, g, output_port, body );
+ fgt_internal_create_input_port( output_port, input_port, FLOW_INPUT_PORT_0 );
+}
+
+
+static inline void fgt_node( void* codeptr, string_resource_index t, void *g, void *input_port, void *decrement_port, void *output_port ) {
+ fgt_node( codeptr, t, g, input_port, output_port );
+ fgt_internal_create_input_port( output_port, decrement_port, FLOW_INPUT_PORT_1 );
+}
+
+static inline void fgt_make_edge( void *output_port, void *input_port ) {
+ itt_relation_add( ITT_DOMAIN_FLOW, output_port, FLOW_OUTPUT_PORT, __itt_relation_is_predecessor_to, input_port, FLOW_INPUT_PORT);
+}
+
+static inline void fgt_remove_edge( void *output_port, void *input_port ) {
+ itt_relation_add( ITT_DOMAIN_FLOW, output_port, FLOW_OUTPUT_PORT, __itt_relation_is_sibling_of, input_port, FLOW_INPUT_PORT);
+}
+
+static inline void fgt_graph( void *g ) {
+ itt_make_task_group( ITT_DOMAIN_FLOW, g, FLOW_GRAPH, NULL, FLOW_NULL, FLOW_GRAPH );
+}
+
+static inline void fgt_begin_body( void *body ) {
+ itt_task_begin( ITT_DOMAIN_FLOW, body, FLOW_BODY, NULL, FLOW_NULL, FLOW_BODY );
+}
+
+static inline void fgt_end_body( void * ) {
+ itt_task_end( ITT_DOMAIN_FLOW );
+}
+
+static inline void fgt_async_try_put_begin( void *node, void *port ) {
+ itt_task_begin( ITT_DOMAIN_FLOW, port, FLOW_OUTPUT_PORT, node, FLOW_NODE, FLOW_OUTPUT_PORT );
+}
+
+static inline void fgt_async_try_put_end( void *, void * ) {
+ itt_task_end( ITT_DOMAIN_FLOW );
+}
+
+static inline void fgt_async_reserve( void *node, void *graph ) {
+ itt_region_begin( ITT_DOMAIN_FLOW, node, FLOW_NODE, graph, FLOW_GRAPH, FLOW_NULL );
+}
+
+static inline void fgt_async_commit( void *node, void * /*graph*/) {
+ itt_region_end( ITT_DOMAIN_FLOW, node, FLOW_NODE );
+}
+
+static inline void fgt_reserve_wait( void *graph ) {
+ itt_region_begin( ITT_DOMAIN_FLOW, graph, FLOW_GRAPH, NULL, FLOW_NULL, FLOW_NULL );
+}
+
+static inline void fgt_release_wait( void *graph ) {
+ itt_region_end( ITT_DOMAIN_FLOW, graph, FLOW_GRAPH );
+}
+
+#else // TBB_USE_PROFILING_TOOLS
+
+#define CODEPTR() NULL
+
+static inline void fgt_alias_port(void * /*node*/, void * /*p*/, bool /*visible*/ ) { }
+
+static inline void fgt_composite ( void* /*codeptr*/, void * /*node*/, void * /*graph*/ ) { }
+
+static inline void fgt_graph( void * /*g*/ ) { }
+
+template< typename NodeType >
+static inline void fgt_multioutput_node_desc( const NodeType * /*node*/, const char * /*desc*/ ) { }
+
+template< typename NodeType >
+static inline void fgt_node_desc( const NodeType * /*node*/, const char * /*desc*/ ) { }
+
+static inline void fgt_graph_desc( const void * /*g*/, const char * /*desc*/ ) { }
+
+template< int N, typename PortsTuple >
+static inline void fgt_multioutput_node( void* /*codeptr*/, string_resource_index /*t*/, void * /*g*/, void * /*input_port*/, PortsTuple & /*ports*/ ) { }
+
+template< int N, typename PortsTuple >
+static inline void fgt_multioutput_node_with_body( void* /*codeptr*/, string_resource_index /*t*/, void * /*g*/, void * /*input_port*/, PortsTuple & /*ports*/, void * /*body*/ ) { }
+
+template< int N, typename PortsTuple >
+static inline void fgt_multiinput_node( void* /*codeptr*/, string_resource_index /*t*/, void * /*g*/, PortsTuple & /*ports*/, void * /*output_port*/ ) { }
+
+static inline void fgt_multiinput_multioutput_node( void* /*codeptr*/, string_resource_index /*t*/, void * /*node*/, void * /*graph*/ ) { }
+
+static inline void fgt_node( void* /*codeptr*/, string_resource_index /*t*/, void * /*g*/, void * /*input_port*/, void * /*output_port*/ ) { }
+static inline void fgt_node( void* /*codeptr*/, string_resource_index /*t*/, void * /*g*/, void * /*input_port*/, void * /*decrement_port*/, void * /*output_port*/ ) { }
+
+static inline void fgt_node_with_body( void* /*codeptr*/, string_resource_index /*t*/, void * /*g*/, void * /*output_port*/, void * /*body*/ ) { }
+static inline void fgt_node_with_body( void* /*codeptr*/, string_resource_index /*t*/, void * /*g*/, void * /*input_port*/, void * /*output_port*/, void * /*body*/ ) { }
+
+static inline void fgt_make_edge( void * /*output_port*/, void * /*input_port*/ ) { }
+static inline void fgt_remove_edge( void * /*output_port*/, void * /*input_port*/ ) { }
+
+static inline void fgt_begin_body( void * /*body*/ ) { }
+static inline void fgt_end_body( void * /*body*/) { }
+
+static inline void fgt_async_try_put_begin( void * /*node*/, void * /*port*/ ) { }
+static inline void fgt_async_try_put_end( void * /*node*/ , void * /*port*/ ) { }
+static inline void fgt_async_reserve( void * /*node*/, void * /*graph*/ ) { }
+static inline void fgt_async_commit( void * /*node*/, void * /*graph*/ ) { }
+static inline void fgt_reserve_wait( void * /*graph*/ ) { }
+static inline void fgt_release_wait( void * /*graph*/ ) { }
+
+template< typename NodeType >
+void fgt_multiinput_multioutput_node_desc( const NodeType * /*node*/, const char * /*desc*/ ) { }
+
+template < typename PortsTuple, int N >
+struct fgt_internal_input_alias_helper {
+ static void alias_port( void * /*node*/, PortsTuple & /*ports*/ ) { }
+};
+
+template < typename PortsTuple, int N >
+struct fgt_internal_output_alias_helper {
+ static void alias_port( void * /*node*/, PortsTuple & /*ports*/ ) { }
+};
+
+#endif // TBB_USE_PROFILING_TOOLS
+
+} // d1
+} // namespace detail
+} // namespace tbb
+
+#endif // _FGT_GRAPH_TRACE_IMPL_H
diff --git a/contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_types_impl.h b/contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_types_impl.h
new file mode 100644
index 0000000000..97c770b154
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/detail/_flow_graph_types_impl.h
@@ -0,0 +1,407 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB__flow_graph_types_impl_H
+#define __TBB__flow_graph_types_impl_H
+
+#ifndef __TBB_flow_graph_H
+#error Do not #include this internal file directly; use public TBB headers instead.
+#endif
+
+// included in namespace tbb::detail::d1
+
+// the change to key_matching (adding a K and KHash template parameter, making it a class)
+// means we have to pass this data to the key_matching_port. All the ports have only one
+// template parameter, so we have to wrap the following types in a trait:
+//
+// . K == key_type
+// . KHash == hash and compare for Key
+// . TtoK == function_body that given an object of T, returns its K
+// . T == type accepted by port, and stored in the hash table
+//
+// The port will have an additional parameter on node construction, which is a function_body
+// that accepts a const T& and returns the K that serves as the key field of T.
+template<typename Kp, typename KHashp, typename Tp>
+struct KeyTrait {
+ typedef Kp K;
+ typedef Tp T;
+ typedef type_to_key_function_body<T,K> TtoK;
+ typedef KHashp KHash;
+};
+
+// wrap each element of a tuple in a template, and make a tuple of the result.
+template<int N, template<class> class PT, typename TypeTuple>
+struct wrap_tuple_elements;
+
+// A wrapper that generates the traits needed for each port of a key-matching join,
+// and the type of the tuple of input ports.
+template<int N, template<class> class PT, typename KeyTraits, typename TypeTuple>
+struct wrap_key_tuple_elements;
+
+template<int N, template<class> class PT, typename... Args>
+struct wrap_tuple_elements<N, PT, std::tuple<Args...> >{
+ typedef typename std::tuple<PT<Args>... > type;
+};
+
+template<int N, template<class> class PT, typename KeyTraits, typename... Args>
+struct wrap_key_tuple_elements<N, PT, KeyTraits, std::tuple<Args...> > {
+ typedef typename KeyTraits::key_type K;
+ typedef typename KeyTraits::hash_compare_type KHash;
+ typedef typename std::tuple<PT<KeyTrait<K, KHash, Args> >... > type;
+};
+
+template< int... S > class sequence {};
+
+template< int N, int... S >
+struct make_sequence : make_sequence < N - 1, N - 1, S... > {};
+
+template< int... S >
+struct make_sequence < 0, S... > {
+ typedef sequence<S...> type;
+};
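+
+// For example (editorial note): make_sequence<3>::type is sequence<0, 1, 2>, which is
+// typically used to expand a tuple into a parameter pack of its indices.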
+
+//! Type mimicking std::pair but with trailing fill to ensure each element of an array
+//! will have the correct alignment
+template<typename T1, typename T2, size_t REM>
+struct type_plus_align {
+ char first[sizeof(T1)];
+ T2 second;
+ char fill1[REM];
+};
+
+template<typename T1, typename T2>
+struct type_plus_align<T1,T2,0> {
+ char first[sizeof(T1)];
+ T2 second;
+};
+
+template<class U> struct alignment_of {
+ typedef struct { char t; U padded; } test_alignment;
+ static const size_t value = sizeof(test_alignment) - sizeof(U);
+};
+
+// T1, T2 are actual types stored. The space defined for T1 in the type returned
+// is a char array of the correct size. Type T2 should be trivially-constructible,
+// T1 must be explicitly managed.
+template<typename T1, typename T2>
+struct aligned_pair {
+ static const size_t t1_align = alignment_of<T1>::value;
+ static const size_t t2_align = alignment_of<T2>::value;
+ typedef type_plus_align<T1, T2, 0 > just_pair;
+ static const size_t max_align = t1_align < t2_align ? t2_align : t1_align;
+ static const size_t extra_bytes = sizeof(just_pair) % max_align;
+ static const size_t remainder = extra_bytes ? max_align - extra_bytes : 0;
+public:
+ typedef type_plus_align<T1,T2,remainder> type;
+}; // aligned_pair
+
+// support for variant type
+// type we use when we're not storing a value
+struct default_constructed { };
+
+// A type that contains another type, can be tested for which type it contains, and provides a reference to it.
+// Wrapper<T>
+// void CopyTo( void *newSpace) : builds a Wrapper<T> copy of itself in newSpace
+
+// struct to allow us to copy and test the type of objects
+struct WrapperBase {
+ virtual ~WrapperBase() {}
+ virtual void CopyTo(void* /*newSpace*/) const = 0;
+};
+
+// Wrapper<T> contains a T, with the ability to test what T is. The Wrapper<T> can be
+// constructed from a T, can be copy-constructed from another Wrapper<T>, and can be
+// examined via value(), but not modified.
+template<typename T>
+struct Wrapper: public WrapperBase {
+ typedef T value_type;
+ typedef T* pointer_type;
+private:
+ T value_space;
+public:
+ const value_type &value() const { return value_space; }
+
+private:
+ Wrapper();
+
+ // on exception will ensure the Wrapper will contain only a trivially-constructed object
+ struct _unwind_space {
+ pointer_type space;
+ _unwind_space(pointer_type p) : space(p) {}
+ ~_unwind_space() {
+ if(space) (void) new (space) Wrapper<default_constructed>(default_constructed());
+ }
+ };
+public:
+ explicit Wrapper( const T& other ) : value_space(other) { }
+ explicit Wrapper(const Wrapper& other) = delete;
+
+ void CopyTo(void* newSpace) const override {
+ _unwind_space guard((pointer_type)newSpace);
+ (void) new(newSpace) Wrapper(value_space);
+ guard.space = NULL;
+ }
+ ~Wrapper() { }
+};
+
+// specialization for array objects
+template<typename T, size_t N>
+struct Wrapper<T[N]> : public WrapperBase {
+ typedef T value_type;
+ typedef T* pointer_type;
+ // space must be untyped.
+ typedef T ArrayType[N];
+private:
+ // The space is not of type T[N] because when copy-constructing, it would be
+ // default-initialized and then copied to in some fashion, resulting in two
+ // constructions and one destruction per element. If the type is char[ ], we
+ // placement new into each element, resulting in one construction per element.
+ static const size_t space_size = sizeof(ArrayType) / sizeof(char);
+ char value_space[space_size];
+
+
+ // on exception will ensure the already-built objects will be destructed
+ // (the value_space is a char array, so it is already trivially-destructible.)
+ struct _unwind_class {
+ pointer_type space;
+ int already_built;
+ _unwind_class(pointer_type p) : space(p), already_built(0) {}
+ ~_unwind_class() {
+ if(space) {
+ for(size_t i = already_built; i > 0 ; --i ) space[i-1].~value_type();
+ (void) new(space) Wrapper<default_constructed>(default_constructed());
+ }
+ }
+ };
+public:
+ const ArrayType &value() const {
+ char *vp = const_cast<char *>(value_space);
+ return reinterpret_cast<ArrayType &>(*vp);
+ }
+
+private:
+ Wrapper();
+public:
+ // have to explicitly construct because other decays to a const value_type*
+ explicit Wrapper(const ArrayType& other) {
+ _unwind_class guard((pointer_type)value_space);
+ pointer_type vp = reinterpret_cast<pointer_type>(&value_space);
+ for(size_t i = 0; i < N; ++i ) {
+ (void) new(vp++) value_type(other[i]);
+ ++(guard.already_built);
+ }
+ guard.space = NULL;
+ }
+ explicit Wrapper(const Wrapper& other) : WrapperBase() {
+ // we have to do the heavy lifting to copy contents
+ _unwind_class guard((pointer_type)value_space);
+ pointer_type dp = reinterpret_cast<pointer_type>(value_space);
+ pointer_type sp = reinterpret_cast<pointer_type>(const_cast<char *>(other.value_space));
+ for(size_t i = 0; i < N; ++i, ++dp, ++sp) {
+ (void) new(dp) value_type(*sp);
+ ++(guard.already_built);
+ }
+ guard.space = NULL;
+ }
+
+ void CopyTo(void* newSpace) const override {
+ (void) new(newSpace) Wrapper(*this); // exceptions handled in copy constructor
+ }
+
+ ~Wrapper() {
+ // have to destroy explicitly in reverse order
+ pointer_type vp = reinterpret_cast<pointer_type>(&value_space);
+ for(size_t i = N; i > 0 ; --i ) vp[i-1].~value_type();
+ }
+};
+
+// Given a tuple, determine the element type with the strictest alignment requirement.
+// Given a tuple and that type, determine how many objects of that type are needed to
+// provide storage at least as large as the largest element of the tuple.
+
+template<bool, class T1, class T2> struct pick_one;
+template<class T1, class T2> struct pick_one<true , T1, T2> { typedef T1 type; };
+template<class T1, class T2> struct pick_one<false, T1, T2> { typedef T2 type; };
+
+template< template<class> class Selector, typename T1, typename T2 >
+struct pick_max {
+ typedef typename pick_one< (Selector<T1>::value > Selector<T2>::value), T1, T2 >::type type;
+};
+
+template<typename T> struct size_of { static const int value = sizeof(T); };
+
+template< size_t N, class Tuple, template<class> class Selector > struct pick_tuple_max {
+ typedef typename pick_tuple_max<N-1, Tuple, Selector>::type LeftMaxType;
+ typedef typename std::tuple_element<N-1, Tuple>::type ThisType;
+ typedef typename pick_max<Selector, LeftMaxType, ThisType>::type type;
+};
+
+template< class Tuple, template<class> class Selector > struct pick_tuple_max<0, Tuple, Selector> {
+ typedef typename std::tuple_element<0, Tuple>::type type;
+};
+
+// is the specified type included in a tuple?
+template<class Q, size_t N, class Tuple>
+struct is_element_of {
+ typedef typename std::tuple_element<N-1, Tuple>::type T_i;
+ static const bool value = std::is_same<Q,T_i>::value || is_element_of<Q,N-1,Tuple>::value;
+};
+
+template<class Q, class Tuple>
+struct is_element_of<Q,0,Tuple> {
+ typedef typename std::tuple_element<0, Tuple>::type T_i;
+ static const bool value = std::is_same<Q,T_i>::value;
+};
+
+// Allow construction only of the types listed in the tuple. If construction of a
+// disallowed type is attempted, a method involving the undefined type below is
+// instantiated, so a compile-time error is generated.
+template<typename T> struct ERROR_Type_Not_allowed_In_Tagged_Msg_Not_Member_Of_Tuple;
+
+template<typename T, bool BUILD_IT> struct do_if;
+template<typename T>
+struct do_if<T, true> {
+ static void construct(void *mySpace, const T& x) {
+ (void) new(mySpace) Wrapper<T>(x);
+ }
+};
+template<typename T>
+struct do_if<T, false> {
+ static void construct(void * /*mySpace*/, const T& x) {
+ // This method is instantiated when the type T does not match any of the
+ // element types in the Tuple in variant<Tuple>.
+ ERROR_Type_Not_allowed_In_Tagged_Msg_Not_Member_Of_Tuple<T>::bad_type(x);
+ }
+};
+
+// Tuple tells us the allowed types that variant can hold. It determines the alignment of the space in
+// Wrapper, and how big Wrapper is.
+//
+// The object can only be tested for its type, and a read-only reference can be fetched by cast_to<T>().
+
+using tbb::detail::punned_cast;
+struct tagged_null_type {};
+template<typename TagType, typename T0, typename T1=tagged_null_type, typename T2=tagged_null_type, typename T3=tagged_null_type,
+ typename T4=tagged_null_type, typename T5=tagged_null_type, typename T6=tagged_null_type,
+ typename T7=tagged_null_type, typename T8=tagged_null_type, typename T9=tagged_null_type>
+class tagged_msg {
+ typedef std::tuple<T0, T1, T2, T3, T4
+ //TODO: Should we reject lists longer than a tuple can hold?
+ #if __TBB_VARIADIC_MAX >= 6
+ , T5
+ #endif
+ #if __TBB_VARIADIC_MAX >= 7
+ , T6
+ #endif
+ #if __TBB_VARIADIC_MAX >= 8
+ , T7
+ #endif
+ #if __TBB_VARIADIC_MAX >= 9
+ , T8
+ #endif
+ #if __TBB_VARIADIC_MAX >= 10
+ , T9
+ #endif
+ > Tuple;
+
+private:
+ class variant {
+ static const size_t N = std::tuple_size<Tuple>::value;
+ typedef typename pick_tuple_max<N, Tuple, alignment_of>::type AlignType;
+ typedef typename pick_tuple_max<N, Tuple, size_of>::type MaxSizeType;
+ static const size_t MaxNBytes = (sizeof(Wrapper<MaxSizeType>)+sizeof(AlignType)-1);
+ static const size_t MaxNElements = MaxNBytes/sizeof(AlignType);
+ typedef aligned_space<AlignType, MaxNElements> SpaceType;
+ SpaceType my_space;
+ static const size_t MaxSize = sizeof(SpaceType);
+
+ public:
+ variant() { (void) new(&my_space) Wrapper<default_constructed>(default_constructed()); }
+
+ template<typename T>
+ variant( const T& x ) {
+ do_if<T, is_element_of<T, N, Tuple>::value>::construct(&my_space,x);
+ }
+
+ variant(const variant& other) {
+ const WrapperBase * h = punned_cast<const WrapperBase *>(&(other.my_space));
+ h->CopyTo(&my_space);
+ }
+
+ // assignment must destroy and re-create the Wrapper type, as there is no way
+ // to create a Wrapper-to-Wrapper assign even if we find they agree in type.
+ void operator=( const variant& rhs ) {
+ if(&rhs != this) {
+ WrapperBase *h = punned_cast<WrapperBase *>(&my_space);
+ h->~WrapperBase();
+ const WrapperBase *ch = punned_cast<const WrapperBase *>(&(rhs.my_space));
+ ch->CopyTo(&my_space);
+ }
+ }
+
+ template<typename U>
+ const U& variant_cast_to() const {
+ const Wrapper<U> *h = dynamic_cast<const Wrapper<U>*>(punned_cast<const WrapperBase *>(&my_space));
+ if(!h) {
+ throw_exception(exception_id::bad_tagged_msg_cast);
+ }
+ return h->value();
+ }
+ template<typename U>
+ bool variant_is_a() const { return dynamic_cast<const Wrapper<U>*>(punned_cast<const WrapperBase *>(&my_space)) != NULL; }
+
+ bool variant_is_default_constructed() const {return variant_is_a<default_constructed>();}
+
+ ~variant() {
+ WrapperBase *h = punned_cast<WrapperBase *>(&my_space);
+ h->~WrapperBase();
+ }
+ }; //class variant
+
+ TagType my_tag;
+ variant my_msg;
+
+public:
+ tagged_msg(): my_tag(TagType(~0)), my_msg(){}
+
+ template<typename T, typename R>
+ tagged_msg(T const &index, R const &value) : my_tag(index), my_msg(value) {}
+
+ template<typename T, typename R, size_t N>
+ tagged_msg(T const &index, R (&value)[N]) : my_tag(index), my_msg(value) {}
+
+ void set_tag(TagType const &index) {my_tag = index;}
+ TagType tag() const {return my_tag;}
+
+ template<typename V>
+ const V& cast_to() const {return my_msg.template variant_cast_to<V>();}
+
+ template<typename V>
+ bool is_a() const {return my_msg.template variant_is_a<V>();}
+
+ bool is_default_constructed() const {return my_msg.variant_is_default_constructed();}
+}; //class tagged_msg
+
+// template to simplify cast and test for tagged_msg in template contexts
+template<typename V, typename T>
+const V& cast_to(T const &t) { return t.template cast_to<V>(); }
+
+template<typename V, typename T>
+bool is_a(T const &t) { return t.template is_a<V>(); }
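+
+// Illustrative usage sketch (editorial note, not part of the upstream header).
+// tagged_msg is used as the message type emitted by indexer_node; the helpers above
+// let generic code query and extract the stored value, e.g.:
+//
+//     tagged_msg<size_t, int, double> m(size_t(0), 42);
+//     if (is_a<int>(m)) {
+//         int v = cast_to<int>(m);   // v == 42, m.tag() == 0
+//     }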
+
+enum op_stat { WAIT = 0, SUCCEEDED, FAILED };
+
+#endif /* __TBB__flow_graph_types_impl_H */
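For orientation, a minimal usage sketch of the tagged_msg helpers defined above (illustrative only; assumes the enclosing tbb::detail flow-graph namespace is in scope and is not part of the patch):

    using msg_t = tagged_msg<std::size_t, int, double>;

    msg_t m(std::size_t(0), 42);      // tag 0, payload of type int
    if (is_a<int>(m)) {               // free-function form of m.is_a<int>()
        int v = cast_to<int>(m);      // throws bad_tagged_msg_cast on a type mismatch
        (void)v;
    }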
diff --git a/contrib/libs/tbb/include/oneapi/tbb/detail/_hash_compare.h b/contrib/libs/tbb/include/oneapi/tbb/detail/_hash_compare.h
new file mode 100644
index 0000000000..20cbd96c06
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/detail/_hash_compare.h
@@ -0,0 +1,127 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_detail__hash_compare_H
+#define __TBB_detail__hash_compare_H
+
+#include <functional>
+
+#include "_containers_helpers.h"
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+template <typename Key, typename Hash, typename KeyEqual>
+class hash_compare {
+ using is_transparent_hash = has_transparent_key_equal<Key, Hash, KeyEqual>;
+public:
+ using hasher = Hash;
+ using key_equal = typename is_transparent_hash::type;
+
+ hash_compare() = default;
+ hash_compare( hasher hash, key_equal equal ) : my_hasher(hash), my_equal(equal) {}
+
+ std::size_t operator()( const Key& key ) const {
+ return std::size_t(my_hasher(key));
+ }
+
+ bool operator()( const Key& key1, const Key& key2 ) const {
+ return my_equal(key1, key2);
+ }
+
+ template <typename K, typename = typename std::enable_if<is_transparent_hash::value, K>::type>
+ std::size_t operator()( const K& key ) const {
+ return std::size_t(my_hasher(key));
+ }
+
+ template <typename K1, typename K2, typename = typename std::enable_if<is_transparent_hash::value, K1>::type>
+ bool operator()( const K1& key1, const K2& key2 ) const {
+ return my_equal(key1, key2);
+ }
+
+ hasher hash_function() const {
+ return my_hasher;
+ }
+
+ key_equal key_eq() const {
+ return my_equal;
+ }
+
+
+private:
+ hasher my_hasher;
+ key_equal my_equal;
+}; // class hash_compare
+
+//! hash_compare that is the default argument for concurrent_hash_map
+template <typename Key>
+class tbb_hash_compare {
+public:
+ std::size_t hash( const Key& a ) const { return my_hash_func(a); }
+ bool equal( const Key& a, const Key& b ) const { return my_key_equal(a, b); }
+private:
+ std::hash<Key> my_hash_func;
+ std::equal_to<Key> my_key_equal;
+};
+
+} // namespace d1
+} // namespace detail
+} // namespace tbb
+
+#if TBB_DEFINE_STD_HASH_SPECIALIZATIONS
+
+namespace std {
+
+template <typename T, typename U>
+struct hash<std::pair<T, U>> {
+public:
+ std::size_t operator()( const std::pair<T, U>& p ) const {
+ return first_hash(p.first) ^ second_hash(p.second);
+ }
+
+private:
+ std::hash<T> first_hash;
+ std::hash<U> second_hash;
+}; // struct hash<std::pair>
+
+// Apple clang and MSVC define their own specializations for std::hash<std::basic_string<T, Traits, Alloc>>
+#if !(_LIBCPP_VERSION) && !(_CPPLIB_VER)
+
+template <typename CharT, typename Traits, typename Allocator>
+struct hash<std::basic_string<CharT, Traits, Allocator>> {
+public:
+ std::size_t operator()( const std::basic_string<CharT, Traits, Allocator>& s ) const {
+ std::size_t h = 0;
+ for ( const CharT* c = s.c_str(); *c; ++c ) {
+ h = h * hash_multiplier ^ char_hash(*c);
+ }
+ return h;
+ }
+
+private:
+ static constexpr std::size_t hash_multiplier = tbb::detail::select_size_t_constant<2654435769U, 11400714819323198485ULL>::value;
+
+ std::hash<CharT> char_hash;
+}; // struct hash<std::basic_string>
+
+#endif // !(_LIBCPP_VERSION || _CPPLIB_VER)
+
+} // namespace std
+
+#endif // TBB_DEFINE_STD_HASH_SPECIALIZATIONS
+
+#endif // __TBB_detail__hash_compare_H
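As a usage sketch (illustrative, not part of the patch; the map type comes from concurrent_hash_map.h elsewhere in this change), a user-provided HashCompare with the hash()/equal() shape that tbb_hash_compare models by default:

    #include <string>

    struct string_hash_compare {
        std::size_t hash(const std::string& s) const { return std::hash<std::string>{}(s); }
        bool equal(const std::string& a, const std::string& b) const { return a == b; }
    };

    // tbb::concurrent_hash_map<std::string, int, string_hash_compare> table;
    // ...or omit the third template argument to get tbb_hash_compare<std::string> as defined above.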
diff --git a/contrib/libs/tbb/include/oneapi/tbb/detail/_machine.h b/contrib/libs/tbb/include/oneapi/tbb/detail/_machine.h
new file mode 100644
index 0000000000..3270da786a
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/detail/_machine.h
@@ -0,0 +1,366 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_detail__machine_H
+#define __TBB_detail__machine_H
+
+#include "_config.h"
+#include "_assert.h"
+
+#include <atomic>
+#include <climits>
+#include <cstdint>
+#include <cstddef>
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#pragma intrinsic(__rdtsc)
+#endif
+#if __TBB_x86_64 || __TBB_x86_32
+#include <immintrin.h> // _mm_pause
+#endif
+#if (_WIN32 || _WIN64)
+#include <float.h> // _control87
+#endif
+
+#if __TBB_GLIBCXX_THIS_THREAD_YIELD_BROKEN
+#include <sched.h> // sched_yield
+#else
+#include <thread> // std::this_thread::yield()
+#endif
+
+namespace tbb {
+namespace detail {
+inline namespace d0 {
+
+//--------------------------------------------------------------------------------------------------
+// Yield implementation
+//--------------------------------------------------------------------------------------------------
+
+#if __TBB_GLIBCXX_THIS_THREAD_YIELD_BROKEN
+static inline void yield() {
+ int err = sched_yield();
+ __TBB_ASSERT_EX(err == 0, "sched_yield has failed");
+}
+#else
+using std::this_thread::yield;
+#endif
+
+//--------------------------------------------------------------------------------------------------
+// atomic_fence implementation
+//--------------------------------------------------------------------------------------------------
+
+#if (_WIN32 || _WIN64)
+#pragma intrinsic(_mm_mfence)
+#endif
+
+static inline void atomic_fence(std::memory_order order) {
+#if (_WIN32 || _WIN64)
+ if (order == std::memory_order_seq_cst ||
+ order == std::memory_order_acq_rel ||
+ order == std::memory_order_acquire ||
+ order == std::memory_order_release )
+ {
+ _mm_mfence();
+ return;
+ }
+#endif /*(_WIN32 || _WIN64)*/
+ std::atomic_thread_fence(order);
+}
+
+//--------------------------------------------------------------------------------------------------
+// Pause implementation
+//--------------------------------------------------------------------------------------------------
+
+static inline void machine_pause(int32_t delay) {
+#if __TBB_x86_64 || __TBB_x86_32
+ while (delay-- > 0) { _mm_pause(); }
+#elif __ARM_ARCH_7A__ || __aarch64__
+ while (delay-- > 0) { __asm__ __volatile__("yield" ::: "memory"); }
+#else /* Generic */
+ (void)delay; // suppress the unused-parameter warning without including _template_helpers.h
+ yield();
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// tbb::detail::log2() implementation
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// TODO: Use std::bit_width() (the renamed log2p1()) once the C++20 standard library can be relied upon
+
+#if defined(__GNUC__) || defined(__clang__)
+namespace gnu_builtins {
+ inline uintptr_t clz(unsigned int x) { return __builtin_clz(x); }
+ inline uintptr_t clz(unsigned long int x) { return __builtin_clzl(x); }
+ inline uintptr_t clz(unsigned long long int x) { return __builtin_clzll(x); }
+}
+#elif defined(_MSC_VER)
+#pragma intrinsic(__TBB_W(_BitScanReverse))
+namespace msvc_intrinsics {
+ static inline uintptr_t bit_scan_reverse(uintptr_t i) {
+ unsigned long j;
+ __TBB_W(_BitScanReverse)( &j, i );
+ return j;
+ }
+}
+#endif
+
+template <typename T>
+constexpr std::uintptr_t number_of_bits() {
+ return sizeof(T) * CHAR_BIT;
+}
+
+// logarithm is the index of the most significant non-zero bit
+static inline uintptr_t machine_log2(uintptr_t x) {
+#if defined(__GNUC__) || defined(__clang__)
+ // If P is a power of 2 and x<P, then (P-1)-x == (P-1) XOR x
+ return (number_of_bits<decltype(x)>() - 1) ^ gnu_builtins::clz(x);
+#elif defined(_MSC_VER)
+ return msvc_intrinsics::bit_scan_reverse(x);
+#elif __i386__ || __i386 /*for Sun OS*/ || __MINGW32__
+ uintptr_t j, i = x;
+ __asm__("bsr %1,%0" : "=r"(j) : "r"(i));
+ return j;
+#elif __powerpc__ || __POWERPC__
+ #if __TBB_WORDSIZE==8
+ __asm__ __volatile__ ("cntlzd %0,%0" : "+r"(x));
+ return 63 - static_cast<intptr_t>(x);
+ #else
+ __asm__ __volatile__ ("cntlzw %0,%0" : "+r"(x));
+ return 31 - static_cast<intptr_t>(x);
+ #endif /*__TBB_WORDSIZE*/
+#elif __sparc
+ uint64_t count;
+ // smear the highest set bit into all lower positions
+ x |= (x >> 1);
+ x |= (x >> 2);
+ x |= (x >> 4);
+ x |= (x >> 8);
+ x |= (x >> 16);
+ x |= (x >> 32);
+ // count 1's
+ __asm__ ("popc %1, %0" : "=r"(count) : "r"(x) );
+ return count - 1;
+#else
+ intptr_t result = 0;
+
+ // Note: a declaration is not allowed inside &&; the split shift also keeps the shift count valid on 32-bit.
+ if( sizeof(x) > 4 ) { if( uintptr_t tmp = (x >> 16) >> 16 ) { x = tmp; result += 32; } }
+ if( uintptr_t tmp = x >> 16 ) { x = tmp; result += 16; }
+ if( uintptr_t tmp = x >> 8 ) { x = tmp; result += 8; }
+ if( uintptr_t tmp = x >> 4 ) { x = tmp; result += 4; }
+ if( uintptr_t tmp = x >> 2 ) { x = tmp; result += 2; }
+
+ return (x & 2) ? result + 1 : result;
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// tbb::detail::reverse_bits() implementation
+////////////////////////////////////////////////////////////////////////////////////////////////////
+#if TBB_USE_CLANG_BITREVERSE_BUILTINS
+namespace llvm_builtins {
+ inline uint8_t builtin_bitreverse(uint8_t x) { return __builtin_bitreverse8 (x); }
+ inline uint16_t builtin_bitreverse(uint16_t x) { return __builtin_bitreverse16(x); }
+ inline uint32_t builtin_bitreverse(uint32_t x) { return __builtin_bitreverse32(x); }
+ inline uint64_t builtin_bitreverse(uint64_t x) { return __builtin_bitreverse64(x); }
+}
+#else // generic
+template<typename T>
+struct reverse {
+ static const T byte_table[256];
+};
+
+template<typename T>
+const T reverse<T>::byte_table[256] = {
+ 0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0,
+ 0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8,
+ 0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4,
+ 0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC,
+ 0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2,
+ 0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA,
+ 0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6,
+ 0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE,
+ 0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1,
+ 0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9,
+ 0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5,
+ 0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD,
+ 0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3,
+ 0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB,
+ 0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7,
+ 0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF
+};
+
+inline unsigned char reverse_byte(unsigned char src) {
+ return reverse<unsigned char>::byte_table[src];
+}
+#endif // TBB_USE_CLANG_BITREVERSE_BUILTINS
+
+template<typename T>
+T machine_reverse_bits(T src) {
+#if TBB_USE_CLANG_BITREVERSE_BUILTINS
+ return builtin_bitreverse(fixed_width_cast(src));
+#else /* Generic */
+ T dst;
+ unsigned char *original = (unsigned char *) &src;
+ unsigned char *reversed = (unsigned char *) &dst;
+
+ for ( int i = sizeof(T) - 1; i >= 0; i-- ) {
+ reversed[i] = reverse_byte( original[sizeof(T) - i - 1] );
+ }
+
+ return dst;
+#endif // TBB_USE_CLANG_BITREVERSE_BUILTINS
+}
+
+} // inline namespace d0
+
+namespace d1 {
+
+#if (_WIN32 || _WIN64)
+// API to retrieve/update FPU control setting
+#define __TBB_CPU_CTL_ENV_PRESENT 1
+struct cpu_ctl_env {
+ unsigned int x87cw{};
+#if (__TBB_x86_64)
+ // Changing the infinity mode or the floating-point precision is not supported on x64.
+ // The attempt causes an assertion. See
+ // https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/control87-controlfp-control87-2
+ static constexpr unsigned int X87CW_CONTROL_MASK = _MCW_DN | _MCW_EM | _MCW_RC;
+#else
+ static constexpr unsigned int X87CW_CONTROL_MASK = ~0U;
+#endif
+#if (__TBB_x86_32 || __TBB_x86_64)
+ unsigned int mxcsr{};
+ static constexpr unsigned int MXCSR_CONTROL_MASK = ~0x3fu; /* all except last six status bits */
+#endif
+
+ bool operator!=( const cpu_ctl_env& ctl ) const {
+ return
+#if (__TBB_x86_32 || __TBB_x86_64)
+ mxcsr != ctl.mxcsr ||
+#endif
+ x87cw != ctl.x87cw;
+ }
+ void get_env() {
+ x87cw = _control87(0, 0);
+#if (__TBB_x86_32 || __TBB_x86_64)
+ mxcsr = _mm_getcsr();
+#endif
+ }
+ void set_env() const {
+ _control87(x87cw, X87CW_CONTROL_MASK);
+#if (__TBB_x86_32 || __TBB_x86_64)
+ _mm_setcsr(mxcsr & MXCSR_CONTROL_MASK);
+#endif
+ }
+};
+#elif (__TBB_x86_32 || __TBB_x86_64)
+// API to retrieve/update FPU control setting
+#define __TBB_CPU_CTL_ENV_PRESENT 1
+struct cpu_ctl_env {
+ int mxcsr{};
+ short x87cw{};
+ static const int MXCSR_CONTROL_MASK = ~0x3f; /* all except last six status bits */
+
+ bool operator!=(const cpu_ctl_env& ctl) const {
+ return mxcsr != ctl.mxcsr || x87cw != ctl.x87cw;
+ }
+ void get_env() {
+ __asm__ __volatile__(
+ "stmxcsr %0\n\t"
+ "fstcw %1"
+ : "=m"(mxcsr), "=m"(x87cw)
+ );
+ mxcsr &= MXCSR_CONTROL_MASK;
+ }
+ void set_env() const {
+ __asm__ __volatile__(
+ "ldmxcsr %0\n\t"
+ "fldcw %1"
+ : : "m"(mxcsr), "m"(x87cw)
+ );
+ }
+};
+#endif
+
+} // namespace d1
+
+} // namespace detail
+} // namespace tbb
+
+#if !__TBB_CPU_CTL_ENV_PRESENT
+#include <fenv.h>
+
+#include <cstring>
+
+namespace tbb {
+namespace detail {
+
+namespace r1 {
+void* __TBB_EXPORTED_FUNC cache_aligned_allocate(std::size_t size);
+void __TBB_EXPORTED_FUNC cache_aligned_deallocate(void* p);
+} // namespace r1
+
+namespace d1 {
+
+class cpu_ctl_env {
+ fenv_t *my_fenv_ptr;
+public:
+ cpu_ctl_env() : my_fenv_ptr(NULL) {}
+ ~cpu_ctl_env() {
+ if ( my_fenv_ptr )
+ r1::cache_aligned_deallocate( (void*)my_fenv_ptr );
+ }
+ // It would be possible to copy just the pointer instead of the memory, but the following issues would need to be addressed:
+ // 1. The arena lifetime and the context lifetime are independent;
+ // 2. The user is allowed to recapture different FPU settings into the context, so the 'current FPU settings' inside the
+ // dispatch loop may become invalid.
+ // But is improving the fenv-based implementation really worthwhile? It seems better to replace it
+ // with a platform-specific implementation.
+ cpu_ctl_env( const cpu_ctl_env &src ) : my_fenv_ptr(NULL) {
+ *this = src;
+ }
+ cpu_ctl_env& operator=( const cpu_ctl_env &src ) {
+ __TBB_ASSERT( src.my_fenv_ptr, NULL );
+ if ( !my_fenv_ptr )
+ my_fenv_ptr = (fenv_t*)r1::cache_aligned_allocate(sizeof(fenv_t));
+ *my_fenv_ptr = *src.my_fenv_ptr;
+ return *this;
+ }
+ bool operator!=( const cpu_ctl_env &ctl ) const {
+ __TBB_ASSERT( my_fenv_ptr, "cpu_ctl_env is not initialized." );
+ __TBB_ASSERT( ctl.my_fenv_ptr, "cpu_ctl_env is not initialized." );
+ return std::memcmp( (void*)my_fenv_ptr, (void*)ctl.my_fenv_ptr, sizeof(fenv_t) );
+ }
+ void get_env () {
+ if ( !my_fenv_ptr )
+ my_fenv_ptr = (fenv_t*)r1::cache_aligned_allocate(sizeof(fenv_t));
+ fegetenv( my_fenv_ptr );
+ }
+ const cpu_ctl_env& set_env () const {
+ __TBB_ASSERT( my_fenv_ptr, "cpu_ctl_env is not initialized." );
+ fesetenv( my_fenv_ptr );
+ return *this;
+ }
+};
+
+} // namespace d1
+} // namespace detail
+} // namespace tbb
+
+#endif /* !__TBB_CPU_CTL_ENV_PRESENT */
+
+#endif // __TBB_detail__machine_H
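A small sanity sketch of the bit helpers above (illustrative, not part of the patch; assumes the generic code paths and that machine_log2 is never called with zero):

    #include <cassert>

    inline void machine_helpers_smoke_test() {
        using namespace tbb::detail;
        assert(machine_log2(1u)   == 0);   // index of the most significant set bit
        assert(machine_log2(8u)   == 3);
        assert(machine_log2(255u) == 7);
        assert(machine_reverse_bits<unsigned char>(0x01) == 0x80);  // bit pattern mirrored
    }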
diff --git a/contrib/libs/tbb/include/oneapi/tbb/detail/_namespace_injection.h b/contrib/libs/tbb/include/oneapi/tbb/detail/_namespace_injection.h
new file mode 100644
index 0000000000..2e1df30931
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/detail/_namespace_injection.h
@@ -0,0 +1,24 @@
+/*
+ Copyright (c) 2020-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+// All public entities of the oneAPI Spec are available under the oneapi namespace
+
+// Define tbb namespace first as it might not be known yet
+namespace tbb {}
+
+namespace oneapi {
+namespace tbb = ::tbb;
+}
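The effect of the alias above, for reference (illustrative; both spellings name the very same entities, not copies):

    // ::tbb::parallel_for(...)          // classic namespace
    // ::oneapi::tbb::parallel_for(...)  // oneAPI-spec spelling; resolves to the same function via the alias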
diff --git a/contrib/libs/tbb/include/oneapi/tbb/detail/_node_handle.h b/contrib/libs/tbb/include/oneapi/tbb/detail/_node_handle.h
new file mode 100644
index 0000000000..265be07555
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/detail/_node_handle.h
@@ -0,0 +1,162 @@
+/*
+ Copyright (c) 2019-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_detail__node_handle_H
+#define __TBB_detail__node_handle_H
+
+#include "_allocator_traits.h"
+#include "_assert.h"
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+// A structure to access private node handle methods in internal TBB classes
+// A regular friend declaration is not convenient because the classes that use node handles
+// can be placed in different versioning namespaces.
+struct node_handle_accessor {
+ template <typename NodeHandleType>
+ static typename NodeHandleType::node* get_node_ptr( NodeHandleType& nh ) {
+ return nh.get_node_ptr();
+ }
+
+ template <typename NodeHandleType>
+ static NodeHandleType construct( typename NodeHandleType::node* node_ptr ) {
+ return NodeHandleType{node_ptr};
+ }
+
+ template <typename NodeHandleType>
+ static void deactivate( NodeHandleType& nh ) {
+ nh.deactivate();
+ }
+}; // struct node_handle_accessor
+
+template<typename Value, typename Node, typename Allocator>
+class node_handle_base {
+public:
+ using allocator_type = Allocator;
+protected:
+ using node = Node;
+ using allocator_traits_type = tbb::detail::allocator_traits<allocator_type>;
+public:
+
+ node_handle_base() : my_node(nullptr), my_allocator() {}
+ node_handle_base(node_handle_base&& nh) : my_node(nh.my_node),
+ my_allocator(std::move(nh.my_allocator)) {
+ nh.my_node = nullptr;
+ }
+
+ __TBB_nodiscard bool empty() const { return my_node == nullptr; }
+ explicit operator bool() const { return my_node != nullptr; }
+
+ ~node_handle_base() { internal_destroy(); }
+
+ node_handle_base& operator=( node_handle_base&& nh ) {
+ internal_destroy();
+ my_node = nh.my_node;
+ move_assign_allocators(my_allocator, nh.my_allocator);
+ nh.deactivate();
+ return *this;
+ }
+
+ void swap( node_handle_base& nh ) {
+ using std::swap;
+ swap(my_node, nh.my_node);
+ swap_allocators(my_allocator, nh.my_allocator);
+ }
+
+ allocator_type get_allocator() const {
+ return my_allocator;
+ }
+
+protected:
+ node_handle_base( node* n ) : my_node(n) {}
+
+ void internal_destroy() {
+ if(my_node != nullptr) {
+ allocator_traits_type::destroy(my_allocator, my_node->storage());
+ typename allocator_traits_type::template rebind_alloc<node> node_allocator(my_allocator);
+ node_allocator.deallocate(my_node, 1);
+ }
+ }
+
+ node* get_node_ptr() { return my_node; }
+
+ void deactivate() { my_node = nullptr; }
+
+ node* my_node;
+ allocator_type my_allocator;
+};
+
+// node handle for maps
+template<typename Key, typename Value, typename Node, typename Allocator>
+class node_handle : public node_handle_base<Value, Node, Allocator> {
+ using base_type = node_handle_base<Value, Node, Allocator>;
+public:
+ using key_type = Key;
+ using mapped_type = typename Value::second_type;
+ using allocator_type = typename base_type::allocator_type;
+
+ node_handle() = default;
+
+ key_type& key() const {
+ __TBB_ASSERT(!this->empty(), "Cannot get key from the empty node_type object");
+ return *const_cast<key_type*>(&(this->my_node->value().first));
+ }
+
+ mapped_type& mapped() const {
+ __TBB_ASSERT(!this->empty(), "Cannot get mapped value from the empty node_type object");
+ return this->my_node->value().second;
+ }
+
+private:
+ friend struct node_handle_accessor;
+
+ node_handle( typename base_type::node* n ) : base_type(n) {}
+}; // class node_handle
+
+// node handle for sets
+template<typename Key, typename Node, typename Allocator>
+class node_handle<Key, Key, Node, Allocator> : public node_handle_base<Key, Node, Allocator> {
+ using base_type = node_handle_base<Key, Node, Allocator>;
+public:
+ using value_type = Key;
+ using allocator_type = typename base_type::allocator_type;
+
+ node_handle() = default;
+
+ value_type& value() const {
+ __TBB_ASSERT(!this->empty(), "Cannot get value from the empty node_type object");
+ return *const_cast<value_type*>(&(this->my_node->value()));
+ }
+
+private:
+ friend struct node_handle_accessor;
+
+ node_handle( typename base_type::node* n ) : base_type(n) {}
+}; // class node_handle
+
+template <typename Key, typename Value, typename Node, typename Allocator>
+void swap( node_handle<Key, Value, Node, Allocator>& lhs,
+ node_handle<Key, Value, Node, Allocator>& rhs ) {
+ return lhs.swap(rhs);
+}
+
+} // namespace d1
+} // namespace detail
+} // namespace tbb
+
+#endif // __TBB_detail__node_handle_H
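A usage sketch for the node handle machinery above (illustrative only; assumes tbb::concurrent_map from this same change set together with its unsafe_extract and insert(node_type&&) overloads):

    #include <string>

    void rekey_example(tbb::concurrent_map<int, std::string>& m) {
        auto nh = m.unsafe_extract(1);    // node_type is a d1::node_handle specialization
        if (!nh.empty()) {
            nh.mapped() += " (updated)";  // mutate the detached node in place
            m.insert(std::move(nh));      // re-insert without copying the element
        }
    }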
diff --git a/contrib/libs/tbb/include/oneapi/tbb/detail/_pipeline_filters.h b/contrib/libs/tbb/include/oneapi/tbb/detail/_pipeline_filters.h
new file mode 100644
index 0000000000..95a4d3dc96
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/detail/_pipeline_filters.h
@@ -0,0 +1,453 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_parallel_filters_H
+#define __TBB_parallel_filters_H
+
+#include "_config.h"
+#include "_task.h"
+#include "_pipeline_filters_deduction.h"
+#include "../tbb_allocator.h"
+
+#include <cstddef>
+#include <cstdint>
+
+namespace tbb {
+namespace detail {
+
+namespace d1 {
+class base_filter;
+}
+
+namespace r1 {
+void __TBB_EXPORTED_FUNC set_end_of_input(d1::base_filter&);
+class pipeline;
+class stage_task;
+class input_buffer;
+}
+
+namespace d1 {
+class filter_node;
+
+//! A stage in a pipeline.
+/** @ingroup algorithms */
+class base_filter{
+private:
+ //! Value used to mark "not in pipeline"
+ static base_filter* not_in_pipeline() { return reinterpret_cast<base_filter*>(std::intptr_t(-1)); }
+public:
+ //! The lowest bit 0 is for parallel vs serial
+ static constexpr unsigned int filter_is_serial = 0x1;
+
+ //! 2nd bit distinguishes ordered vs unordered filters.
+ static constexpr unsigned int filter_is_out_of_order = 0x1<<1;
+
+ //! 3rd bit marks input filters that may emit a null token (e.g. those emitting small objects by value)
+ static constexpr unsigned int filter_may_emit_null = 0x1<<2;
+
+ base_filter(const base_filter&) = delete;
+ base_filter& operator=(const base_filter&) = delete;
+
+protected:
+ explicit base_filter( unsigned int m ) :
+ next_filter_in_pipeline(not_in_pipeline()),
+ my_input_buffer(nullptr),
+ my_filter_mode(m),
+ my_pipeline(nullptr)
+ {}
+
+ // signal end-of-input for concrete_filters
+ void set_end_of_input() {
+ r1::set_end_of_input(*this);
+ }
+
+public:
+ //! True if filter is serial.
+ bool is_serial() const {
+ return bool( my_filter_mode & filter_is_serial );
+ }
+
+ //! True if filter must receive stream in order.
+ bool is_ordered() const {
+ return (my_filter_mode & filter_is_serial) && !(my_filter_mode & filter_is_out_of_order);
+ }
+
+ //! true if an input filter can emit null
+ bool object_may_be_null() {
+ return ( my_filter_mode & filter_may_emit_null ) == filter_may_emit_null;
+ }
+
+ //! Operate on an item from the input stream, and return item for output stream.
+ /** Returns nullptr if filter is a sink. */
+ virtual void* operator()( void* item ) = 0;
+
+ //! Destroy filter.
+ virtual ~base_filter() {};
+
+ //! Destroys item if pipeline was cancelled.
+ /** Required to prevent memory leaks.
+ Note it can be called concurrently even for serial filters.*/
+ virtual void finalize( void* /*item*/ ) {}
+
+private:
+ //! Pointer to next filter in the pipeline.
+ base_filter* next_filter_in_pipeline;
+
+ //! Buffer for incoming tokens, or nullptr if not required.
+ /** The buffer is required if the filter is serial. */
+ r1::input_buffer* my_input_buffer;
+
+ friend class r1::stage_task;
+ friend class r1::pipeline;
+ friend void r1::set_end_of_input(d1::base_filter&);
+
+ //! Storage for filter mode and dynamically checked implementation version.
+ const unsigned int my_filter_mode;
+
+ //! Pointer to the pipeline.
+ r1::pipeline* my_pipeline;
+};
+
+template<typename Body, typename InputType, typename OutputType >
+class concrete_filter;
+
+//! input_filter control to signal end-of-input for parallel_pipeline
+class flow_control {
+ bool is_pipeline_stopped = false;
+ flow_control() = default;
+ template<typename Body, typename InputType, typename OutputType > friend class concrete_filter;
+ template<typename Output> friend class input_node;
+public:
+ void stop() { is_pipeline_stopped = true; }
+};
+
+// Emulate std::is_trivially_copyable (false positives not allowed, false negatives suboptimal but safe).
+#if __TBB_CPP11_TYPE_PROPERTIES_PRESENT
+template<typename T> using tbb_trivially_copyable = std::is_trivially_copyable<T>;
+#else
+template<typename T> struct tbb_trivially_copyable { enum { value = false }; };
+template<typename T> struct tbb_trivially_copyable < T* > { enum { value = true }; };
+template<> struct tbb_trivially_copyable < bool > { enum { value = true }; };
+template<> struct tbb_trivially_copyable < char > { enum { value = true }; };
+template<> struct tbb_trivially_copyable < signed char > { enum { value = true }; };
+template<> struct tbb_trivially_copyable <unsigned char > { enum { value = true }; };
+template<> struct tbb_trivially_copyable < short > { enum { value = true }; };
+template<> struct tbb_trivially_copyable <unsigned short > { enum { value = true }; };
+template<> struct tbb_trivially_copyable < int > { enum { value = true }; };
+template<> struct tbb_trivially_copyable <unsigned int > { enum { value = true }; };
+template<> struct tbb_trivially_copyable < long > { enum { value = true }; };
+template<> struct tbb_trivially_copyable <unsigned long > { enum { value = true }; };
+template<> struct tbb_trivially_copyable < long long> { enum { value = true }; };
+template<> struct tbb_trivially_copyable <unsigned long long> { enum { value = true }; };
+template<> struct tbb_trivially_copyable < float > { enum { value = true }; };
+template<> struct tbb_trivially_copyable < double > { enum { value = true }; };
+template<> struct tbb_trivially_copyable < long double > { enum { value = true }; };
+#endif // __TBB_CPP11_TYPE_PROPERTIES_PRESENT
+
+template<typename T>
+struct use_allocator {
+ static constexpr bool value = sizeof(T) > sizeof(void *) || !tbb_trivially_copyable<T>::value;
+};
+
+// A helper class to customize how a type is passed between filters.
+// Usage: token_helper<T, use_allocator<T>::value>
+template<typename T, bool Allocate> struct token_helper;
+
+// using tbb_allocator
+template<typename T>
+struct token_helper<T, true> {
+ using pointer = T*;
+ using value_type = T;
+ static pointer create_token(value_type && source) {
+ return new (r1::allocate_memory(sizeof(T))) T(std::move(source));
+ }
+ static value_type & token(pointer & t) { return *t; }
+ static void * cast_to_void_ptr(pointer ref) { return reinterpret_cast<void *>(ref); }
+ static pointer cast_from_void_ptr(void * ref) { return reinterpret_cast<pointer>(ref); }
+ static void destroy_token(pointer token) {
+ token->~value_type();
+ r1::deallocate_memory(token);
+ }
+};
+
+// pointer specialization
+template<typename T>
+struct token_helper<T*, false> {
+ using pointer = T*;
+ using value_type = T*;
+ static pointer create_token(const value_type & source) { return source; }
+ static value_type & token(pointer & t) { return t; }
+ static void * cast_to_void_ptr(pointer ref) { return reinterpret_cast<void *>(ref); }
+ static pointer cast_from_void_ptr(void * ref) { return reinterpret_cast<pointer>(ref); }
+ static void destroy_token( pointer /*token*/) {}
+};
+
+// converting type to and from void*, passing objects directly
+template<typename T>
+struct token_helper<T, false> {
+ typedef union {
+ T actual_value;
+ void * void_overlay;
+ } type_to_void_ptr_map;
+ using pointer = T; // not really a pointer in this case.
+ using value_type = T;
+ static pointer create_token(const value_type & source) { return source; }
+ static value_type & token(pointer & t) { return t; }
+ static void * cast_to_void_ptr(pointer ref) {
+ type_to_void_ptr_map mymap;
+ mymap.void_overlay = nullptr;
+ mymap.actual_value = ref;
+ return mymap.void_overlay;
+ }
+ static pointer cast_from_void_ptr(void * ref) {
+ type_to_void_ptr_map mymap;
+ mymap.void_overlay = ref;
+ return mymap.actual_value;
+ }
+ static void destroy_token( pointer /*token*/) {}
+};
+
+// intermediate
+template<typename InputType, typename OutputType, typename Body>
+class concrete_filter: public base_filter {
+ const Body& my_body;
+ using input_helper = token_helper<InputType, use_allocator<InputType >::value>;
+ using input_pointer = typename input_helper::pointer;
+ using output_helper = token_helper<OutputType, use_allocator<OutputType>::value>;
+ using output_pointer = typename output_helper::pointer;
+
+ void* operator()(void* input) override {
+ input_pointer temp_input = input_helper::cast_from_void_ptr(input);
+ output_pointer temp_output = output_helper::create_token(my_body(std::move(input_helper::token(temp_input))));
+ input_helper::destroy_token(temp_input);
+ return output_helper::cast_to_void_ptr(temp_output);
+ }
+
+ void finalize(void * input) override {
+ input_pointer temp_input = input_helper::cast_from_void_ptr(input);
+ input_helper::destroy_token(temp_input);
+ }
+
+public:
+ concrete_filter(unsigned int m, const Body& body) : base_filter(m), my_body(body) {}
+};
+
+// input
+template<typename OutputType, typename Body>
+class concrete_filter<void, OutputType, Body>: public base_filter {
+ const Body& my_body;
+ using output_helper = token_helper<OutputType, use_allocator<OutputType>::value>;
+ using output_pointer = typename output_helper::pointer;
+
+ void* operator()(void*) override {
+ flow_control control;
+ output_pointer temp_output = output_helper::create_token(my_body(control));
+ if(control.is_pipeline_stopped) {
+ output_helper::destroy_token(temp_output);
+ set_end_of_input();
+ return nullptr;
+ }
+ return output_helper::cast_to_void_ptr(temp_output);
+ }
+
+public:
+ concrete_filter(unsigned int m, const Body& body) :
+ base_filter(m | filter_may_emit_null),
+ my_body(body)
+ {}
+};
+
+// output
+template<typename InputType, typename Body>
+class concrete_filter<InputType, void, Body>: public base_filter {
+ const Body& my_body;
+ using input_helper = token_helper<InputType, use_allocator<InputType >::value>;
+ using input_pointer = typename input_helper::pointer;
+
+ void* operator()(void* input) override {
+ input_pointer temp_input = input_helper::cast_from_void_ptr(input);
+ my_body(std::move(input_helper::token(temp_input)));
+ input_helper::destroy_token(temp_input);
+ return nullptr;
+ }
+ void finalize(void* input) override {
+ input_pointer temp_input = input_helper::cast_from_void_ptr(input);
+ input_helper::destroy_token(temp_input);
+ }
+
+public:
+ concrete_filter(unsigned int m, const Body& body) : base_filter(m), my_body(body) {}
+};
+
+template<typename Body>
+class concrete_filter<void, void, Body>: public base_filter {
+ const Body& my_body;
+
+ void* operator()(void*) override {
+ flow_control control;
+ my_body(control);
+ void* output = control.is_pipeline_stopped ? nullptr : (void*)(std::intptr_t)-1;
+ return output;
+ }
+public:
+ concrete_filter(unsigned int m, const Body& body) : base_filter(m), my_body(body) {}
+};
+
+class filter_node_ptr {
+ filter_node * my_node;
+
+public:
+ filter_node_ptr() : my_node(nullptr) {}
+ filter_node_ptr(filter_node *);
+ ~filter_node_ptr();
+ filter_node_ptr(const filter_node_ptr &);
+ filter_node_ptr(filter_node_ptr &&);
+ void operator=(filter_node *);
+ void operator=(const filter_node_ptr &);
+ void operator=(filter_node_ptr &&);
+ filter_node& operator*() const;
+ operator bool() const;
+};
+
+//! Abstract base class that represents a node in a parse tree underlying a filter class.
+/** These nodes are always heap-allocated and can be shared by filter objects. */
+class filter_node {
+ /** The count must be atomic because it is hidden state for the user but might be shared by threads. */
+ std::atomic<std::intptr_t> ref_count;
+public:
+ filter_node_ptr left;
+ filter_node_ptr right;
+protected:
+ filter_node() : ref_count(0), left(nullptr), right(nullptr) {
+#ifdef __TBB_TEST_FILTER_NODE_COUNT
+ ++(__TBB_TEST_FILTER_NODE_COUNT);
+#endif
+ }
+public:
+ filter_node(const filter_node_ptr& x, const filter_node_ptr& y) : filter_node(){
+ left = x;
+ right = y;
+ }
+ filter_node(const filter_node&) = delete;
+ filter_node& operator=(const filter_node&) = delete;
+
+ //! Add concrete_filter to pipeline
+ virtual base_filter* create_filter() const {
+ __TBB_ASSERT(false, "method of non-leaf was called");
+ return nullptr;
+ }
+
+ //! Increment reference count
+ void add_ref() { ref_count.fetch_add(1, std::memory_order_relaxed); }
+
+ //! Decrement reference count and delete if it becomes zero.
+ void remove_ref() {
+ __TBB_ASSERT(ref_count>0,"ref_count underflow");
+ if( ref_count.fetch_sub(1, std::memory_order_relaxed) == 1 ) {
+ this->~filter_node();
+ r1::deallocate_memory(this);
+ }
+ }
+
+ virtual ~filter_node() {
+#ifdef __TBB_TEST_FILTER_NODE_COUNT
+ --(__TBB_TEST_FILTER_NODE_COUNT);
+#endif
+ }
+};
+
+inline filter_node_ptr::filter_node_ptr(filter_node * nd) : my_node(nd) {
+ if (my_node) {
+ my_node->add_ref();
+ }
+}
+
+inline filter_node_ptr::~filter_node_ptr() {
+ if (my_node) {
+ my_node->remove_ref();
+ }
+}
+
+inline filter_node_ptr::filter_node_ptr(const filter_node_ptr & rhs) : my_node(rhs.my_node) {
+ if (my_node) {
+ my_node->add_ref();
+ }
+}
+
+inline filter_node_ptr::filter_node_ptr(filter_node_ptr && rhs) : my_node(rhs.my_node) {
+ rhs.my_node = nullptr;
+}
+
+inline void filter_node_ptr::operator=(filter_node * rhs) {
+ // The order of operations below is carefully chosen so that reference counts remain correct
+ // in the unlikely event that remove_ref throws an exception.
+ filter_node* old = my_node;
+ my_node = rhs;
+ if (my_node) {
+ my_node->add_ref();
+ }
+ if (old) {
+ old->remove_ref();
+ }
+}
+
+inline void filter_node_ptr::operator=(const filter_node_ptr & rhs) {
+ *this = rhs.my_node;
+}
+
+inline void filter_node_ptr::operator=(filter_node_ptr && rhs) {
+ filter_node* old = my_node;
+ my_node = rhs.my_node;
+ rhs.my_node = nullptr;
+ if (old) {
+ old->remove_ref();
+ }
+}
+
+inline filter_node& filter_node_ptr::operator*() const{
+ __TBB_ASSERT(my_node,"NULL node is used");
+ return *my_node;
+}
+
+inline filter_node_ptr::operator bool() const {
+ return my_node != nullptr;
+}
+
+//! Node in parse tree representing result of make_filter.
+template<typename InputType, typename OutputType, typename Body>
+class filter_node_leaf: public filter_node {
+ const unsigned int my_mode;
+ const Body my_body;
+ base_filter* create_filter() const override {
+ return new(r1::allocate_memory(sizeof(concrete_filter<InputType, OutputType, Body>))) concrete_filter<InputType, OutputType, Body>(my_mode,my_body);
+ }
+public:
+ filter_node_leaf( unsigned int m, const Body& b ) : my_mode(m), my_body(b) {}
+};
+
+
+template <typename Body, typename Input = typename body_types<decltype(&Body::operator())>::input_type>
+using filter_input = typename std::conditional<std::is_same<Input, flow_control>::value, void, Input>::type;
+
+template <typename Body>
+using filter_output = typename body_types<decltype(&Body::operator())>::output_type;
+
+} // namespace d1
+} // namespace detail
+} // namespace tbb
+
+
+#endif /* __TBB_parallel_filters_H */
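A sketch of how these filter pieces surface in the public API (illustrative only; assumes oneapi/tbb/parallel_pipeline.h from this change set, which provides tbb::parallel_pipeline, tbb::make_filter and tbb::filter_mode on top of the classes above):

    #include <vector>

    void pipeline_example(const std::vector<int>& data) {
        std::size_t i = 0;
        tbb::parallel_pipeline(/*max_number_of_live_tokens=*/4,
            tbb::make_filter<void, int>(tbb::filter_mode::serial_in_order,
                [&](tbb::flow_control& fc) -> int {
                    if (i == data.size()) { fc.stop(); return 0; } // see flow_control above
                    return data[i++];
                })
            &
            tbb::make_filter<int, int>(tbb::filter_mode::parallel,
                [](int x) { return x * x; })
            &
            tbb::make_filter<int, void>(tbb::filter_mode::serial_in_order,
                [](int /*x*/) { /* consume */ }));
    }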
diff --git a/contrib/libs/tbb/include/oneapi/tbb/detail/_pipeline_filters_deduction.h b/contrib/libs/tbb/include/oneapi/tbb/detail/_pipeline_filters_deduction.h
new file mode 100644
index 0000000000..55f94dce00
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/detail/_pipeline_filters_deduction.h
@@ -0,0 +1,46 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB__pipeline_filters_deduction_H
+#define __TBB__pipeline_filters_deduction_H
+
+#include "_config.h"
+#include <utility>
+#include <type_traits>
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+template <typename Input, typename Output>
+struct declare_filter_types {
+ using input_type = typename std::remove_const<typename std::remove_reference<Input>::type>::type;
+ using output_type = typename std::remove_const<typename std::remove_reference<Output>::type>::type;
+};
+
+template <typename T> struct body_types;
+
+template <typename T, typename Input, typename Output>
+struct body_types<Output(T::*)(Input) const> : declare_filter_types<Input, Output> {};
+
+template <typename T, typename Input, typename Output>
+struct body_types<Output(T::*)(Input)> : declare_filter_types<Input, Output> {};
+
+} // namespace d1
+} // namespace detail
+} // namespace tbb
+
+#endif // __TBB__pipeline_filters_deduction_H
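A compile-time sketch of what these traits deduce (illustrative only; the body type below is made up for the example):

    #include <type_traits>

    struct square_body {
        int operator()(const double& x) const { return static_cast<int>(x * x); }
    };

    using deduced = tbb::detail::d1::body_types<decltype(&square_body::operator())>;
    static_assert(std::is_same<deduced::input_type,  double>::value, "const/reference stripped");
    static_assert(std::is_same<deduced::output_type, int>::value,    "return type as declared");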
diff --git a/contrib/libs/tbb/include/oneapi/tbb/detail/_range_common.h b/contrib/libs/tbb/include/oneapi/tbb/detail/_range_common.h
new file mode 100644
index 0000000000..36c4ca84ee
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/detail/_range_common.h
@@ -0,0 +1,76 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_detail__range_common_H
+#define __TBB_detail__range_common_H
+
+#include "_config.h"
+#include "_utils.h"
+
+namespace tbb {
+namespace detail {
+inline namespace d0 {
+
+//! Dummy type that distinguishes splitting constructor from copy constructor.
+/**
+ * See description of parallel_for and parallel_reduce for example usages.
+ * @ingroup algorithms
+ */
+class split {};
+
+//! Type enables transmission of splitting proportion from partitioners to range objects
+/**
+ * In order to make use of this facility, Range objects must implement
+ * a splitting constructor that accepts this type.
+ */
+class proportional_split : no_assign {
+public:
+ proportional_split(size_t _left = 1, size_t _right = 1) : my_left(_left), my_right(_right) { }
+
+ size_t left() const { return my_left; }
+ size_t right() const { return my_right; }
+
+ // used when range does not support proportional split
+ explicit operator split() const { return split(); }
+
+private:
+ size_t my_left, my_right;
+};
+
+template <typename Range, typename = void>
+struct range_split_object_provider {
+ template <typename PartitionerSplitType>
+ static split get( PartitionerSplitType& ) { return split(); }
+};
+
+template <typename Range>
+struct range_split_object_provider<Range,
+ typename std::enable_if<std::is_constructible<Range, Range&, proportional_split&>::value>::type> {
+ template <typename PartitionerSplitType>
+ static PartitionerSplitType& get( PartitionerSplitType& split_obj ) { return split_obj; }
+};
+
+template <typename Range, typename PartitionerSplitType>
+auto get_range_split_object( PartitionerSplitType& split_obj )
+-> decltype(range_split_object_provider<Range>::get(split_obj)) {
+ return range_split_object_provider<Range>::get(split_obj);
+}
+
+} // namespace d0
+} // namespace detail
+} // namespace tbb
+
+#endif // __TBB_detail__range_common_H
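A sketch of a user Range that opts into proportional splitting (all names below are illustrative, not part of the patch; assumes <cstddef> is available):

    struct my_range {
        std::size_t first, last;

        bool empty() const { return first == last; }
        bool is_divisible() const { return last - first > 1; }

        // Basic splitting constructor: the new range takes the upper half.
        my_range(my_range& r, tbb::detail::split)
            : first((r.first + r.last) / 2), last(r.last) { r.last = first; }

        // Because this constructor exists, get_range_split_object<my_range>() forwards the
        // partitioner's proportional_split instead of collapsing it to a plain split.
        my_range(my_range& r, tbb::detail::proportional_split& p)
            : first(r.first + (r.last - r.first) * p.left() / (p.left() + p.right())),
              last(r.last) { r.last = first; }
    };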
diff --git a/contrib/libs/tbb/include/oneapi/tbb/detail/_rtm_mutex.h b/contrib/libs/tbb/include/oneapi/tbb/detail/_rtm_mutex.h
new file mode 100644
index 0000000000..28ef9f042e
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/detail/_rtm_mutex.h
@@ -0,0 +1,162 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB__rtm_mutex_impl_H
+#define __TBB__rtm_mutex_impl_H
+
+#include "_assert.h"
+#include "_utils.h"
+#include "../spin_mutex.h"
+
+#include "../profiling.h"
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+struct rtm_mutex_impl;
+}
+namespace d1 {
+
+#if _MSC_VER && !defined(__INTEL_COMPILER)
+ // Suppress warning: structure was padded due to alignment specifier
+ #pragma warning (push)
+ #pragma warning (disable: 4324)
+#endif
+
+/** An rtm_mutex is a speculation-enabled spin mutex.
+ It should be used for locking short critical sections where the lock is
+ contended but the data it protects are not. If zero-initialized, the
+ mutex is considered unheld.
+ @ingroup synchronization */
+class alignas(max_nfs_size) rtm_mutex : private spin_mutex {
+private:
+ enum class rtm_state {
+ rtm_none,
+ rtm_transacting,
+ rtm_real
+ };
+public:
+ //! Constructors
+ rtm_mutex() noexcept {
+ create_itt_sync(this, "tbb::speculative_spin_mutex", "");
+ }
+
+ //! Destructor
+ ~rtm_mutex() = default;
+
+ //! Represents acquisition of a mutex.
+ class scoped_lock {
+ public:
+ friend class rtm_mutex;
+ //! Construct lock that has not acquired a mutex.
+ /** Equivalent to zero-initialization of *this. */
+ constexpr scoped_lock() : m_mutex(nullptr), m_transaction_state(rtm_state::rtm_none) {}
+
+ //! Acquire lock on given mutex.
+ scoped_lock(rtm_mutex& m) : m_mutex(nullptr), m_transaction_state(rtm_state::rtm_none) {
+ acquire(m);
+ }
+
+ //! Release lock (if lock is held).
+ ~scoped_lock() {
+ if(m_transaction_state != rtm_state::rtm_none) {
+ release();
+ }
+ }
+
+ //! No Copy
+ scoped_lock(const scoped_lock&) = delete;
+ scoped_lock& operator=(const scoped_lock&) = delete;
+
+ //! Acquire lock on given mutex.
+ void acquire(rtm_mutex& m);
+
+ //! Try acquire lock on given mutex.
+ bool try_acquire(rtm_mutex& m);
+
+ //! Release lock
+ void release();
+
+ private:
+ rtm_mutex* m_mutex;
+ rtm_state m_transaction_state;
+ friend r1::rtm_mutex_impl;
+ };
+
+ //! Mutex traits
+ static constexpr bool is_rw_mutex = false;
+ static constexpr bool is_recursive_mutex = false;
+ static constexpr bool is_fair_mutex = false;
+private:
+ friend r1::rtm_mutex_impl;
+}; // end of rtm_mutex
+} // namespace d1
+
+namespace r1 {
+ //! Internal acquire lock.
+ // only_speculate == true if we're doing a try_lock, else false.
+ void __TBB_EXPORTED_FUNC acquire(d1::rtm_mutex&, d1::rtm_mutex::scoped_lock&, bool only_speculate = false);
+ //! Internal try_acquire lock.
+ bool __TBB_EXPORTED_FUNC try_acquire(d1::rtm_mutex&, d1::rtm_mutex::scoped_lock&);
+ //! Internal release lock.
+ void __TBB_EXPORTED_FUNC release(d1::rtm_mutex::scoped_lock&);
+} // namespace r1
+
+namespace d1 {
+//! Acquire lock on given mutex.
+inline void rtm_mutex::scoped_lock::acquire(rtm_mutex& m) {
+ __TBB_ASSERT(!m_mutex, "lock is already acquired");
+ r1::acquire(m, *this);
+}
+
+//! Try acquire lock on given mutex.
+inline bool rtm_mutex::scoped_lock::try_acquire(rtm_mutex& m) {
+ __TBB_ASSERT(!m_mutex, "lock is already acquired");
+ return r1::try_acquire(m, *this);
+}
+
+//! Release lock
+inline void rtm_mutex::scoped_lock::release() {
+ __TBB_ASSERT(m_mutex, "lock is not acquired");
+ __TBB_ASSERT(m_transaction_state != rtm_state::rtm_none, "lock is not acquired");
+ return r1::release(*this);
+}
+
+#if _MSC_VER && !defined(__INTEL_COMPILER)
+ #pragma warning (pop) // 4324 warning
+#endif
+
+#if TBB_USE_PROFILING_TOOLS
+inline void set_name(rtm_mutex& obj, const char* name) {
+ itt_set_sync_name(&obj, name);
+}
+#if (_WIN32||_WIN64) && !__MINGW32__
+inline void set_name(rtm_mutex& obj, const wchar_t* name) {
+ itt_set_sync_name(&obj, name);
+}
+#endif // WIN
+#else
+inline void set_name(rtm_mutex&, const char*) {}
+#if (_WIN32||_WIN64) && !__MINGW32__
+inline void set_name(rtm_mutex&, const wchar_t*) {}
+#endif // WIN
+#endif
+
+} // namespace d1
+} // namespace detail
+} // namespace tbb
+
+#endif /* __TBB__rtm_mutex_impl_H */
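A sketch of scoped_lock usage for the mutex above (illustrative only; the public name is expected to be tbb::speculative_spin_mutex when TSX support is compiled in, and the calls require linking the TBB runtime):

    tbb::detail::d1::rtm_mutex short_section_mutex;
    int shared_counter = 0;

    void bump() {
        // Elided (transactional) on TSX-capable hardware, a real spin lock otherwise.
        tbb::detail::d1::rtm_mutex::scoped_lock lock(short_section_mutex);
        ++shared_counter;
    }   // lock released (or transaction committed) in ~scoped_lock()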
diff --git a/contrib/libs/tbb/include/oneapi/tbb/detail/_rtm_rw_mutex.h b/contrib/libs/tbb/include/oneapi/tbb/detail/_rtm_rw_mutex.h
new file mode 100644
index 0000000000..b62e86bd0a
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/detail/_rtm_rw_mutex.h
@@ -0,0 +1,209 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_detail__rtm_rw_mutex_H
+#define __TBB_detail__rtm_rw_mutex_H
+
+#include "_assert.h"
+#include "_utils.h"
+#include "../spin_rw_mutex.h"
+
+#include <atomic>
+
+namespace tbb {
+namespace detail {
+
+namespace r1 {
+struct rtm_rw_mutex_impl;
+}
+
+namespace d1 {
+
+constexpr std::size_t speculation_granularity = 64;
+#if _MSC_VER && !defined(__INTEL_COMPILER)
+ // Suppress warning: structure was padded due to alignment specifier
+ #pragma warning (push)
+ #pragma warning (disable: 4324)
+#endif
+
+//! Fast, unfair, spinning speculation-enabled reader-writer lock with backoff and writer-preference
+/** @ingroup synchronization */
+class alignas(max_nfs_size) rtm_rw_mutex : private spin_rw_mutex {
+ friend struct r1::rtm_rw_mutex_impl;
+private:
+ enum class rtm_type {
+ rtm_not_in_mutex,
+ rtm_transacting_reader,
+ rtm_transacting_writer,
+ rtm_real_reader,
+ rtm_real_writer
+ };
+public:
+ //! Constructors
+ rtm_rw_mutex() noexcept : write_flag(false) {
+ create_itt_sync(this, "tbb::speculative_spin_rw_mutex", "");
+ }
+
+ //! Destructor
+ ~rtm_rw_mutex() = default;
+
+ //! Represents acquisition of a mutex.
+ class scoped_lock {
+ friend struct r1::rtm_rw_mutex_impl;
+ public:
+ //! Construct lock that has not acquired a mutex.
+ /** Equivalent to zero-initialization of *this. */
+ constexpr scoped_lock() : m_mutex(nullptr), m_transaction_state(rtm_type::rtm_not_in_mutex) {}
+
+ //! Acquire lock on given mutex.
+ scoped_lock(rtm_rw_mutex& m, bool write = true) : m_mutex(nullptr), m_transaction_state(rtm_type::rtm_not_in_mutex) {
+ acquire(m, write);
+ }
+
+ //! Release lock (if lock is held).
+ ~scoped_lock() {
+ if(m_transaction_state != rtm_type::rtm_not_in_mutex) {
+ release();
+ }
+ }
+
+ //! No Copy
+ scoped_lock(const scoped_lock&) = delete;
+ scoped_lock& operator=(const scoped_lock&) = delete;
+
+ //! Acquire lock on given mutex.
+ inline void acquire(rtm_rw_mutex& m, bool write = true);
+
+ //! Try acquire lock on given mutex.
+ inline bool try_acquire(rtm_rw_mutex& m, bool write = true);
+
+ //! Release lock
+ inline void release();
+
+ //! Upgrade reader to become a writer.
+ /** Returns whether the upgrade happened without releasing and re-acquiring the lock */
+ inline bool upgrade_to_writer();
+
+ //! Downgrade writer to become a reader.
+ inline bool downgrade_to_reader();
+
+ private:
+ rtm_rw_mutex* m_mutex;
+ rtm_type m_transaction_state;
+ };
+
+ //! Mutex traits
+ static constexpr bool is_rw_mutex = true;
+ static constexpr bool is_recursive_mutex = false;
+ static constexpr bool is_fair_mutex = false;
+
+private:
+ alignas(speculation_granularity) std::atomic<bool> write_flag;
+};
+
+#if _MSC_VER && !defined(__INTEL_COMPILER)
+ #pragma warning (pop) // 4324 warning
+#endif
+
+} // namespace d1
+
+namespace r1 {
+ //! Internal acquire write lock.
+ // only_speculate == true if we're doing a try_lock, else false.
+ void __TBB_EXPORTED_FUNC acquire_writer(d1::rtm_rw_mutex&, d1::rtm_rw_mutex::scoped_lock&, bool only_speculate = false);
+ //! Internal acquire read lock.
+ // only_speculate == true if we're doing a try_lock, else false.
+ void __TBB_EXPORTED_FUNC acquire_reader(d1::rtm_rw_mutex&, d1::rtm_rw_mutex::scoped_lock&, bool only_speculate = false);
+ //! Internal upgrade reader to become a writer.
+ bool __TBB_EXPORTED_FUNC upgrade(d1::rtm_rw_mutex::scoped_lock&);
+ //! Internal downgrade writer to become a reader.
+ bool __TBB_EXPORTED_FUNC downgrade(d1::rtm_rw_mutex::scoped_lock&);
+ //! Internal try_acquire write lock.
+ bool __TBB_EXPORTED_FUNC try_acquire_writer(d1::rtm_rw_mutex&, d1::rtm_rw_mutex::scoped_lock&);
+ //! Internal try_acquire read lock.
+ bool __TBB_EXPORTED_FUNC try_acquire_reader(d1::rtm_rw_mutex&, d1::rtm_rw_mutex::scoped_lock&);
+ //! Internal release lock.
+ void __TBB_EXPORTED_FUNC release(d1::rtm_rw_mutex::scoped_lock&);
+}
+
+namespace d1 {
+//! Acquire lock on given mutex.
+void rtm_rw_mutex::scoped_lock::acquire(rtm_rw_mutex& m, bool write) {
+ __TBB_ASSERT(!m_mutex, "lock is already acquired");
+ if (write) {
+ r1::acquire_writer(m, *this);
+ } else {
+ r1::acquire_reader(m, *this);
+ }
+}
+
+//! Try acquire lock on given mutex.
+bool rtm_rw_mutex::scoped_lock::try_acquire(rtm_rw_mutex& m, bool write) {
+ __TBB_ASSERT(!m_mutex, "lock is already acquired");
+ if (write) {
+ return r1::try_acquire_writer(m, *this);
+ } else {
+ return r1::try_acquire_reader(m, *this);
+ }
+}
+
+//! Release lock
+void rtm_rw_mutex::scoped_lock::release() {
+ __TBB_ASSERT(m_mutex, "lock is not acquired");
+ __TBB_ASSERT(m_transaction_state != rtm_type::rtm_not_in_mutex, "lock is not acquired");
+ return r1::release(*this);
+}
+
+//! Upgrade reader to become a writer.
+/** Returns whether the upgrade happened without releasing and re-acquiring the lock */
+bool rtm_rw_mutex::scoped_lock::upgrade_to_writer() {
+ __TBB_ASSERT(m_mutex, "lock is not acquired");
+ if (m_transaction_state == rtm_type::rtm_transacting_writer || m_transaction_state == rtm_type::rtm_real_writer) {
+ return true; // Already a writer
+ }
+ return r1::upgrade(*this);
+}
+
+//! Downgrade writer to become a reader.
+bool rtm_rw_mutex::scoped_lock::downgrade_to_reader() {
+ __TBB_ASSERT(m_mutex, "lock is not acquired");
+ if (m_transaction_state == rtm_type::rtm_transacting_reader || m_transaction_state == rtm_type::rtm_real_reader) {
+ return true; // Already a reader
+ }
+ return r1::downgrade(*this);
+}
+
+#if TBB_USE_PROFILING_TOOLS
+inline void set_name(rtm_rw_mutex& obj, const char* name) {
+ itt_set_sync_name(&obj, name);
+}
+#if (_WIN32||_WIN64) && !__MINGW32__
+inline void set_name(rtm_rw_mutex& obj, const wchar_t* name) {
+ itt_set_sync_name(&obj, name);
+}
+#endif // WIN
+#else
+inline void set_name(rtm_rw_mutex&, const char*) {}
+#if (_WIN32||_WIN64) && !__MINGW32__
+inline void set_name(rtm_rw_mutex&, const wchar_t*) {}
+#endif // WIN
+#endif
+
+} // namespace d1
+} // namespace detail
+} // namespace tbb
+
+#endif // __TBB_detail__rtm_rw_mutex_H
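A sketch of reader/writer usage for the mutex above (illustrative only; the public alias is expected to be tbb::speculative_spin_rw_mutex, and the calls require linking the TBB runtime):

    tbb::detail::d1::rtm_rw_mutex table_mutex;

    void read_then_maybe_write(bool need_update) {
        tbb::detail::d1::rtm_rw_mutex::scoped_lock lock(table_mutex, /*write=*/false);
        // ... read shared state ...
        if (need_update) {
            // May release and re-acquire; returns true only if the upgrade happened in place.
            bool in_place = lock.upgrade_to_writer();
            (void)in_place;
            // ... write shared state ...
        }
    }   // released in ~scoped_lock()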
diff --git a/contrib/libs/tbb/include/oneapi/tbb/detail/_segment_table.h b/contrib/libs/tbb/include/oneapi/tbb/detail/_segment_table.h
new file mode 100644
index 0000000000..480ec8135e
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/detail/_segment_table.h
@@ -0,0 +1,563 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_detail__segment_table_H
+#define __TBB_detail__segment_table_H
+
+#include "_config.h"
+#include "_allocator_traits.h"
+#include "_template_helpers.h"
+#include "_utils.h"
+#include "_assert.h"
+#include "_exception.h"
+#include <atomic>
+#include <type_traits>
+#include <memory>
+#include <cstring>
+
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+#pragma warning(push)
+#pragma warning(disable: 4127) // warning C4127: conditional expression is constant
+#endif
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+template <typename T, typename Allocator, typename DerivedType, std::size_t PointersPerEmbeddedTable>
+class segment_table {
+public:
+ using value_type = T;
+ using segment_type = T*;
+ using atomic_segment = std::atomic<segment_type>;
+ using segment_table_type = atomic_segment*;
+
+ using size_type = std::size_t;
+ using segment_index_type = std::size_t;
+
+ using allocator_type = Allocator;
+
+ using allocator_traits_type = tbb::detail::allocator_traits<allocator_type>;
+ using segment_table_allocator_type = typename allocator_traits_type::template rebind_alloc<atomic_segment>;
+protected:
+ using segment_table_allocator_traits = tbb::detail::allocator_traits<segment_table_allocator_type>;
+ using derived_type = DerivedType;
+
+ static constexpr size_type pointers_per_embedded_table = PointersPerEmbeddedTable;
+ static constexpr size_type pointers_per_long_table = sizeof(size_type) * 8;
+public:
+ segment_table( const allocator_type& alloc = allocator_type() )
+ : my_segment_table_allocator(alloc), my_segment_table(my_embedded_table)
+ , my_first_block{}, my_size{}, my_segment_table_allocation_failed{}
+ {
+ zero_table(my_embedded_table, pointers_per_embedded_table);
+ }
+
+ segment_table( const segment_table& other )
+ : my_segment_table_allocator(segment_table_allocator_traits::
+ select_on_container_copy_construction(other.my_segment_table_allocator))
+ , my_segment_table(my_embedded_table), my_first_block{}, my_size{}, my_segment_table_allocation_failed{}
+ {
+ zero_table(my_embedded_table, pointers_per_embedded_table);
+ try_call( [&] {
+ internal_transfer(other, copy_segment_body_type{*this});
+ } ).on_exception( [&] {
+ clear();
+ });
+ }
+
+ segment_table( const segment_table& other, const allocator_type& alloc )
+ : my_segment_table_allocator(alloc), my_segment_table(my_embedded_table)
+ , my_first_block{}, my_size{}, my_segment_table_allocation_failed{}
+ {
+ zero_table(my_embedded_table, pointers_per_embedded_table);
+ try_call( [&] {
+ internal_transfer(other, copy_segment_body_type{*this});
+ } ).on_exception( [&] {
+ clear();
+ });
+ }
+
+ segment_table( segment_table&& other )
+ : my_segment_table_allocator(std::move(other.my_segment_table_allocator)), my_segment_table(my_embedded_table)
+ , my_first_block{}, my_size{}, my_segment_table_allocation_failed{}
+ {
+ zero_table(my_embedded_table, pointers_per_embedded_table);
+ internal_move(std::move(other));
+ }
+
+ segment_table( segment_table&& other, const allocator_type& alloc )
+ : my_segment_table_allocator(alloc), my_segment_table(my_embedded_table), my_first_block{}
+ , my_size{}, my_segment_table_allocation_failed{}
+ {
+ zero_table(my_embedded_table, pointers_per_embedded_table);
+ using is_equal_type = typename segment_table_allocator_traits::is_always_equal;
+ internal_move_construct_with_allocator(std::move(other), alloc, is_equal_type());
+ }
+
+ ~segment_table() {
+ clear();
+ }
+
+ segment_table& operator=( const segment_table& other ) {
+ if (this != &other) {
+ copy_assign_allocators(my_segment_table_allocator, other.my_segment_table_allocator);
+ internal_transfer(other, copy_segment_body_type{*this});
+ }
+ return *this;
+ }
+
+ segment_table& operator=( segment_table&& other )
+ noexcept(derived_type::is_noexcept_assignment)
+ {
+ using pocma_type = typename segment_table_allocator_traits::propagate_on_container_move_assignment;
+ using is_equal_type = typename segment_table_allocator_traits::is_always_equal;
+
+ if (this != &other) {
+ move_assign_allocators(my_segment_table_allocator, other.my_segment_table_allocator);
+ internal_move_assign(std::move(other), tbb::detail::disjunction<is_equal_type, pocma_type>());
+ }
+ return *this;
+ }
+
+ void swap( segment_table& other )
+ noexcept(derived_type::is_noexcept_swap)
+ {
+ using is_equal_type = typename segment_table_allocator_traits::is_always_equal;
+ using pocs_type = typename segment_table_allocator_traits::propagate_on_container_swap;
+
+ if (this != &other) {
+ swap_allocators(my_segment_table_allocator, other.my_segment_table_allocator);
+ internal_swap(other, tbb::detail::disjunction<is_equal_type, pocs_type>());
+ }
+ }
+
+ segment_type get_segment( segment_index_type index ) const {
+ return get_table()[index] + segment_base(index);
+ }
+
+ value_type& operator[]( size_type index ) {
+ return internal_subscript<true>(index);
+ }
+
+ const value_type& operator[]( size_type index ) const {
+ return const_cast<segment_table*>(this)->internal_subscript<true>(index);
+ }
+
+ const segment_table_allocator_type& get_allocator() const {
+ return my_segment_table_allocator;
+ }
+
+ segment_table_allocator_type& get_allocator() {
+ return my_segment_table_allocator;
+ }
+
+ void enable_segment( segment_type& segment, segment_table_type table, segment_index_type seg_index, size_type index ) {
+ // Allocate new segment
+ segment_type new_segment = self()->create_segment(table, seg_index, index);
+ if (new_segment != nullptr) {
+ // Store (new_segment - segment_base) into the segment table to allow access to the table by index via
+ // my_segment_table[segment_index_of(index)][index]
+ segment_type disabled_segment = nullptr;
+ if (!table[seg_index].compare_exchange_strong(disabled_segment, new_segment - segment_base(seg_index))) {
+ // compare_exchange failed => some other thread has already enabled this segment
+ // Deallocate the memory
+ self()->deallocate_segment(new_segment, seg_index);
+ }
+ }
+
+ segment = table[seg_index].load(std::memory_order_acquire);
+ __TBB_ASSERT(segment != nullptr, "If create_segment returned nullptr, the element should be stored in the table");
+ }
+
+ void delete_segment( segment_index_type seg_index ) {
+ segment_type disabled_segment = nullptr;
+ // Set the pointer to the segment to NULL in the table
+ segment_type segment_to_delete = get_table()[seg_index].exchange(disabled_segment);
+ if (segment_to_delete == segment_allocation_failure_tag) {
+ return;
+ }
+
+ segment_to_delete += segment_base(seg_index);
+
+ // Deallocate the segment
+ self()->destroy_segment(segment_to_delete, seg_index);
+ }
+
+ size_type number_of_segments( segment_table_type table ) const {
+ // If the given table is the embedded table - return the number of embedded segments
+ // Otherwise - return the maximum number of segments
+ return table == my_embedded_table ? pointers_per_embedded_table : pointers_per_long_table;
+ }
+
+ size_type capacity() const noexcept {
+ segment_table_type table = get_table();
+ size_type num_segments = number_of_segments(table);
+ for (size_type seg_index = 0; seg_index < num_segments; ++seg_index) {
+ // Check if the pointer is valid (allocated)
+ if (table[seg_index].load(std::memory_order_relaxed) <= segment_allocation_failure_tag) {
+ return segment_base(seg_index);
+ }
+ }
+ return segment_base(num_segments);
+ }
+
+ size_type find_last_allocated_segment( segment_table_type table ) const noexcept {
+ size_type end = 0;
+ size_type num_segments = number_of_segments(table);
+ for (size_type seg_index = 0; seg_index < num_segments; ++seg_index) {
+ // Check if the pointer is valid (allocated)
+ if (table[seg_index].load(std::memory_order_relaxed) > segment_allocation_failure_tag) {
+ end = seg_index + 1;
+ }
+ }
+ return end;
+ }
+
+ void reserve( size_type n ) {
+ if (n > allocator_traits_type::max_size(my_segment_table_allocator)) {
+ throw_exception(exception_id::reservation_length_error);
+ }
+
+ size_type size = my_size.load(std::memory_order_relaxed);
+ segment_index_type start_seg_idx = size == 0 ? 0 : segment_index_of(size - 1) + 1;
+ for (segment_index_type seg_idx = start_seg_idx; segment_base(seg_idx) < n; ++seg_idx) {
+ size_type first_index = segment_base(seg_idx);
+ internal_subscript<true>(first_index);
+ }
+ }
+
+ void clear() {
+ clear_segments();
+ clear_table();
+ my_size.store(0, std::memory_order_relaxed);
+ my_first_block.store(0, std::memory_order_relaxed);
+ }
+
+ void clear_segments() {
+ segment_table_type current_segment_table = get_table();
+ for (size_type i = number_of_segments(current_segment_table); i != 0; --i) {
+ if (current_segment_table[i - 1].load(std::memory_order_relaxed) != nullptr) {
+ // If the segment was enabled - disable and deallocate it
+ delete_segment(i - 1);
+ }
+ }
+ }
+
+ void clear_table() {
+ segment_table_type current_segment_table = get_table();
+ if (current_segment_table != my_embedded_table) {
+ // If the active table is not the embedded one - deallocate the active table
+ for (size_type i = 0; i != pointers_per_long_table; ++i) {
+ segment_table_allocator_traits::destroy(my_segment_table_allocator, &current_segment_table[i]);
+ }
+
+ segment_table_allocator_traits::deallocate(my_segment_table_allocator, current_segment_table, pointers_per_long_table);
+ my_segment_table.store(my_embedded_table, std::memory_order_relaxed);
+ zero_table(my_embedded_table, pointers_per_embedded_table);
+ }
+ }
+
+ void extend_table_if_necessary(segment_table_type& table, size_type start_index, size_type end_index) {
+ // extend_segment_table if an active table is an embedded table
+ // and the requested index is not in the embedded table
+ if (table == my_embedded_table && end_index > embedded_table_size) {
+ if (start_index <= embedded_table_size) {
+ try_call([&] {
+ table = self()->allocate_long_table(my_embedded_table, start_index);
+ // It is possible that the table was extended by the thread that allocated first_block.
+ // In this case it is necessary to re-read the current table.
+
+ if (table) {
+ my_segment_table.store(table, std::memory_order_release);
+ } else {
+ table = my_segment_table.load(std::memory_order_acquire);
+ }
+ }).on_exception([&] {
+ my_segment_table_allocation_failed.store(true, std::memory_order_relaxed);
+ });
+ } else {
+ atomic_backoff backoff;
+ do {
+ if (my_segment_table_allocation_failed.load(std::memory_order_relaxed)) {
+ throw_exception(exception_id::bad_alloc);
+ }
+ backoff.pause();
+ table = my_segment_table.load(std::memory_order_acquire);
+ } while (table == my_embedded_table);
+ }
+ }
+ }
+
+ // Return the segment where index is stored
+ static constexpr segment_index_type segment_index_of( size_type index ) {
+ return size_type(tbb::detail::log2(uintptr_t(index|1)));
+ }
+
+ // Return the first index covered by the segment; needed to calculate the offset within a segment
+ static constexpr size_type segment_base( size_type index ) {
+ return size_type(1) << index & ~size_type(1);
+ }
+
+ // Return size of the segment
+ static constexpr size_type segment_size( size_type index ) {
+ return index == 0 ? 2 : size_type(1) << index;
+ }
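+
+ // Illustrative example of the index math above (a sketch, not part of the interface):
+ //     segment_index_of(0) == 0, segment_base(0) == 0, segment_size(0) == 2 -> indices 0..1
+ //     segment_index_of(5) == 2, segment_base(2) == 4, segment_size(2) == 4 -> indices 4..7
+ // Element i is reachable as get_table()[segment_index_of(i)][i], because enable_segment()
+ // publishes each segment pointer already shifted back by segment_base(seg_index).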
+
+private:
+
+ derived_type* self() {
+ return static_cast<derived_type*>(this);
+ }
+
+ struct copy_segment_body_type {
+ void operator()( segment_index_type index, segment_type from, segment_type to ) const {
+ my_instance.self()->copy_segment(index, from, to);
+ }
+ segment_table& my_instance;
+ };
+
+ struct move_segment_body_type {
+ void operator()( segment_index_type index, segment_type from, segment_type to ) const {
+ my_instance.self()->move_segment(index, from, to);
+ }
+ segment_table& my_instance;
+ };
+
+ // Transfers all segments from the other table
+ template <typename TransferBody>
+ void internal_transfer( const segment_table& other, TransferBody transfer_segment ) {
+ static_cast<derived_type*>(this)->destroy_elements();
+
+ assign_first_block_if_necessary(other.my_first_block.load(std::memory_order_relaxed));
+ my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed);
+
+ segment_table_type other_table = other.get_table();
+ size_type end_segment_size = segment_size(other.find_last_allocated_segment(other_table));
+
+ // If an exception occurred in other, then the size may be greater than the size of the end segment.
+ size_type other_size = end_segment_size < other.my_size.load(std::memory_order_relaxed) ?
+ other.my_size.load(std::memory_order_relaxed) : end_segment_size;
+ other_size = my_segment_table_allocation_failed ? embedded_table_size : other_size;
+
+ for (segment_index_type i = 0; segment_base(i) < other_size; ++i) {
+ // If the segment in other table is enabled - transfer it
+ if (other_table[i].load(std::memory_order_relaxed) == segment_allocation_failure_tag)
+ {
+ my_size = segment_base(i);
+ break;
+ } else if (other_table[i].load(std::memory_order_relaxed) != nullptr) {
+ internal_subscript<true>(segment_base(i));
+ transfer_segment(i, other.get_table()[i].load(std::memory_order_relaxed) + segment_base(i),
+ get_table()[i].load(std::memory_order_relaxed) + segment_base(i));
+ }
+ }
+ }
+
+ // Moves the other segment table
+ // Only equal allocators are allowed
+ void internal_move( segment_table&& other ) {
+ // NOTE: allocators should be equal
+ clear();
+ my_first_block.store(other.my_first_block.load(std::memory_order_relaxed), std::memory_order_relaxed);
+ my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed);
+ // If an active table in other is embedded - restore all of the embedded segments
+ if (other.get_table() == other.my_embedded_table) {
+ for ( size_type i = 0; i != pointers_per_embedded_table; ++i ) {
+ segment_type other_segment = other.my_embedded_table[i].load(std::memory_order_relaxed);
+ my_embedded_table[i].store(other_segment, std::memory_order_relaxed);
+ other.my_embedded_table[i].store(nullptr, std::memory_order_relaxed);
+ }
+ my_segment_table.store(my_embedded_table, std::memory_order_relaxed);
+ } else {
+ my_segment_table.store(other.my_segment_table, std::memory_order_relaxed);
+ other.my_segment_table.store(other.my_embedded_table, std::memory_order_relaxed);
+ zero_table(other.my_embedded_table, pointers_per_embedded_table);
+ }
+ other.my_size.store(0, std::memory_order_relaxed);
+ }
+
+ // Move construct the segment table with the allocator object
+ // if any instances of allocator_type are always equal
+ void internal_move_construct_with_allocator( segment_table&& other, const allocator_type&,
+ /*is_always_equal = */ std::true_type ) {
+ internal_move(std::move(other));
+ }
+
+ // Move construct the segment table with the allocator object
+ // if instances of allocator_type are not always equal
+ void internal_move_construct_with_allocator( segment_table&& other, const allocator_type& alloc,
+ /*is_always_equal = */ std::false_type ) {
+ if (other.my_segment_table_allocator == alloc) {
+ // If allocators are equal - restore pointers
+ internal_move(std::move(other));
+ } else {
+ // If allocators are not equal - perform per element move with reallocation
+ try_call( [&] {
+ internal_transfer(other, move_segment_body_type{*this});
+ } ).on_exception( [&] {
+ clear();
+ });
+ }
+ }
+
+ // Move assigns the segment table from other if any instances of allocator_type are always equal
+ // or propagate_on_container_move_assignment is true
+ void internal_move_assign( segment_table&& other, /*is_always_equal || POCMA = */ std::true_type ) {
+ internal_move(std::move(other));
+ }
+
+ // Move assigns the segment table from other if instances of allocator_type are not always equal
+ // and propagate_on_container_move_assignment is false
+ void internal_move_assign( segment_table&& other, /*is_always_equal || POCMA = */ std::false_type ) {
+ if (my_segment_table_allocator == other.my_segment_table_allocator) {
+ // If allocators are equal - restore pointers
+ internal_move(std::move(other));
+ } else {
+ // If allocators are not equal - perform per element move with reallocation
+ internal_transfer(other, move_segment_body_type{*this});
+ }
+ }
+
+ // Swaps two segment tables if any instances of allocator_type are always equal
+ // or propagate_on_container_swap is true
+ void internal_swap( segment_table& other, /*is_always_equal || POCS = */ std::true_type ) {
+ internal_swap_fields(other);
+ }
+
+ // Swaps two segment tables if any instances of allocator_type are not always equal
+ // and propagate_on_container_swap is false
+ // According to the C++ standard, swapping of two containers with unequal allocators
+ // is an undefined behavior scenario
+ void internal_swap( segment_table& other, /*is_always_equal || POCS = */ std::false_type ) {
+ __TBB_ASSERT(my_segment_table_allocator == other.my_segment_table_allocator,
+ "Swapping with unequal allocators is not allowed");
+ internal_swap_fields(other);
+ }
+
+ void internal_swap_fields( segment_table& other ) {
+ // If an active table in either *this segment table or other is an embedded one - swaps the embedded tables
+ if (get_table() == my_embedded_table ||
+ other.get_table() == other.my_embedded_table) {
+
+ for (size_type i = 0; i != pointers_per_embedded_table; ++i) {
+ segment_type current_segment = my_embedded_table[i].load(std::memory_order_relaxed);
+ segment_type other_segment = other.my_embedded_table[i].load(std::memory_order_relaxed);
+
+ my_embedded_table[i].store(other_segment, std::memory_order_relaxed);
+ other.my_embedded_table[i].store(current_segment, std::memory_order_relaxed);
+ }
+ }
+
+ segment_table_type current_segment_table = get_table();
+ segment_table_type other_segment_table = other.get_table();
+
+ // If *this was using its embedded table, other should now use its own embedded table
+ // (the embedded tables' contents have already been swapped above)
+ if (current_segment_table == my_embedded_table) {
+ other.my_segment_table.store(other.my_embedded_table, std::memory_order_relaxed);
+ } else {
+ // Otherwise - pass the dynamically allocated table pointer to other
+ other.my_segment_table.store(current_segment_table, std::memory_order_relaxed);
+ }
+
+ // If other was using its embedded table, *this should now use its own embedded table
+ // (the embedded tables' contents have already been swapped above)
+ if (other_segment_table == other.my_embedded_table) {
+ my_segment_table.store(my_embedded_table, std::memory_order_relaxed);
+ } else {
+ // Otherwise - take the dynamically allocated table pointer from other
+ my_segment_table.store(other_segment_table, std::memory_order_relaxed);
+ }
+ auto first_block = other.my_first_block.load(std::memory_order_relaxed);
+ other.my_first_block.store(my_first_block.load(std::memory_order_relaxed), std::memory_order_relaxed);
+ my_first_block.store(first_block, std::memory_order_relaxed);
+
+ auto size = other.my_size.load(std::memory_order_relaxed);
+ other.my_size.store(my_size.load(std::memory_order_relaxed), std::memory_order_relaxed);
+ my_size.store(size, std::memory_order_relaxed);
+ }
+
+protected:
+ // A special pointer value (tag) indicating that an exception was thrown during segment allocation
+ const segment_type segment_allocation_failure_tag = reinterpret_cast<segment_type>(1);
+ static constexpr size_type embedded_table_size = segment_size(pointers_per_embedded_table);
+
+ template <bool allow_out_of_range_access>
+ value_type& internal_subscript( size_type index ) {
+ segment_index_type seg_index = segment_index_of(index);
+ segment_table_type table = my_segment_table.load(std::memory_order_acquire);
+ segment_type segment = nullptr;
+
+ if (allow_out_of_range_access) {
+ if (derived_type::allow_table_extending) {
+ extend_table_if_necessary(table, index, index + 1);
+ }
+
+ segment = table[seg_index].load(std::memory_order_acquire);
+ // If the required segment is disabled - enable it
+ if (segment == nullptr) {
+ enable_segment(segment, table, seg_index, index);
+ }
+ // Check if an exception was thrown during segment allocation
+ if (segment == segment_allocation_failure_tag) {
+ throw_exception(exception_id::bad_alloc);
+ }
+ } else {
+ segment = table[seg_index].load(std::memory_order_acquire);
+ }
+ __TBB_ASSERT(segment != nullptr, nullptr);
+
+ return segment[index];
+ }
+
+ void assign_first_block_if_necessary(segment_index_type index) {
+ size_type zero = 0;
+ if (this->my_first_block.load(std::memory_order_relaxed) == zero) {
+ this->my_first_block.compare_exchange_strong(zero, index);
+ }
+ }
+
+ void zero_table( segment_table_type table, size_type count ) {
+ for (size_type i = 0; i != count; ++i) {
+ table[i].store(nullptr, std::memory_order_relaxed);
+ }
+ }
+
+ segment_table_type get_table() const {
+ return my_segment_table.load(std::memory_order_acquire);
+ }
+
+ segment_table_allocator_type my_segment_table_allocator;
+ std::atomic<segment_table_type> my_segment_table;
+ atomic_segment my_embedded_table[pointers_per_embedded_table];
+ // Number of segments in first block
+ std::atomic<size_type> my_first_block;
+ // Number of elements in table
+ std::atomic<size_type> my_size;
+ // Flag to indicate failed extend table
+ std::atomic<bool> my_segment_table_allocation_failed;
+}; // class segment_table
+
+} // namespace d1
+} // namespace detail
+} // namespace tbb
+
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+#pragma warning(pop) // warning 4127 is back
+#endif
+
+#endif // __TBB_detail__segment_table_H
diff --git a/contrib/libs/tbb/include/oneapi/tbb/detail/_small_object_pool.h b/contrib/libs/tbb/include/oneapi/tbb/detail/_small_object_pool.h
new file mode 100644
index 0000000000..8a10a61e1a
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/detail/_small_object_pool.h
@@ -0,0 +1,108 @@
+/*
+ Copyright (c) 2020-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB__small_object_pool_H
+#define __TBB__small_object_pool_H
+
+#include "_config.h"
+#include "_assert.h"
+
+#include "../profiling.h"
+#include <cstddef>
+#include <cstdint>
+#include <atomic>
+
+namespace tbb {
+namespace detail {
+
+namespace d1 {
+class small_object_pool {
+protected:
+ small_object_pool() = default;
+};
+struct execution_data;
+}
+
+namespace r1 {
+void* __TBB_EXPORTED_FUNC allocate(d1::small_object_pool*& pool, std::size_t number_of_bytes,
+ const d1::execution_data& ed);
+void* __TBB_EXPORTED_FUNC allocate(d1::small_object_pool*& pool, std::size_t number_of_bytes);
+void __TBB_EXPORTED_FUNC deallocate(d1::small_object_pool& pool, void* ptr, std::size_t number_of_bytes,
+ const d1::execution_data& ed);
+void __TBB_EXPORTED_FUNC deallocate(d1::small_object_pool& pool, void* ptr, std::size_t number_of_bytes);
+}
+
+namespace d1 {
+class small_object_allocator {
+public:
+ template <typename Type, typename... Args>
+ Type* new_object(execution_data& ed, Args&&... args) {
+ void* allocated_object = r1::allocate(m_pool, sizeof(Type), ed);
+
+ auto constructed_object = new(allocated_object) Type(std::forward<Args>(args)...);
+ return constructed_object;
+ }
+
+ template <typename Type, typename... Args>
+ Type* new_object(Args&&... args) {
+ void* allocated_object = r1::allocate(m_pool, sizeof(Type));
+
+ auto constructed_object = new(allocated_object) Type(std::forward<Args>(args)...);
+ return constructed_object;
+ }
+
+ template <typename Type>
+ void delete_object(Type* object, const execution_data& ed) {
+ // Copy this allocator since it can be a member of the passed object and
+ // would be unintentionally destroyed when the Type destructor is called below
+ small_object_allocator alloc = *this;
+ object->~Type();
+ alloc.deallocate(object, ed);
+ }
+
+ template <typename Type>
+ void delete_object(Type* object) {
+ // Copy this allocator since it can be a member of the passed object and
+ // would be unintentionally destroyed when the Type destructor is called below
+ small_object_allocator alloc = *this;
+ object->~Type();
+ alloc.deallocate(object);
+ }
+
+ template <typename Type>
+ void deallocate(Type* ptr, const execution_data& ed) {
+ call_itt_task_notify(destroy, ptr);
+
+ __TBB_ASSERT(m_pool != nullptr, "Pool must be valid for deallocate call");
+ r1::deallocate(*m_pool, ptr, sizeof(Type), ed);
+ }
+
+ template <typename Type>
+ void deallocate(Type* ptr) {
+ call_itt_task_notify(destroy, ptr);
+
+ __TBB_ASSERT(m_pool != nullptr, "Pool must be valid for deallocate call");
+ r1::deallocate(*m_pool, ptr, sizeof(Type));
+ }
+private:
+ small_object_pool* m_pool{};
+};
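+
+// Illustrative usage sketch of small_object_allocator; my_task and ed are hypothetical
+// placeholders for a task type and an execution_data reference:
+//
+//     small_object_allocator alloc{};
+//     my_task* t = alloc.new_object<my_task>(ed, /*constructor args*/ 42);
+//     ...
+//     alloc.delete_object(t, ed); // runs ~my_task() and returns the memory to the pool
+//
+// delete_object() copies the allocator first, so the call stays valid even when the
+// allocator object itself is a member of the object being destroyed.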
+
+} // namespace d1
+} // namespace detail
+} // namespace tbb
+
+#endif /* __TBB__small_object_pool_H */
diff --git a/contrib/libs/tbb/include/oneapi/tbb/detail/_string_resource.h b/contrib/libs/tbb/include/oneapi/tbb/detail/_string_resource.h
new file mode 100644
index 0000000000..c06d5b5db0
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/detail/_string_resource.h
@@ -0,0 +1,78 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+TBB_STRING_RESOURCE(ALGORITHM, "tbb_algorithm")
+TBB_STRING_RESOURCE(PARALLEL_FOR, "tbb_parallel_for")
+TBB_STRING_RESOURCE(PARALLEL_FOR_EACH, "tbb_parallel_for_each")
+TBB_STRING_RESOURCE(PARALLEL_INVOKE, "tbb_parallel_invoke")
+TBB_STRING_RESOURCE(PARALLEL_REDUCE, "tbb_parallel_reduce")
+TBB_STRING_RESOURCE(PARALLEL_SCAN, "tbb_parallel_scan")
+TBB_STRING_RESOURCE(PARALLEL_SORT, "tbb_parallel_sort")
+TBB_STRING_RESOURCE(PARALLEL_PIPELINE, "tbb_parallel_pipeline")
+TBB_STRING_RESOURCE(CUSTOM_CTX, "tbb_custom")
+
+TBB_STRING_RESOURCE(FLOW_NULL, "null")
+TBB_STRING_RESOURCE(FLOW_BROADCAST_NODE, "broadcast_node")
+TBB_STRING_RESOURCE(FLOW_BUFFER_NODE, "buffer_node")
+TBB_STRING_RESOURCE(FLOW_CONTINUE_NODE, "continue_node")
+TBB_STRING_RESOURCE(FLOW_FUNCTION_NODE, "function_node")
+TBB_STRING_RESOURCE(FLOW_JOIN_NODE_QUEUEING, "join_node (queueing)")
+TBB_STRING_RESOURCE(FLOW_JOIN_NODE_RESERVING, "join_node (reserving)")
+TBB_STRING_RESOURCE(FLOW_JOIN_NODE_TAG_MATCHING, "join_node (tag_matching)")
+TBB_STRING_RESOURCE(FLOW_LIMITER_NODE, "limiter_node")
+TBB_STRING_RESOURCE(FLOW_MULTIFUNCTION_NODE, "multifunction_node")
+TBB_STRING_RESOURCE(FLOW_OVERWRITE_NODE, "overwrite_node")
+TBB_STRING_RESOURCE(FLOW_PRIORITY_QUEUE_NODE, "priority_queue_node")
+TBB_STRING_RESOURCE(FLOW_QUEUE_NODE, "queue_node")
+TBB_STRING_RESOURCE(FLOW_SEQUENCER_NODE, "sequencer_node")
+TBB_STRING_RESOURCE(FLOW_INPUT_NODE, "input_node")
+TBB_STRING_RESOURCE(FLOW_SPLIT_NODE, "split_node")
+TBB_STRING_RESOURCE(FLOW_WRITE_ONCE_NODE, "write_once_node")
+TBB_STRING_RESOURCE(FLOW_INDEXER_NODE, "indexer_node")
+TBB_STRING_RESOURCE(FLOW_COMPOSITE_NODE, "composite_node")
+TBB_STRING_RESOURCE(FLOW_ASYNC_NODE, "async_node")
+TBB_STRING_RESOURCE(FLOW_INPUT_PORT, "input_port")
+TBB_STRING_RESOURCE(FLOW_INPUT_PORT_0, "input_port_0")
+TBB_STRING_RESOURCE(FLOW_INPUT_PORT_1, "input_port_1")
+TBB_STRING_RESOURCE(FLOW_INPUT_PORT_2, "input_port_2")
+TBB_STRING_RESOURCE(FLOW_INPUT_PORT_3, "input_port_3")
+TBB_STRING_RESOURCE(FLOW_INPUT_PORT_4, "input_port_4")
+TBB_STRING_RESOURCE(FLOW_INPUT_PORT_5, "input_port_5")
+TBB_STRING_RESOURCE(FLOW_INPUT_PORT_6, "input_port_6")
+TBB_STRING_RESOURCE(FLOW_INPUT_PORT_7, "input_port_7")
+TBB_STRING_RESOURCE(FLOW_INPUT_PORT_8, "input_port_8")
+TBB_STRING_RESOURCE(FLOW_INPUT_PORT_9, "input_port_9")
+TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT, "output_port")
+TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_0, "output_port_0")
+TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_1, "output_port_1")
+TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_2, "output_port_2")
+TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_3, "output_port_3")
+TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_4, "output_port_4")
+TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_5, "output_port_5")
+TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_6, "output_port_6")
+TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_7, "output_port_7")
+TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_8, "output_port_8")
+TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_9, "output_port_9")
+TBB_STRING_RESOURCE(FLOW_OBJECT_NAME, "object_name")
+TBB_STRING_RESOURCE(FLOW_BODY, "body")
+TBB_STRING_RESOURCE(FLOW_GRAPH, "graph")
+TBB_STRING_RESOURCE(FLOW_NODE, "node")
+TBB_STRING_RESOURCE(FLOW_TASKS, "tbb_flow_graph")
+TBB_STRING_RESOURCE(USER_EVENT, "user_event")
+
+#if __TBB_FLOW_TRACE_CODEPTR
+TBB_STRING_RESOURCE(CODE_ADDRESS, "code_address")
+#endif
diff --git a/contrib/libs/tbb/include/oneapi/tbb/detail/_task.h b/contrib/libs/tbb/include/oneapi/tbb/detail/_task.h
new file mode 100644
index 0000000000..7b4f8521c6
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/detail/_task.h
@@ -0,0 +1,243 @@
+/*
+ Copyright (c) 2020-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB__task_H
+#define __TBB__task_H
+
+#include "_config.h"
+#include "_assert.h"
+#include "_template_helpers.h"
+#include "_small_object_pool.h"
+
+#include "../profiling.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <climits>
+#include <utility>
+#include <atomic>
+#include <mutex>
+
+namespace tbb {
+namespace detail {
+
+namespace d1 {
+using slot_id = unsigned short;
+constexpr slot_id no_slot = slot_id(~0);
+constexpr slot_id any_slot = slot_id(~1);
+
+class task;
+class wait_context;
+class task_group_context;
+struct execution_data;
+}
+
+namespace r1 {
+//! Task spawn/wait entry points
+void __TBB_EXPORTED_FUNC spawn(d1::task& t, d1::task_group_context& ctx);
+void __TBB_EXPORTED_FUNC spawn(d1::task& t, d1::task_group_context& ctx, d1::slot_id id);
+void __TBB_EXPORTED_FUNC execute_and_wait(d1::task& t, d1::task_group_context& t_ctx, d1::wait_context&, d1::task_group_context& w_ctx);
+void __TBB_EXPORTED_FUNC wait(d1::wait_context&, d1::task_group_context& ctx);
+d1::slot_id __TBB_EXPORTED_FUNC execution_slot(const d1::execution_data*);
+d1::task_group_context* __TBB_EXPORTED_FUNC current_context();
+
+// Do not place under __TBB_RESUMABLE_TASKS. It is a stub for unsupported platforms.
+struct suspend_point_type;
+using suspend_callback_type = void(*)(void*, suspend_point_type*);
+//! The resumable tasks entry points
+void __TBB_EXPORTED_FUNC suspend(suspend_callback_type suspend_callback, void* user_callback);
+void __TBB_EXPORTED_FUNC resume(suspend_point_type* tag);
+suspend_point_type* __TBB_EXPORTED_FUNC current_suspend_point();
+void __TBB_EXPORTED_FUNC notify_waiters(std::uintptr_t wait_ctx_addr);
+
+class thread_data;
+class task_dispatcher;
+class external_waiter;
+struct task_accessor;
+struct task_arena_impl;
+} // namespace r1
+
+namespace d1 {
+
+class task_arena;
+using suspend_point = r1::suspend_point_type*;
+
+#if __TBB_RESUMABLE_TASKS
+template <typename F>
+static void suspend_callback(void* user_callback, suspend_point sp) {
+ // Copy user function to a new stack after the context switch to avoid a race when the previous
+ // suspend point is resumed while the user_callback is being called.
+ F user_callback_copy = *static_cast<F*>(user_callback);
+ user_callback_copy(sp);
+}
+
+template <typename F>
+void suspend(F f) {
+ r1::suspend(&suspend_callback<F>, &f);
+}
+
+inline void resume(suspend_point tag) {
+ r1::resume(tag);
+}
+#endif /* __TBB_RESUMABLE_TASKS */
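+
+// Illustrative sketch of the resumable-tasks entry points above; async_engine and its
+// on_completion() callback are hypothetical stand-ins for some asynchronous activity:
+//
+//     suspend([&async_engine](suspend_point sp) {
+//         async_engine.on_completion([sp] { resume(sp); }); // resume once the work is done
+//     });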
+
+// TODO: align wait_context on a cache line
+class wait_context {
+ static constexpr std::uint64_t overflow_mask = ~((1LLU << 32) - 1);
+
+ std::uint64_t m_version_and_traits{1};
+ std::atomic<std::uint64_t> m_ref_count{};
+
+ void add_reference(std::int64_t delta) {
+ call_itt_task_notify(releasing, this);
+ std::uint64_t r = m_ref_count.fetch_add(delta) + delta;
+
+ __TBB_ASSERT_EX((r & overflow_mask) == 0, "Overflow is detected");
+
+ if (!r) {
+ // Some external or coroutine waiters may be sleeping in the wait list;
+ // notify them that the work is done
+ std::uintptr_t wait_ctx_addr = std::uintptr_t(this);
+ r1::notify_waiters(wait_ctx_addr);
+ }
+ }
+
+ bool continue_execution() const {
+ std::uint64_t r = m_ref_count.load(std::memory_order_acquire);
+ __TBB_ASSERT_EX((r & overflow_mask) == 0, "Overflow is detected");
+ return r > 0;
+ }
+
+ friend class r1::thread_data;
+ friend class r1::task_dispatcher;
+ friend class r1::external_waiter;
+ friend class task_group;
+ friend class task_group_base;
+ friend struct r1::task_arena_impl;
+ friend struct r1::suspend_point_type;
+public:
+ // Although the internal reference count is uint64_t, the user interface is limited to uint32_t
+ // to reserve a part of the internal reference count for special needs.
+ wait_context(std::uint32_t ref_count) : m_ref_count{ref_count} { suppress_unused_warning(m_version_and_traits); }
+ wait_context(const wait_context&) = delete;
+
+ ~wait_context() {
+ __TBB_ASSERT(!continue_execution(), NULL);
+ }
+
+ void reserve(std::uint32_t delta = 1) {
+ add_reference(delta);
+ }
+
+ void release(std::uint32_t delta = 1) {
+ add_reference(-std::int64_t(delta));
+ }
+#if __TBB_EXTRA_DEBUG
+ unsigned reference_count() const {
+ return unsigned(m_ref_count.load(std::memory_order_acquire));
+ }
+#endif
+};
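+
+// Illustrative usage sketch (a simplification of how the library itself drives a wait_context;
+// my_task and ctx are hypothetical):
+//
+//     wait_context w_ctx(1);   // one outstanding piece of work
+//     spawn(my_task, ctx);     // my_task calls w_ctx.release() when it completes
+//     wait(w_ctx, ctx);        // returns once the reference count drops to zero
+//
+// reserve() registers additional work items before their matching release() calls.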
+
+struct execution_data {
+ task_group_context* context{};
+ slot_id original_slot{};
+ slot_id affinity_slot{};
+};
+
+inline task_group_context* context(const execution_data& ed) {
+ return ed.context;
+}
+
+inline slot_id original_slot(const execution_data& ed) {
+ return ed.original_slot;
+}
+
+inline slot_id affinity_slot(const execution_data& ed) {
+ return ed.affinity_slot;
+}
+
+inline slot_id execution_slot(const execution_data& ed) {
+ return r1::execution_slot(&ed);
+}
+
+inline bool is_same_affinity(const execution_data& ed) {
+ return affinity_slot(ed) == no_slot || affinity_slot(ed) == execution_slot(ed);
+}
+
+inline bool is_stolen(const execution_data& ed) {
+ return original_slot(ed) != execution_slot(ed);
+}
+
+inline void spawn(task& t, task_group_context& ctx) {
+ call_itt_task_notify(releasing, &t);
+ r1::spawn(t, ctx);
+}
+
+inline void spawn(task& t, task_group_context& ctx, slot_id id) {
+ call_itt_task_notify(releasing, &t);
+ r1::spawn(t, ctx, id);
+}
+
+inline void execute_and_wait(task& t, task_group_context& t_ctx, wait_context& wait_ctx, task_group_context& w_ctx) {
+ r1::execute_and_wait(t, t_ctx, wait_ctx, w_ctx);
+ call_itt_task_notify(acquired, &wait_ctx);
+ call_itt_task_notify(destroy, &wait_ctx);
+}
+
+inline void wait(wait_context& wait_ctx, task_group_context& ctx) {
+ r1::wait(wait_ctx, ctx);
+ call_itt_task_notify(acquired, &wait_ctx);
+ call_itt_task_notify(destroy, &wait_ctx);
+}
+
+using r1::current_context;
+
+class task_traits {
+ std::uint64_t m_version_and_traits{};
+ friend struct r1::task_accessor;
+};
+
+//! Alignment for a task object
+static constexpr std::size_t task_alignment = 64;
+
+//! Base class for user-defined tasks.
+/** @ingroup task_scheduling */
+
+class
+#if __TBB_ALIGNAS_AVAILABLE
+alignas(task_alignment)
+#endif
+task : public task_traits {
+protected:
+ virtual ~task() = default;
+
+public:
+ virtual task* execute(execution_data&) = 0;
+ virtual task* cancel(execution_data&) = 0;
+
+private:
+ std::uint64_t m_reserved[6]{};
+ friend struct r1::task_accessor;
+};
+#if __TBB_ALIGNAS_AVAILABLE
+static_assert(sizeof(task) == task_alignment, "task size is broken");
+#endif
+} // namespace d1
+} // namespace detail
+} // namespace tbb
+
+#endif /* __TBB__task_H */
diff --git a/contrib/libs/tbb/include/oneapi/tbb/detail/_template_helpers.h b/contrib/libs/tbb/include/oneapi/tbb/detail/_template_helpers.h
new file mode 100644
index 0000000000..45a8ffede6
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/detail/_template_helpers.h
@@ -0,0 +1,394 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_detail__template_helpers_H
+#define __TBB_detail__template_helpers_H
+
+#include "_utils.h"
+#include "_config.h"
+
+#include <cstddef>
+#include <cstdint>
+
+#include <type_traits>
+#include <memory>
+#include <iterator>
+
+namespace tbb {
+namespace detail {
+inline namespace d0 {
+
+// An internal implementation of void_t, which can be used in SFINAE contexts
+template <typename...>
+struct void_impl {
+ using type = void;
+}; // struct void_impl
+
+template <typename... Args>
+using void_t = typename void_impl<Args...>::type;
+
+// Generic SFINAE helper for expression checks, based on the idea demonstrated in ISO C++ paper n4502
+template <typename T, typename, template <typename> class... Checks>
+struct supports_impl {
+ using type = std::false_type;
+};
+
+template <typename T, template <typename> class... Checks>
+struct supports_impl<T, void_t<Checks<T>...>, Checks...> {
+ using type = std::true_type;
+};
+
+template <typename T, template <typename> class... Checks>
+using supports = typename supports_impl<T, void, Checks...>::type;
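+
+// Example (illustrative): detecting a member function with `supports`; the check template
+// has_size below is hypothetical and is not used by the library itself.
+//
+//     template <typename T> using has_size = decltype(std::declval<const T&>().size());
+//     static_assert(supports<std::vector<int>, has_size>::value, "std::vector has size()");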
+
+//! A template to select either 32-bit or 64-bit constant as compile time, depending on machine word size.
+template <unsigned u, unsigned long long ull >
+struct select_size_t_constant {
+ // Explicit cast is needed to avoid compiler warnings about possible truncation.
+ // The value of the right size, which is selected by ?:, is anyway not truncated or promoted.
+ static const std::size_t value = (std::size_t)((sizeof(std::size_t)==sizeof(u)) ? u : ull);
+};
+
+// TODO: do we really need it?
+//! Cast between unrelated pointer types.
+/** This method should be used sparingly as a last resort for dealing with
+ situations that inherently break strict ISO C++ aliasing rules. */
+// T is a pointer type because it will be explicitly provided by the programmer as a template argument;
+// U is a referent type to enable the compiler to check that "ptr" is a pointer, deducing U in the process.
+template<typename T, typename U>
+inline T punned_cast( U* ptr ) {
+ std::uintptr_t x = reinterpret_cast<std::uintptr_t>(ptr);
+ return reinterpret_cast<T>(x);
+}
+
+template<class T, size_t S, size_t R>
+struct padded_base : T {
+ char pad[S - R];
+};
+template<class T, size_t S> struct padded_base<T, S, 0> : T {};
+
+//! Pads type T to fill out to a multiple of cache line size.
+template<class T, size_t S = max_nfs_size>
+struct padded : padded_base<T, S, sizeof(T) % S> {};
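+
+// Example (illustrative): each element of the array below occupies a whole multiple of
+// max_nfs_size bytes, which avoids false sharing between neighbouring counters
+// (my_counter is a hypothetical type).
+//
+//     struct my_counter { std::atomic<long> value; };
+//     static padded<my_counter> counters[8];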
+
+#if __TBB_CPP14_INTEGER_SEQUENCE_PRESENT
+
+using std::index_sequence;
+using std::make_index_sequence;
+
+#else
+
+template<std::size_t... S> class index_sequence {};
+
+template<std::size_t N, std::size_t... S>
+struct make_index_sequence_impl : make_index_sequence_impl < N - 1, N - 1, S... > {};
+
+template<std::size_t... S>
+struct make_index_sequence_impl <0, S...> {
+ using type = index_sequence<S...>;
+};
+
+template<std::size_t N>
+using make_index_sequence = typename make_index_sequence_impl<N>::type;
+
+#endif /* __TBB_CPP14_INTEGER_SEQUENCE_PRESENT */
+
+#if __TBB_CPP17_LOGICAL_OPERATIONS_PRESENT
+using std::conjunction;
+using std::disjunction;
+#else // __TBB_CPP17_LOGICAL_OPERATIONS_PRESENT
+
+template <typename...>
+struct conjunction : std::true_type {};
+
+template <typename First, typename... Args>
+struct conjunction<First, Args...>
+ : std::conditional<bool(First::value), conjunction<Args...>, First>::type {};
+
+template <typename T>
+struct conjunction<T> : T {};
+
+template <typename...>
+struct disjunction : std::false_type {};
+
+template <typename First, typename... Args>
+struct disjunction<First, Args...>
+ : std::conditional<bool(First::value), First, disjunction<Args...>>::type {};
+
+template <typename T>
+struct disjunction<T> : T {};
+
+#endif // __TBB_CPP17_LOGICAL_OPERATIONS_PRESENT
+
+template <typename Iterator>
+using iterator_value_t = typename std::iterator_traits<Iterator>::value_type;
+
+template <typename Iterator>
+using iterator_key_t = typename std::remove_const<typename iterator_value_t<Iterator>::first_type>::type;
+
+template <typename Iterator>
+using iterator_mapped_t = typename iterator_value_t<Iterator>::second_type;
+
+template <typename Iterator>
+using iterator_alloc_pair_t = std::pair<typename std::add_const<iterator_key_t<Iterator>>::type,
+ iterator_mapped_t<Iterator>>;
+
+template <typename A> using alloc_value_type = typename A::value_type;
+template <typename A> using alloc_ptr_t = typename std::allocator_traits<A>::pointer;
+template <typename A> using has_allocate = decltype(std::declval<alloc_ptr_t<A>&>() = std::declval<A>().allocate(0));
+template <typename A> using has_deallocate = decltype(std::declval<A>().deallocate(std::declval<alloc_ptr_t<A>>(), 0));
+
+// alloc_value_type should be checked first, because it can be used in other checks
+template <typename T>
+using is_allocator = supports<T, alloc_value_type, has_allocate, has_deallocate>;
+
+#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
+template <typename T>
+inline constexpr bool is_allocator_v = is_allocator<T>::value;
+#endif
+
+// Template class in which the "type" determines the type of the element number N in pack Args
+template <std::size_t N, typename... Args>
+struct pack_element {
+ using type = void;
+};
+
+template <std::size_t N, typename T, typename... Args>
+struct pack_element<N, T, Args...> {
+ using type = typename pack_element<N-1, Args...>::type;
+};
+
+template <typename T, typename... Args>
+struct pack_element<0, T, Args...> {
+ using type = T;
+};
+
+template <std::size_t N, typename... Args>
+using pack_element_t = typename pack_element<N, Args...>::type;
+
+template <typename Func>
+class raii_guard {
+public:
+ raii_guard( Func f ) : my_func(f), is_active(true) {}
+
+ ~raii_guard() {
+ if (is_active) {
+ my_func();
+ }
+ }
+
+ void dismiss() {
+ is_active = false;
+ }
+private:
+ Func my_func;
+ bool is_active;
+}; // class raii_guard
+
+template <typename Func>
+raii_guard<Func> make_raii_guard( Func f ) {
+ return raii_guard<Func>(f);
+}
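+
+// Illustrative usage sketch: undo a partially completed operation unless the happy path
+// is reached (initialize_resource() and rollback_resource() are hypothetical).
+//
+//     auto guard = make_raii_guard([&] { rollback_resource(); });
+//     initialize_resource(); // may throw; the guard rolls back on stack unwinding
+//     guard.dismiss();       // success: the rollback is not executed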
+
+template <typename Body>
+struct try_call_proxy {
+ try_call_proxy( Body b ) : body(b) {}
+
+ template <typename OnExceptionBody>
+ void on_exception( OnExceptionBody on_exception_body ) {
+ auto guard = make_raii_guard(on_exception_body);
+ body();
+ guard.dismiss();
+ }
+
+ template <typename OnCompletionBody>
+ void on_completion(OnCompletionBody on_completion_body) {
+ auto guard = make_raii_guard(on_completion_body);
+ body();
+ }
+
+ Body body;
+}; // struct try_call_proxy
+
+// Template helper function for API
+// try_call(lambda1).on_exception(lambda2)
+// Executes lambda1 and if it throws an exception - executes lambda2
+template <typename Body>
+try_call_proxy<Body> try_call( Body b ) {
+ return try_call_proxy<Body>(b);
+}
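+
+// Illustrative usage of the API described above; acquire_resource() and release_resource()
+// are hypothetical.
+//
+//     try_call( [&] {
+//         acquire_resource(); // may throw
+//     } ).on_exception( [&] {
+//         release_resource(); // runs only if the first lambda threw
+//     });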
+
+#if __TBB_CPP17_IS_SWAPPABLE_PRESENT
+using std::is_nothrow_swappable;
+using std::is_swappable;
+#else // __TBB_CPP17_IS_SWAPPABLE_PRESENT
+namespace is_swappable_detail {
+using std::swap;
+
+template <typename T>
+using has_swap = decltype(swap(std::declval<T&>(), std::declval<T&>()));
+
+#if _MSC_VER && _MSC_VER <= 1900 && !__INTEL_COMPILER
+// Workaround for VS2015: it fails to instantiate noexcept(...) inside std::integral_constant.
+template <typename T>
+struct noexcept_wrapper {
+ static const bool value = noexcept(swap(std::declval<T&>(), std::declval<T&>()));
+};
+template <typename T>
+struct is_nothrow_swappable_impl : std::integral_constant<bool, noexcept_wrapper<T>::value> {};
+#else
+template <typename T>
+struct is_nothrow_swappable_impl : std::integral_constant<bool, noexcept(swap(std::declval<T&>(), std::declval<T&>()))> {};
+#endif
+}
+
+template <typename T>
+struct is_swappable : supports<T, is_swappable_detail::has_swap> {};
+
+template <typename T>
+struct is_nothrow_swappable
+ : conjunction<is_swappable<T>, is_swappable_detail::is_nothrow_swappable_impl<T>> {};
+#endif // __TBB_CPP17_IS_SWAPPABLE_PRESENT
+
+//! Allows storing a function parameter pack in a variable and passing it later to another function
+template< typename... Types >
+struct stored_pack;
+
+template<>
+struct stored_pack<>
+{
+ using pack_type = stored_pack<>;
+ stored_pack() {}
+
+ // Friend front-end functions
+ template< typename F, typename Pack > friend void call(F&& f, Pack&& p);
+ template< typename Ret, typename F, typename Pack > friend Ret call_and_return(F&& f, Pack&& p);
+
+protected:
+ // Ideally, ref-qualified non-static methods would be used,
+ // but that would greatly reduce the set of compilers where it works.
+ template< typename Ret, typename F, typename... Preceding >
+ static Ret call(F&& f, const pack_type& /*pack*/, Preceding&&... params) {
+ return std::forward<F>(f)(std::forward<Preceding>(params)...);
+ }
+ template< typename Ret, typename F, typename... Preceding >
+ static Ret call(F&& f, pack_type&& /*pack*/, Preceding&&... params) {
+ return std::forward<F>(f)(std::forward<Preceding>(params)...);
+ }
+};
+
+template< typename T, typename... Types >
+struct stored_pack<T, Types...> : stored_pack<Types...>
+{
+ using pack_type = stored_pack<T, Types...>;
+ using pack_remainder = stored_pack<Types...>;
+
+ // Since lifetime of original values is out of control, copies should be made.
+ // Thus references should be stripped away from the deduced type.
+ typename std::decay<T>::type leftmost_value;
+
+ // Here rvalue references act in the same way as forwarding references,
+ // as long as class template parameters were deduced via forwarding references.
+ stored_pack(T&& t, Types&&... types)
+ : pack_remainder(std::forward<Types>(types)...), leftmost_value(std::forward<T>(t)) {}
+
+ // Friend front-end functions
+ template< typename F, typename Pack > friend void call(F&& f, Pack&& p);
+ template< typename Ret, typename F, typename Pack > friend Ret call_and_return(F&& f, Pack&& p);
+
+protected:
+ template< typename Ret, typename F, typename... Preceding >
+ static Ret call(F&& f, pack_type& pack, Preceding&&... params) {
+ return pack_remainder::template call<Ret>(
+ std::forward<F>(f), static_cast<pack_remainder&>(pack),
+ std::forward<Preceding>(params)... , pack.leftmost_value
+ );
+ }
+
+ template< typename Ret, typename F, typename... Preceding >
+ static Ret call(F&& f, pack_type&& pack, Preceding&&... params) {
+ return pack_remainder::template call<Ret>(
+ std::forward<F>(f), static_cast<pack_remainder&&>(pack),
+ std::forward<Preceding>(params)... , std::move(pack.leftmost_value)
+ );
+ }
+};
+
+//! Calls the given function with arguments taken from a stored_pack
+template< typename F, typename Pack >
+void call(F&& f, Pack&& p) {
+ std::decay<Pack>::type::template call<void>(std::forward<F>(f), std::forward<Pack>(p));
+}
+
+template< typename Ret, typename F, typename Pack >
+Ret call_and_return(F&& f, Pack&& p) {
+ return std::decay<Pack>::type::template call<Ret>(std::forward<F>(f), std::forward<Pack>(p));
+}
+
+template< typename... Types >
+stored_pack<Types...> save_pack(Types&&... types) {
+ return stored_pack<Types...>(std::forward<Types>(types)...);
+}
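+
+// Illustrative usage sketch: capture arguments now, forward them to a callable later.
+//
+//     auto pack = save_pack(42, std::string("payload"));
+//     call([](int id, const std::string& s) { /* use id and s */ }, std::move(pack));
+//
+// The callable receives the arguments in their original order.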
+
+// A structure with the value which is equal to Trait::value
+// but can be used in the immediate context due to parameter T
+template <typename Trait, typename T>
+struct dependent_bool : std::integral_constant<bool, bool(Trait::value)> {};
+
+template <typename Callable>
+struct body_arg_detector;
+
+template <typename Callable, typename ReturnType, typename Arg>
+struct body_arg_detector<ReturnType(Callable::*)(Arg)> {
+ using arg_type = Arg;
+};
+
+template <typename Callable, typename ReturnType, typename Arg>
+struct body_arg_detector<ReturnType(Callable::*)(Arg) const> {
+ using arg_type = Arg;
+};
+
+template <typename Callable>
+struct argument_detector;
+
+template <typename Callable>
+struct argument_detector {
+ using type = typename body_arg_detector<decltype(&Callable::operator())>::arg_type;
+};
+
+template <typename ReturnType, typename Arg>
+struct argument_detector<ReturnType(*)(Arg)> {
+ using type = Arg;
+};
+
+// Detects the argument type of a callable; works only for callables with exactly one argument.
+template <typename Callable>
+using argument_type_of = typename argument_detector<typename std::decay<Callable>::type>::type;
+
+template <typename T>
+struct type_identity {
+ using type = T;
+};
+
+template <typename T>
+using type_identity_t = typename type_identity<T>::type;
+
+} // inline namespace d0
+} // namespace detail
+} // namespace tbb
+
+#endif // __TBB_detail__template_helpers_H
+
diff --git a/contrib/libs/tbb/include/oneapi/tbb/detail/_utils.h b/contrib/libs/tbb/include/oneapi/tbb/detail/_utils.h
new file mode 100644
index 0000000000..d1e02179f8
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/detail/_utils.h
@@ -0,0 +1,329 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_detail__utils_H
+#define __TBB_detail__utils_H
+
+#include <type_traits>
+#include <cstdint>
+#include <atomic>
+
+#include "_config.h"
+#include "_assert.h"
+#include "_machine.h"
+
+namespace tbb {
+namespace detail {
+inline namespace d0 {
+
+//! Utility template function to prevent "unused" warnings by various compilers.
+template<typename... T> void suppress_unused_warning(T&&...) {}
+
+//! Compile-time constant that is upper bound on cache line/sector size.
+/** It should be used only in situations where having a compile-time upper
+ bound is more useful than a run-time exact answer.
+ @ingroup memory_allocation */
+constexpr size_t max_nfs_size = 128;
+
+//! Class that implements exponential backoff.
+class atomic_backoff {
+ //! Time delay, in units of "pause" instructions.
+ /** Should be approximately equal to the number of "pause" instructions
+ that take the same time as a context switch. Must be a power of two.*/
+ static constexpr std::int32_t LOOPS_BEFORE_YIELD = 16;
+ std::int32_t count;
+
+public:
+ // In many cases, an object of this type is initialized eagerly on hot path,
+ // as in for(atomic_backoff b; ; b.pause()) { /*loop body*/ }
+ // For this reason, the construction cost must be very small!
+ atomic_backoff() : count(1) {}
+ // This constructor pauses immediately; do not use on hot paths!
+ atomic_backoff(bool) : count(1) { pause(); }
+
+ //! No Copy
+ atomic_backoff(const atomic_backoff&) = delete;
+ atomic_backoff& operator=(const atomic_backoff&) = delete;
+
+ //! Pause for a while.
+ void pause() {
+ if (count <= LOOPS_BEFORE_YIELD) {
+ machine_pause(count);
+ // Pause twice as long the next time.
+ count *= 2;
+ } else {
+ // Pause is so long that we might as well yield CPU to scheduler.
+ yield();
+ }
+ }
+
+ //! Pause for a few times and return false if saturated.
+ bool bounded_pause() {
+ machine_pause(count);
+ if (count < LOOPS_BEFORE_YIELD) {
+ // Pause twice as long the next time.
+ count *= 2;
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ void reset() {
+ count = 1;
+ }
+};
+
+//! Spin WHILE the condition is true.
+/** C should be a predicate callable with a value of type T. */
+template <typename T, typename C>
+void spin_wait_while_condition(const std::atomic<T>& location, C comp) {
+ atomic_backoff backoff;
+ while (comp(location.load(std::memory_order_acquire))) {
+ backoff.pause();
+ }
+}
+
+//! Spin WHILE the value of the variable is equal to a given value
+/** T and U should be comparable types. */
+template <typename T, typename U>
+void spin_wait_while_eq(const std::atomic<T>& location, const U value) {
+ spin_wait_while_condition(location, [&value](T t) { return t == value; });
+}
+
+//! Spin UNTIL the value of the variable is equal to a given value
+/** T and U should be comparable types. */
+template<typename T, typename U>
+void spin_wait_until_eq(const std::atomic<T>& location, const U value) {
+ spin_wait_while_condition(location, [&value](T t) { return t != value; });
+}
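+
+// Illustrative usage sketch: wait until a producer publishes a ready flag
+// (my_ready_flag is a hypothetical std::atomic<int> that the producer sets to 1).
+//
+//     spin_wait_until_eq(my_ready_flag, 1);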
+
+template <typename T>
+std::uintptr_t log2(T in) {
+ __TBB_ASSERT(in > 0, "The logarithm of a non-positive value is undefined.");
+ return machine_log2(in);
+}
+
+template<typename T>
+T reverse_bits(T src) {
+ return machine_reverse_bits(src);
+}
+
+template<typename T>
+T reverse_n_bits(T src, std::size_t n) {
+ __TBB_ASSERT(n != 0, "Reverse for 0 bits is undefined behavior.");
+ return reverse_bits(src) >> (number_of_bits<T>() - n);
+}
+
+// A function to check whether the passed integer is a power of two
+template <typename IntegerType>
+constexpr bool is_power_of_two( IntegerType arg ) {
+ static_assert(std::is_integral<IntegerType>::value,
+ "An argument for is_power_of_two should be integral type");
+ return arg && (0 == (arg & (arg - 1)));
+}
+
+// A function to determine whether the passed integer is a power of two
+// at least as big as another power of two, i.e. for strictly positive i and j,
+// with j being a power of two, it determines whether i==j<<k for some nonnegative k
+template <typename ArgIntegerType, typename DivisorIntegerType>
+constexpr bool is_power_of_two_at_least(ArgIntegerType arg, DivisorIntegerType divisor) {
+ // Divisor should be a power of two
+ static_assert(std::is_integral<ArgIntegerType>::value,
+ "An argument for is_power_of_two_at_least should be integral type");
+ return 0 == (arg & (arg - divisor));
+}
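+
+// Worked example for the check above, with divisor == 4:
+//     is_power_of_two_at_least(8, 4)  -> 8 & (8 - 4)   == 8 & 4  == 0 -> true
+//     is_power_of_two_at_least(12, 4) -> 12 & (12 - 4) == 12 & 8 == 8 -> false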
+
+// A function to compute arg modulo divisor where divisor is a power of 2.
+template<typename ArgIntegerType, typename DivisorIntegerType>
+inline ArgIntegerType modulo_power_of_two(ArgIntegerType arg, DivisorIntegerType divisor) {
+ __TBB_ASSERT( is_power_of_two(divisor), "Divisor should be a power of two" );
+ return arg & (divisor - 1);
+}
+
+//! A function to check if passed in pointer is aligned on a specific border
+template<typename T>
+constexpr bool is_aligned(T* pointer, std::uintptr_t alignment) {
+ return 0 == ((std::uintptr_t)pointer & (alignment - 1));
+}
+
+#if TBB_USE_ASSERT
+static void* const poisoned_ptr = reinterpret_cast<void*>(-1);
+
+//! Set p to invalid pointer value.
+template<typename T>
+inline void poison_pointer( T* &p ) { p = reinterpret_cast<T*>(poisoned_ptr); }
+
+template<typename T>
+inline void poison_pointer(std::atomic<T*>& p) { p.store(reinterpret_cast<T*>(poisoned_ptr), std::memory_order_relaxed); }
+
+/** Expected to be used in assertions only, thus no empty form is defined. **/
+template<typename T>
+inline bool is_poisoned( T* p ) { return p == reinterpret_cast<T*>(poisoned_ptr); }
+
+template<typename T>
+inline bool is_poisoned(const std::atomic<T*>& p) { return is_poisoned(p.load(std::memory_order_relaxed)); }
+#else
+template<typename T>
+inline void poison_pointer(T* &) {/*do nothing*/}
+
+template<typename T>
+inline void poison_pointer(std::atomic<T*>&) { /* do nothing */}
+#endif /* !TBB_USE_ASSERT */
+
+template <std::size_t alignment = 0, typename T>
+bool assert_pointer_valid(T* p, const char* comment = nullptr) {
+ suppress_unused_warning(p, comment);
+ __TBB_ASSERT(p != nullptr, comment);
+ __TBB_ASSERT(!is_poisoned(p), comment);
+#if !(_MSC_VER && _MSC_VER <= 1900 && !__INTEL_COMPILER)
+ __TBB_ASSERT(is_aligned(p, alignment == 0 ? alignof(T) : alignment), comment);
+#endif
+ // Returns something to simplify assert_pointers_valid implementation.
+ return true;
+}
+
+template <typename... Args>
+void assert_pointers_valid(Args*... p) {
+ // suppress_unused_warning is used as an evaluation context for the variadic pack.
+ suppress_unused_warning(assert_pointer_valid(p)...);
+}
+
+//! Base class for types that should not be assigned.
+class no_assign {
+public:
+ void operator=(const no_assign&) = delete;
+ no_assign(const no_assign&) = default;
+ no_assign() = default;
+};
+
+//! Base class for types that should not be copied or assigned.
+class no_copy: no_assign {
+public:
+ no_copy(const no_copy&) = delete;
+ no_copy() = default;
+};
+
+template <typename T>
+void swap_atomics_relaxed(std::atomic<T>& lhs, std::atomic<T>& rhs){
+ T tmp = lhs.load(std::memory_order_relaxed);
+ lhs.store(rhs.load(std::memory_order_relaxed), std::memory_order_relaxed);
+ rhs.store(tmp, std::memory_order_relaxed);
+}
+
+//! One-time initialization states
+enum class do_once_state {
+ uninitialized = 0, ///< No execution attempts have been undertaken yet
+ pending, ///< A thread is executing associated do-once routine
+ executed, ///< Do-once routine has been executed
+ initialized = executed ///< Convenience alias
+};
+
+//! One-time initialization function
+/** \param initializer Pointer to a function without arguments.
+ The variant that returns bool is used for cases when initialization can fail
+ and it is OK to continue execution, but the state should be reset so that
+ the initialization attempt is repeated the next time.
+ \param state Shared state associated with the initializer that specifies its
+ initialization state. Must be initially set to the #uninitialized value
+ (e.g. by means of default static zero initialization). **/
+template <typename F>
+void atomic_do_once( const F& initializer, std::atomic<do_once_state>& state ) {
+ // The loop in the implementation is necessary to avoid a race when a thread T2,
+ // arriving in the middle of an initialization attempt by another thread T1,
+ // has just made initialization possible.
+ // In such a case T2 has to rely on T1 to initialize, but T1 may already be past
+ // the point where it can recognize the changed conditions.
+ do_once_state expected_state;
+ while ( state.load( std::memory_order_acquire ) != do_once_state::executed ) {
+ if( state.load( std::memory_order_relaxed ) == do_once_state::uninitialized ) {
+ expected_state = do_once_state::uninitialized;
+#if defined(__INTEL_COMPILER) && __INTEL_COMPILER <= 1910
+ using enum_type = typename std::underlying_type<do_once_state>::type;
+ if( ((std::atomic<enum_type>&)state).compare_exchange_strong( (enum_type&)expected_state, (enum_type)do_once_state::pending ) ) {
+#else
+ if( state.compare_exchange_strong( expected_state, do_once_state::pending ) ) {
+#endif
+ run_initializer( initializer, state );
+ break;
+ }
+ }
+ spin_wait_while_eq( state, do_once_state::pending );
+ }
+}
+
+// Run the initializer, which cannot fail
+template<typename Functor>
+void run_initializer(const Functor& f, std::atomic<do_once_state>& state ) {
+ f();
+ state.store(do_once_state::executed, std::memory_order_release);
+}
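+
+// Usage sketch (hypothetical names): the state is zero-initialized, which corresponds to
+// do_once_state::uninitialized, and the lambda is executed exactly once across all threads.
+//
+//     static std::atomic<do_once_state> resource_state; // zero-initialized == uninitialized
+//     void ensure_resource() {
+//         atomic_do_once( [] { /* one-time setup */ }, resource_state );
+//     }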
+
+#if __TBB_CPP20_CONCEPTS_PRESENT
+template <typename T>
+concept boolean_testable_impl = std::convertible_to<T, bool>;
+
+template <typename T>
+concept boolean_testable = boolean_testable_impl<T> && requires( T&& t ) {
+ { !std::forward<T>(t) } -> boolean_testable_impl;
+ };
+
+#if __TBB_CPP20_COMPARISONS_PRESENT
+struct synthesized_three_way_comparator {
+ template <typename T1, typename T2>
+ auto operator()( const T1& lhs, const T2& rhs ) const
+ requires requires {
+ { lhs < rhs } -> boolean_testable;
+ { rhs < lhs } -> boolean_testable;
+ }
+ {
+ if constexpr (std::three_way_comparable_with<T1, T2>) {
+ return lhs <=> rhs;
+ } else {
+ if (lhs < rhs) {
+ return std::weak_ordering::less;
+ }
+ if (rhs < lhs) {
+ return std::weak_ordering::greater;
+ }
+ return std::weak_ordering::equivalent;
+ }
+ }
+}; // struct synthesized_three_way_comparator
+
+template <typename T1, typename T2 = T1>
+using synthesized_three_way_result = decltype(synthesized_three_way_comparator{}(std::declval<T1&>(),
+ std::declval<T2&>()));
+
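+// Illustrative sketch (hypothetical type): for a type that only defines operator<, the comparator
+// falls back to two '<' comparisons and synthesizes a std::weak_ordering.
+//
+//     struct only_less { int v; friend bool operator<(only_less a, only_less b) { return a.v < b.v; } };
+//     auto ord = synthesized_three_way_comparator{}(only_less{1}, only_less{2}); // std::weak_ordering::less
+//     static_assert(std::is_same_v<synthesized_three_way_result<only_less>, std::weak_ordering>);
+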
+#endif // __TBB_CPP20_COMPARISONS_PRESENT
+#endif // __TBB_CPP20_CONCEPTS_PRESENT
+
+} // namespace d0
+
+namespace d1 {
+
+class delegate_base {
+public:
+ virtual bool operator()() const = 0;
+ virtual ~delegate_base() {}
+}; // class delegate_base
+
+} // namespace d1
+
+} // namespace detail
+} // namespace tbb
+
+#endif // __TBB_detail__utils_H
diff --git a/contrib/libs/tbb/include/oneapi/tbb/enumerable_thread_specific.h b/contrib/libs/tbb/include/oneapi/tbb/enumerable_thread_specific.h
new file mode 100644
index 0000000000..246447a213
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/enumerable_thread_specific.h
@@ -0,0 +1,1113 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_enumerable_thread_specific_H
+#define __TBB_enumerable_thread_specific_H
+
+#include "detail/_config.h"
+#include "detail/_namespace_injection.h"
+#include "detail/_assert.h"
+#include "detail/_template_helpers.h"
+#include "detail/_aligned_space.h"
+
+#include "concurrent_vector.h"
+#include "tbb_allocator.h"
+#include "cache_aligned_allocator.h"
+#include "profiling.h"
+
+#include <atomic>
+#include <thread>
+#include <cstring> // memcpy
+#include <cstddef> // std::ptrdiff_t
+
+#include "task.h" // for task::suspend_point
+
+#if _WIN32 || _WIN64
+#include <windows.h>
+#else
+#include <pthread.h>
+#endif
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+//! enum for selecting between single key and key-per-instance versions
+enum ets_key_usage_type {
+ ets_key_per_instance
+ , ets_no_key
+#if __TBB_RESUMABLE_TASKS
+ , ets_suspend_aware
+#endif
+};
+
+// Forward declaration to use in internal classes
+template <typename T, typename Allocator, ets_key_usage_type ETS_key_type>
+class enumerable_thread_specific;
+
+template <std::size_t ThreadIDSize>
+struct internal_ets_key_selector {
+ using key_type = std::thread::id;
+ static key_type current_key() {
+ return std::this_thread::get_id();
+ }
+};
+
+// The Intel Compiler on OS X cannot create atomic objects instantiated from non-fundamental types
+#if __INTEL_COMPILER && __APPLE__
+template<>
+struct internal_ets_key_selector<sizeof(std::size_t)> {
+ using key_type = std::size_t;
+ static key_type current_key() {
+ auto id = std::this_thread::get_id();
+ return reinterpret_cast<key_type&>(id);
+ }
+};
+#endif
+
+template <ets_key_usage_type ETS_key_type>
+struct ets_key_selector : internal_ets_key_selector<sizeof(std::thread::id)> {};
+
+#if __TBB_RESUMABLE_TASKS
+template <>
+struct ets_key_selector<ets_suspend_aware> {
+ using key_type = suspend_point;
+ static key_type current_key() {
+ return r1::current_suspend_point();
+ }
+};
+#endif
+
+template<ets_key_usage_type ETS_key_type>
+class ets_base : detail::no_copy {
+protected:
+ using key_type = typename ets_key_selector<ETS_key_type>::key_type;
+
+public:
+ struct slot;
+ struct array {
+ array* next;
+ std::size_t lg_size;
+ slot& at( std::size_t k ) {
+ return (reinterpret_cast<slot*>(reinterpret_cast<void*>(this+1)))[k];
+ }
+ std::size_t size() const { return std::size_t(1) << lg_size; }
+ std::size_t mask() const { return size() - 1; }
+ std::size_t start( std::size_t h ) const {
+ return h >> (8 * sizeof(std::size_t) - lg_size);
+ }
+ };
+ struct slot {
+ std::atomic<key_type> key;
+ void* ptr;
+ bool empty() const { return key.load(std::memory_order_relaxed) == key_type(); }
+ bool match( key_type k ) const { return key.load(std::memory_order_relaxed) == k; }
+ bool claim( key_type k ) {
+ // TODO: maybe claim ptr, because key_type is not guaranteed to fit into word size
+ key_type expected = key_type();
+ return key.compare_exchange_strong(expected, k);
+ }
+ };
+
+protected:
+ //! Root of linked list of arrays of decreasing size.
+ /** nullptr if and only if my_count==0.
+ Each array in the list is half the size of its predecessor. */
+ std::atomic<array*> my_root;
+ std::atomic<std::size_t> my_count;
+
+ virtual void* create_local() = 0;
+ virtual void* create_array(std::size_t _size) = 0; // _size in bytes
+ virtual void free_array(void* ptr, std::size_t _size) = 0; // _size in bytes
+
+ array* allocate( std::size_t lg_size ) {
+ std::size_t n = std::size_t(1) << lg_size;
+ array* a = static_cast<array*>(create_array(sizeof(array) + n * sizeof(slot)));
+ a->lg_size = lg_size;
+ std::memset( a + 1, 0, n * sizeof(slot) );
+ return a;
+ }
+ void free(array* a) {
+ std::size_t n = std::size_t(1) << (a->lg_size);
+ free_array( static_cast<void*>(a), std::size_t(sizeof(array) + n * sizeof(slot)) );
+ }
+
+ ets_base() : my_root{nullptr}, my_count{0} {}
+ virtual ~ets_base(); // g++ complains if this is not virtual
+
+ void* table_lookup( bool& exists );
+ void table_clear();
+ // The following functions are not used in a concurrent context,
+ // so they need neither synchronization nor ITT annotations.
+ template <ets_key_usage_type E2>
+ void table_elementwise_copy( const ets_base& other,
+ void*(*add_element)(ets_base<E2>&, void*) ) {
+ __TBB_ASSERT(!my_root.load(std::memory_order_relaxed),NULL);
+ __TBB_ASSERT(!my_count.load(std::memory_order_relaxed),NULL);
+ if( !other.my_root.load(std::memory_order_relaxed) ) return;
+ array* root = allocate(other.my_root.load(std::memory_order_relaxed)->lg_size);
+ my_root.store(root, std::memory_order_relaxed);
+ root->next = nullptr;
+ my_count.store(other.my_count.load(std::memory_order_relaxed), std::memory_order_relaxed);
+ std::size_t mask = root->mask();
+ for( array* r = other.my_root.load(std::memory_order_relaxed); r; r = r->next ) {
+ for( std::size_t i = 0; i < r->size(); ++i ) {
+ slot& s1 = r->at(i);
+ if( !s1.empty() ) {
+ for( std::size_t j = root->start(std::hash<key_type>{}(s1.key.load(std::memory_order_relaxed))); ; j = (j+1)&mask ) {
+ slot& s2 = root->at(j);
+ if( s2.empty() ) {
+ s2.ptr = add_element(static_cast<ets_base<E2>&>(*this), s1.ptr);
+ s2.key.store(s1.key.load(std::memory_order_relaxed), std::memory_order_relaxed);
+ break;
+ }
+ else if( s2.match(s1.key.load(std::memory_order_relaxed)) )
+ break;
+ }
+ }
+ }
+ }
+ }
+ void table_swap( ets_base& other ) {
+ __TBB_ASSERT(this!=&other, "Don't swap an instance with itself");
+ swap_atomics_relaxed(my_root, other.my_root);
+ swap_atomics_relaxed(my_count, other.my_count);
+ }
+};
+
+template<ets_key_usage_type ETS_key_type>
+ets_base<ETS_key_type>::~ets_base() {
+ __TBB_ASSERT(!my_root.load(std::memory_order_relaxed), nullptr);
+}
+
+template<ets_key_usage_type ETS_key_type>
+void ets_base<ETS_key_type>::table_clear() {
+ while ( array* r = my_root.load(std::memory_order_relaxed) ) {
+ my_root.store(r->next, std::memory_order_relaxed);
+ free(r);
+ }
+ my_count.store(0, std::memory_order_relaxed);
+}
+
+template<ets_key_usage_type ETS_key_type>
+void* ets_base<ETS_key_type>::table_lookup( bool& exists ) {
+ const key_type k = ets_key_selector<ETS_key_type>::current_key();
+
+ __TBB_ASSERT(k != key_type(),NULL);
+ void* found;
+ std::size_t h = std::hash<key_type>{}(k);
+ for( array* r = my_root.load(std::memory_order_acquire); r; r = r->next ) {
+ call_itt_notify(acquired,r);
+ std::size_t mask=r->mask();
+ for(std::size_t i = r->start(h); ;i=(i+1)&mask) {
+ slot& s = r->at(i);
+ if( s.empty() ) break;
+ if( s.match(k) ) {
+ if( r == my_root.load(std::memory_order_acquire) ) {
+ // Success at top level
+ exists = true;
+ return s.ptr;
+ } else {
+ // Success at some other level. Need to insert at top level.
+ exists = true;
+ found = s.ptr;
+ goto insert;
+ }
+ }
+ }
+ }
+ // Key does not yet exist. The density of slots in the table never exceeds 0.5:
+ // whenever it would, a new table of double the current size is allocated and
+ // swapped in as the new root table. So an empty slot is guaranteed.
+ exists = false;
+ found = create_local();
+ {
+ std::size_t c = ++my_count;
+ array* r = my_root.load(std::memory_order_acquire);
+ call_itt_notify(acquired,r);
+ if( !r || c > r->size()/2 ) {
+ std::size_t s = r ? r->lg_size : 2;
+ while( c > std::size_t(1)<<(s-1) ) ++s;
+ array* a = allocate(s);
+ for(;;) {
+ a->next = r;
+ call_itt_notify(releasing,a);
+ array* new_r = r;
+ if( my_root.compare_exchange_strong(new_r, a) ) break;
+ call_itt_notify(acquired, new_r);
+ __TBB_ASSERT(new_r != nullptr, nullptr);
+ if( new_r->lg_size >= s ) {
+ // Another thread inserted an equal or bigger array, so our array is superfluous.
+ free(a);
+ break;
+ }
+ r = new_r;
+ }
+ }
+ }
+ insert:
+ // Whether the slot was found in an older table or the element was just created at this level,
+ // it has already been accounted for in the total count. There is guaranteed to be room for it
+ // in the root table and it is not present there, so search for an empty slot and use it.
+ array* ir = my_root.load(std::memory_order_acquire);
+ call_itt_notify(acquired, ir);
+ std::size_t mask = ir->mask();
+ for(std::size_t i = ir->start(h);; i = (i+1)&mask) {
+ slot& s = ir->at(i);
+ if( s.empty() ) {
+ if( s.claim(k) ) {
+ s.ptr = found;
+ return found;
+ }
+ }
+ }
+}
+
+//! Specialization that exploits native TLS
+template <>
+class ets_base<ets_key_per_instance>: public ets_base<ets_no_key> {
+ using super = ets_base<ets_no_key>;
+#if _WIN32||_WIN64
+#if __TBB_WIN8UI_SUPPORT
+ using tls_key_t = DWORD;
+ void create_key() { my_key = FlsAlloc(NULL); }
+ void destroy_key() { FlsFree(my_key); }
+ void set_tls(void * value) { FlsSetValue(my_key, (LPVOID)value); }
+ void* get_tls() { return (void *)FlsGetValue(my_key); }
+#else
+ using tls_key_t = DWORD;
+ void create_key() { my_key = TlsAlloc(); }
+ void destroy_key() { TlsFree(my_key); }
+ void set_tls(void * value) { TlsSetValue(my_key, (LPVOID)value); }
+ void* get_tls() { return (void *)TlsGetValue(my_key); }
+#endif
+#else
+ using tls_key_t = pthread_key_t;
+ void create_key() { pthread_key_create(&my_key, NULL); }
+ void destroy_key() { pthread_key_delete(my_key); }
+ void set_tls( void * value ) const { pthread_setspecific(my_key, value); }
+ void* get_tls() const { return pthread_getspecific(my_key); }
+#endif
+ tls_key_t my_key;
+ virtual void* create_local() override = 0;
+ virtual void* create_array(std::size_t _size) override = 0; // _size in bytes
+ virtual void free_array(void* ptr, std::size_t _size) override = 0; // size in bytes
+protected:
+ ets_base() {create_key();}
+ ~ets_base() {destroy_key();}
+ void* table_lookup( bool& exists ) {
+ void* found = get_tls();
+ if( found ) {
+ exists=true;
+ } else {
+ found = super::table_lookup(exists);
+ set_tls(found);
+ }
+ return found;
+ }
+ void table_clear() {
+ destroy_key();
+ create_key();
+ super::table_clear();
+ }
+ void table_swap( ets_base& other ) {
+ using std::swap;
+ __TBB_ASSERT(this!=&other, "Don't swap an instance with itself");
+ swap(my_key, other.my_key);
+ super::table_swap(other);
+ }
+};
+
+//! Random access iterator for traversing the thread local copies.
+template< typename Container, typename Value >
+class enumerable_thread_specific_iterator
+{
+ //! current position in the concurrent_vector
+
+ Container *my_container;
+ typename Container::size_type my_index;
+ mutable Value *my_value;
+
+ template<typename C, typename T, typename U>
+ friend bool operator==( const enumerable_thread_specific_iterator<C, T>& i,
+ const enumerable_thread_specific_iterator<C, U>& j );
+
+ template<typename C, typename T, typename U>
+ friend bool operator<( const enumerable_thread_specific_iterator<C,T>& i,
+ const enumerable_thread_specific_iterator<C,U>& j );
+
+ template<typename C, typename T, typename U>
+ friend std::ptrdiff_t operator-( const enumerable_thread_specific_iterator<C,T>& i,
+ const enumerable_thread_specific_iterator<C,U>& j );
+
+ template<typename C, typename U>
+ friend class enumerable_thread_specific_iterator;
+
+public:
+ //! STL support
+ using difference_type = std::ptrdiff_t;
+ using value_type = Value;
+ using pointer = Value*;
+ using reference = Value&;
+ using iterator_category = std::random_access_iterator_tag;
+
+ enumerable_thread_specific_iterator( const Container &container, typename Container::size_type index ) :
+ my_container(&const_cast<Container &>(container)), my_index(index), my_value(nullptr) {}
+
+ //! Default constructor
+ enumerable_thread_specific_iterator() : my_container(nullptr), my_index(0), my_value(nullptr) {}
+
+ template<typename U>
+ enumerable_thread_specific_iterator( const enumerable_thread_specific_iterator<Container, U>& other ) :
+ my_container( other.my_container ), my_index( other.my_index), my_value( const_cast<Value *>(other.my_value) ) {}
+
+ enumerable_thread_specific_iterator operator+( std::ptrdiff_t offset ) const {
+ return enumerable_thread_specific_iterator(*my_container, my_index + offset);
+ }
+
+ friend enumerable_thread_specific_iterator operator+( std::ptrdiff_t offset, enumerable_thread_specific_iterator v ) {
+ return enumerable_thread_specific_iterator(*v.my_container, v.my_index + offset);
+ }
+
+ enumerable_thread_specific_iterator &operator+=( std::ptrdiff_t offset ) {
+ my_index += offset;
+ my_value = nullptr;
+ return *this;
+ }
+
+ enumerable_thread_specific_iterator operator-( std::ptrdiff_t offset ) const {
+ return enumerable_thread_specific_iterator( *my_container, my_index-offset );
+ }
+
+ enumerable_thread_specific_iterator &operator-=( std::ptrdiff_t offset ) {
+ my_index -= offset;
+ my_value = nullptr;
+ return *this;
+ }
+
+ Value& operator*() const {
+ Value* value = my_value;
+ if( !value ) {
+ value = my_value = (*my_container)[my_index].value();
+ }
+ __TBB_ASSERT( value==(*my_container)[my_index].value(), "corrupt cache" );
+ return *value;
+ }
+
+ Value& operator[]( std::ptrdiff_t k ) const {
+ return *(*my_container)[my_index + k].value();
+ }
+
+ Value* operator->() const {return &operator*();}
+
+ enumerable_thread_specific_iterator& operator++() {
+ ++my_index;
+ my_value = nullptr;
+ return *this;
+ }
+
+ enumerable_thread_specific_iterator& operator--() {
+ --my_index;
+ my_value = nullptr;
+ return *this;
+ }
+
+ //! Post increment
+ enumerable_thread_specific_iterator operator++(int) {
+ enumerable_thread_specific_iterator result = *this;
+ ++my_index;
+ my_value = nullptr;
+ return result;
+ }
+
+ //! Post decrement
+ enumerable_thread_specific_iterator operator--(int) {
+ enumerable_thread_specific_iterator result = *this;
+ --my_index;
+ my_value = nullptr;
+ return result;
+ }
+};
+
+template<typename Container, typename T, typename U>
+bool operator==( const enumerable_thread_specific_iterator<Container, T>& i,
+ const enumerable_thread_specific_iterator<Container, U>& j ) {
+ return i.my_index == j.my_index && i.my_container == j.my_container;
+}
+
+template<typename Container, typename T, typename U>
+bool operator!=( const enumerable_thread_specific_iterator<Container,T>& i,
+ const enumerable_thread_specific_iterator<Container,U>& j ) {
+ return !(i==j);
+}
+
+template<typename Container, typename T, typename U>
+bool operator<( const enumerable_thread_specific_iterator<Container,T>& i,
+ const enumerable_thread_specific_iterator<Container,U>& j ) {
+ return i.my_index<j.my_index;
+}
+
+template<typename Container, typename T, typename U>
+bool operator>( const enumerable_thread_specific_iterator<Container,T>& i,
+ const enumerable_thread_specific_iterator<Container,U>& j ) {
+ return j<i;
+}
+
+template<typename Container, typename T, typename U>
+bool operator>=( const enumerable_thread_specific_iterator<Container,T>& i,
+ const enumerable_thread_specific_iterator<Container,U>& j ) {
+ return !(i<j);
+}
+
+template<typename Container, typename T, typename U>
+bool operator<=( const enumerable_thread_specific_iterator<Container,T>& i,
+ const enumerable_thread_specific_iterator<Container,U>& j ) {
+ return !(j<i);
+}
+
+template<typename Container, typename T, typename U>
+std::ptrdiff_t operator-( const enumerable_thread_specific_iterator<Container,T>& i,
+ const enumerable_thread_specific_iterator<Container,U>& j ) {
+ return i.my_index-j.my_index;
+}
+
+template<typename SegmentedContainer, typename Value >
+class segmented_iterator
+{
+ template<typename C, typename T, typename U>
+ friend bool operator==(const segmented_iterator<C,T>& i, const segmented_iterator<C,U>& j);
+
+ template<typename C, typename T, typename U>
+ friend bool operator!=(const segmented_iterator<C,T>& i, const segmented_iterator<C,U>& j);
+
+ template<typename C, typename U>
+ friend class segmented_iterator;
+
+public:
+ segmented_iterator() {my_segcont = nullptr;}
+
+ segmented_iterator( const SegmentedContainer& _segmented_container ) :
+ my_segcont(const_cast<SegmentedContainer*>(&_segmented_container)),
+ outer_iter(my_segcont->end()) { }
+
+ ~segmented_iterator() {}
+
+ using InnerContainer = typename SegmentedContainer::value_type;
+ using inner_iterator = typename InnerContainer::iterator;
+ using outer_iterator = typename SegmentedContainer::iterator;
+
+ // STL support
+ // TODO: inherit all types from segmented container?
+ using difference_type = std::ptrdiff_t;
+ using value_type = Value;
+ using size_type = typename SegmentedContainer::size_type;
+ using pointer = Value*;
+ using reference = Value&;
+ using iterator_category = std::input_iterator_tag;
+
+ // Copy Constructor
+ template<typename U>
+ segmented_iterator(const segmented_iterator<SegmentedContainer, U>& other) :
+ my_segcont(other.my_segcont),
+ outer_iter(other.outer_iter),
+ // can we assign a default-constructed iterator to inner if we're at the end?
+ inner_iter(other.inner_iter)
+ {}
+
+ // assignment
+ template<typename U>
+ segmented_iterator& operator=( const segmented_iterator<SegmentedContainer, U>& other) {
+ my_segcont = other.my_segcont;
+ outer_iter = other.outer_iter;
+ if(outer_iter != my_segcont->end()) inner_iter = other.inner_iter;
+ return *this;
+ }
+
+ // allow assignment of outer iterator to segmented iterator. Once it is
+ // assigned, move forward until a non-empty inner container is found or
+ // the end of the outer container is reached.
+ segmented_iterator& operator=(const outer_iterator& new_outer_iter) {
+ __TBB_ASSERT(my_segcont != nullptr, NULL);
+ // check that this iterator points to something inside the segmented container
+ for(outer_iter = new_outer_iter ;outer_iter!=my_segcont->end(); ++outer_iter) {
+ if( !outer_iter->empty() ) {
+ inner_iter = outer_iter->begin();
+ break;
+ }
+ }
+ return *this;
+ }
+
+ // pre-increment
+ segmented_iterator& operator++() {
+ advance_me();
+ return *this;
+ }
+
+ // post-increment
+ segmented_iterator operator++(int) {
+ segmented_iterator tmp = *this;
+ operator++();
+ return tmp;
+ }
+
+ bool operator==(const outer_iterator& other_outer) const {
+ __TBB_ASSERT(my_segcont != nullptr, NULL);
+ return (outer_iter == other_outer &&
+ (outer_iter == my_segcont->end() || inner_iter == outer_iter->begin()));
+ }
+
+ bool operator!=(const outer_iterator& other_outer) const {
+ return !operator==(other_outer);
+
+ }
+
+ // (i)* RHS
+ reference operator*() const {
+ __TBB_ASSERT(my_segcont != nullptr, NULL);
+ __TBB_ASSERT(outer_iter != my_segcont->end(), "Dereferencing a pointer at end of container");
+ __TBB_ASSERT(inner_iter != outer_iter->end(), NULL); // should never happen
+ return *inner_iter;
+ }
+
+ // i->
+ pointer operator->() const { return &operator*();}
+
+private:
+ SegmentedContainer* my_segcont;
+ outer_iterator outer_iter;
+ inner_iterator inner_iter;
+
+ void advance_me() {
+ __TBB_ASSERT(my_segcont != nullptr, NULL);
+ __TBB_ASSERT(outer_iter != my_segcont->end(), NULL); // not true if there are no inner containers
+ __TBB_ASSERT(inner_iter != outer_iter->end(), NULL); // not true if the inner containers are all empty.
+ ++inner_iter;
+ while(inner_iter == outer_iter->end() && ++outer_iter != my_segcont->end()) {
+ inner_iter = outer_iter->begin();
+ }
+ }
+}; // segmented_iterator
+
+template<typename SegmentedContainer, typename T, typename U>
+bool operator==( const segmented_iterator<SegmentedContainer,T>& i,
+ const segmented_iterator<SegmentedContainer,U>& j ) {
+ if(i.my_segcont != j.my_segcont) return false;
+ if(i.my_segcont == nullptr) return true;
+ if(i.outer_iter != j.outer_iter) return false;
+ if(i.outer_iter == i.my_segcont->end()) return true;
+ return i.inner_iter == j.inner_iter;
+}
+
+// !=
+template<typename SegmentedContainer, typename T, typename U>
+bool operator!=( const segmented_iterator<SegmentedContainer,T>& i,
+ const segmented_iterator<SegmentedContainer,U>& j ) {
+ return !(i==j);
+}
+
+template<typename T>
+struct construct_by_default: no_assign {
+ void construct(void*where) {new(where) T();} // C++ note: the () in T() ensures value initialization (zero initialization for trivial types).
+ construct_by_default( int ) {}
+};
+
+template<typename T>
+struct construct_by_exemplar: no_assign {
+ const T exemplar;
+ void construct(void*where) {new(where) T(exemplar);}
+ construct_by_exemplar( const T& t ) : exemplar(t) {}
+ construct_by_exemplar( T&& t ) : exemplar(std::move(t)) {}
+};
+
+template<typename T, typename Finit>
+struct construct_by_finit: no_assign {
+ Finit f;
+ void construct(void* where) {new(where) T(f());}
+ construct_by_finit( Finit&& f_ ) : f(std::move(f_)) {}
+};
+
+template<typename T, typename... P>
+struct construct_by_args: no_assign {
+ stored_pack<P...> pack;
+ void construct(void* where) {
+ call( [where](const typename std::decay<P>::type&... args ){
+ new(where) T(args...);
+ }, pack );
+ }
+ construct_by_args( P&& ... args ) : pack(std::forward<P>(args)...) {}
+};
+
+// Polymorphic callback that records how to construct elements in the thread-local table
+// TODO: consider removing the template parameter T here and in callback_leaf
+class callback_base {
+public:
+ // Clone *this
+ virtual callback_base* clone() const = 0;
+ // Destruct and free *this
+ virtual void destroy() = 0;
+ // Need virtual destructor to satisfy GCC compiler warning
+ virtual ~callback_base() { }
+ // Construct T at where
+ virtual void construct(void* where) = 0;
+};
+
+template <typename Constructor>
+class callback_leaf: public callback_base, Constructor {
+ template<typename... P> callback_leaf( P&& ... params ) : Constructor(std::forward<P>(params)...) {}
+ // TODO: make the construction/destruction consistent (use allocator.construct/destroy)
+ using my_allocator_type = typename tbb::tbb_allocator<callback_leaf>;
+
+ callback_base* clone() const override {
+ return make(*this);
+ }
+
+ void destroy() override {
+ my_allocator_type alloc;
+ tbb::detail::allocator_traits<my_allocator_type>::destroy(alloc, this);
+ tbb::detail::allocator_traits<my_allocator_type>::deallocate(alloc, this, 1);
+ }
+
+ void construct(void* where) override {
+ Constructor::construct(where);
+ }
+
+public:
+ template<typename... P>
+ static callback_base* make( P&& ... params ) {
+ void* where = my_allocator_type().allocate(1);
+ return new(where) callback_leaf( std::forward<P>(params)... );
+ }
+};
+
+//! Template for recording construction of objects in table
+/** All maintenance of the space will be done explicitly on push_back,
+ and all thread local copies must be destroyed before the concurrent
+ vector is deleted.
+
+ The flag is_built is initialized to false. When the local is
+ successfully constructed, set the flag to true or call value_committed().
+ If the constructor throws, the flag will be false.
+*/
+template<typename U>
+struct ets_element {
+ detail::aligned_space<U> my_space;
+ bool is_built;
+ ets_element() { is_built = false; } // not currently-built
+ U* value() { return my_space.begin(); }
+ U* value_committed() { is_built = true; return my_space.begin(); }
+ ~ets_element() {
+ if(is_built) {
+ my_space.begin()->~U();
+ is_built = false;
+ }
+ }
+};
+
+// A predicate that can be used for a compile-time compatibility check of ETS instances
+// Ideally, it should have been declared inside the ETS class, but unfortunately
+// in that case VS2013 does not enable the variadic constructor.
+template<typename T, typename ETS> struct is_compatible_ets : std::false_type {};
+template<typename T, typename U, typename A, ets_key_usage_type C>
+struct is_compatible_ets< T, enumerable_thread_specific<U,A,C> > : std::is_same<T, U> {};
+
+// A predicate that checks whether, for a variable 'foo' of type T, foo() is a valid expression
+template <typename T> using has_empty_braces_operator = decltype(std::declval<T>()());
+template <typename T> using is_callable_no_args = supports<T, has_empty_braces_operator>;
+
+//! The enumerable_thread_specific container
+/** enumerable_thread_specific has the following properties:
+ - thread-local copies are lazily created, with default, exemplar or function initialization.
+ - thread-local copies do not move during their lifetime (except across clear()), so the address of a copy is invariant.
+ - the contained objects need not have operator=() defined if combine is not used.
+ - enumerable_thread_specific containers may be copy-constructed or assigned.
+ - thread-local copies can be managed by hash-table, or can be accessed via TLS storage for speed.
+ - outside of parallel contexts, the contents of all thread-local copies are accessible by iterator or using combine or combine_each methods
+
+@par Segmented iterator
+ When the thread-local objects are containers with input_iterators defined, a segmented iterator may
+ be used to iterate over all the elements of all thread-local copies.
+
+@par combine and combine_each
+ - Both methods are defined for enumerable_thread_specific.
+ - combine() requires the type T have operator=() defined.
+ - neither method modifies the container itself (though the user-supplied callable may modify the individual copies.)
+ - Both are evaluated in a serial context (the supplied callables are not assumed to be safe to apply concurrently.)
+
+@ingroup containers */
+template <typename T, typename Allocator=cache_aligned_allocator<T>,
+ ets_key_usage_type ETS_key_type=ets_no_key >
+class enumerable_thread_specific: ets_base<ETS_key_type> {
+
+ template<typename U, typename A, ets_key_usage_type C> friend class enumerable_thread_specific;
+
+ using padded_element = padded<ets_element<T>>;
+
+ //! A generic range, used to create range objects from the iterators
+ template<typename I>
+ class generic_range_type: public blocked_range<I> {
+ public:
+ using value_type = T;
+ using reference = T&;
+ using const_reference = const T&;
+ using iterator = I;
+ using difference_type = std::ptrdiff_t;
+
+ generic_range_type( I begin_, I end_, std::size_t grainsize_ = 1) : blocked_range<I>(begin_,end_,grainsize_) {}
+ template<typename U>
+ generic_range_type( const generic_range_type<U>& r) : blocked_range<I>(r.begin(),r.end(),r.grainsize()) {}
+ generic_range_type( generic_range_type& r, split ) : blocked_range<I>(r,split()) {}
+ };
+
+ using allocator_traits_type = tbb::detail::allocator_traits<Allocator>;
+
+ using padded_allocator_type = typename allocator_traits_type::template rebind_alloc<padded_element>;
+ using internal_collection_type = tbb::concurrent_vector< padded_element, padded_allocator_type >;
+
+ callback_base *my_construct_callback;
+
+ internal_collection_type my_locals;
+
+ // TODO: consider unifying the callback mechanism for all create_local* methods below
+ // (likely non-compatible and requires interface version increase)
+ void* create_local() override {
+ padded_element& lref = *my_locals.grow_by(1);
+ my_construct_callback->construct(lref.value());
+ return lref.value_committed();
+ }
+
+ static void* create_local_by_copy( ets_base<ETS_key_type>& base, void* p ) {
+ enumerable_thread_specific& ets = static_cast<enumerable_thread_specific&>(base);
+ padded_element& lref = *ets.my_locals.grow_by(1);
+ new(lref.value()) T(*static_cast<T*>(p));
+ return lref.value_committed();
+ }
+
+ static void* create_local_by_move( ets_base<ETS_key_type>& base, void* p ) {
+ enumerable_thread_specific& ets = static_cast<enumerable_thread_specific&>(base);
+ padded_element& lref = *ets.my_locals.grow_by(1);
+ new(lref.value()) T(std::move(*static_cast<T*>(p)));
+ return lref.value_committed();
+ }
+
+ using array_allocator_type = typename allocator_traits_type::template rebind_alloc<uintptr_t>;
+
+ // _size is in bytes
+ void* create_array(std::size_t _size) override {
+ std::size_t nelements = (_size + sizeof(uintptr_t) -1) / sizeof(uintptr_t);
+ return array_allocator_type().allocate(nelements);
+ }
+
+ void free_array( void* _ptr, std::size_t _size) override {
+ std::size_t nelements = (_size + sizeof(uintptr_t) -1) / sizeof(uintptr_t);
+ array_allocator_type().deallocate( reinterpret_cast<uintptr_t *>(_ptr),nelements);
+ }
+
+public:
+
+ //! Basic types
+ using value_type = T;
+ using allocator_type = Allocator;
+ using size_type = typename internal_collection_type::size_type;
+ using difference_type = typename internal_collection_type::difference_type;
+ using reference = value_type&;
+ using const_reference = const value_type&;
+
+ using pointer = typename allocator_traits_type::pointer;
+ using const_pointer = typename allocator_traits_type::const_pointer;
+
+ // Iterator types
+ using iterator = enumerable_thread_specific_iterator<internal_collection_type, value_type>;
+ using const_iterator = enumerable_thread_specific_iterator<internal_collection_type, const value_type>;
+
+ // Parallel range types
+ using range_type = generic_range_type<iterator>;
+ using const_range_type = generic_range_type<const_iterator>;
+
+ //! Default constructor. Each local instance of T is default constructed.
+ enumerable_thread_specific() : my_construct_callback(
+ callback_leaf<construct_by_default<T> >::make(/*dummy argument*/0)
+ ){}
+
+ //! Constructor with initializer functor. Each local instance of T is constructed by T(finit()).
+ template <typename Finit , typename = typename std::enable_if<is_callable_no_args<typename std::decay<Finit>::type>::value>::type>
+ explicit enumerable_thread_specific( Finit finit ) : my_construct_callback(
+ callback_leaf<construct_by_finit<T,Finit> >::make( std::move(finit) )
+ ){}
+
+ //! Constructor with exemplar. Each local instance of T is copy-constructed from the exemplar.
+ explicit enumerable_thread_specific( const T& exemplar ) : my_construct_callback(
+ callback_leaf<construct_by_exemplar<T> >::make( exemplar )
+ ){}
+
+ explicit enumerable_thread_specific( T&& exemplar ) : my_construct_callback(
+ callback_leaf<construct_by_exemplar<T> >::make( std::move(exemplar) )
+ ){}
+
+ //! Variadic constructor with initializer arguments. Each local instance of T is constructed by T(args...)
+ template <typename P1, typename... P,
+ typename = typename std::enable_if<!is_callable_no_args<typename std::decay<P1>::type>::value
+ && !is_compatible_ets<T, typename std::decay<P1>::type>::value
+ && !std::is_same<T, typename std::decay<P1>::type>::value
+ >::type>
+ enumerable_thread_specific( P1&& arg1, P&& ... args ) : my_construct_callback(
+ callback_leaf<construct_by_args<T,P1,P...> >::make( std::forward<P1>(arg1), std::forward<P>(args)... )
+ ){}
+
+ //! Destructor
+ ~enumerable_thread_specific() {
+ if(my_construct_callback) my_construct_callback->destroy();
+ // Deallocate the hash table before overridden free_array() becomes inaccessible
+ this->ets_base<ETS_key_type>::table_clear();
+ }
+
+ //! returns reference to local, discarding exists
+ reference local() {
+ bool exists;
+ return local(exists);
+ }
+
+ //! Returns reference to calling thread's local copy, creating one if necessary
+ reference local(bool& exists) {
+ void* ptr = this->table_lookup(exists);
+ return *(T*)ptr;
+ }
+
+ //! Get the number of local copies
+ size_type size() const { return my_locals.size(); }
+
+ //! true if there have been no local copies created
+ bool empty() const { return my_locals.empty(); }
+
+ //! begin iterator
+ iterator begin() { return iterator( my_locals, 0 ); }
+ //! end iterator
+ iterator end() { return iterator(my_locals, my_locals.size() ); }
+
+ //! begin const iterator
+ const_iterator begin() const { return const_iterator(my_locals, 0); }
+
+ //! end const iterator
+ const_iterator end() const { return const_iterator(my_locals, my_locals.size()); }
+
+ //! Get range for parallel algorithms
+ range_type range( std::size_t grainsize=1 ) { return range_type( begin(), end(), grainsize ); }
+
+ //! Get const range for parallel algorithms
+ const_range_type range( std::size_t grainsize=1 ) const { return const_range_type( begin(), end(), grainsize ); }
+
+ //! Destroys local copies
+ void clear() {
+ my_locals.clear();
+ this->table_clear();
+ // callback is not destroyed
+ }
+
+private:
+ template<typename A2, ets_key_usage_type C2>
+ void internal_copy(const enumerable_thread_specific<T, A2, C2>& other) {
+ // this tests is_compatible_ets
+ static_assert( (is_compatible_ets<T, typename std::decay<decltype(other)>::type>::value), "is_compatible_ets fails" );
+ // Initialize my_construct_callback first, so that it is valid even if rest of this routine throws an exception.
+ my_construct_callback = other.my_construct_callback->clone();
+ __TBB_ASSERT(my_locals.size()==0,NULL);
+ my_locals.reserve(other.size());
+ this->table_elementwise_copy( other, create_local_by_copy );
+ }
+
+ void internal_swap(enumerable_thread_specific& other) {
+ using std::swap;
+ __TBB_ASSERT( this!=&other, NULL );
+ swap(my_construct_callback, other.my_construct_callback);
+ // concurrent_vector::swap() preserves storage space,
+ // so addresses to the vector kept in ETS hash table remain valid.
+ swap(my_locals, other.my_locals);
+ this->ets_base<ETS_key_type>::table_swap(other);
+ }
+
+ template<typename A2, ets_key_usage_type C2>
+ void internal_move(enumerable_thread_specific<T, A2, C2>&& other) {
+ static_assert( (is_compatible_ets<T, typename std::decay<decltype(other)>::type>::value), "is_compatible_ets fails" );
+ my_construct_callback = other.my_construct_callback;
+ other.my_construct_callback = nullptr;
+ __TBB_ASSERT(my_locals.size()==0,NULL);
+ my_locals.reserve(other.size());
+ this->table_elementwise_copy( other, create_local_by_move );
+ }
+
+public:
+ enumerable_thread_specific( const enumerable_thread_specific& other )
+ : ets_base<ETS_key_type>() /* prevents GCC warnings with -Wextra */
+ {
+ internal_copy(other);
+ }
+
+ template<typename Alloc, ets_key_usage_type Cachetype>
+ enumerable_thread_specific( const enumerable_thread_specific<T, Alloc, Cachetype>& other )
+ {
+ internal_copy(other);
+ }
+
+ enumerable_thread_specific( enumerable_thread_specific&& other ) : my_construct_callback()
+ {
+ // TODO: use internal_move correctly here
+ internal_swap(other);
+ }
+
+ template<typename Alloc, ets_key_usage_type Cachetype>
+ enumerable_thread_specific( enumerable_thread_specific<T, Alloc, Cachetype>&& other ) : my_construct_callback()
+ {
+ internal_move(std::move(other));
+ }
+
+ enumerable_thread_specific& operator=( const enumerable_thread_specific& other )
+ {
+ if( this != &other ) {
+ this->clear();
+ my_construct_callback->destroy();
+ internal_copy( other );
+ }
+ return *this;
+ }
+
+ template<typename Alloc, ets_key_usage_type Cachetype>
+ enumerable_thread_specific& operator=( const enumerable_thread_specific<T, Alloc, Cachetype>& other )
+ {
+ __TBB_ASSERT( static_cast<void*>(this)!=static_cast<const void*>(&other), NULL ); // Objects of different types
+ this->clear();
+ my_construct_callback->destroy();
+ internal_copy(other);
+ return *this;
+ }
+
+ enumerable_thread_specific& operator=( enumerable_thread_specific&& other )
+ {
+ if( this != &other ) {
+ // TODO: use internal_move correctly here
+ internal_swap(other);
+ }
+ return *this;
+ }
+
+ template<typename Alloc, ets_key_usage_type Cachetype>
+ enumerable_thread_specific& operator=( enumerable_thread_specific<T, Alloc, Cachetype>&& other )
+ {
+ __TBB_ASSERT( static_cast<void*>(this)!=static_cast<const void*>(&other), NULL ); // Objects of different types
+ this->clear();
+ my_construct_callback->destroy();
+ internal_move(std::move(other));
+ return *this;
+ }
+
+ // CombineFunc has signature T(T,T) or T(const T&, const T&)
+ template <typename CombineFunc>
+ T combine(CombineFunc f_combine) {
+ if(begin() == end()) {
+ ets_element<T> location;
+ my_construct_callback->construct(location.value());
+ return *location.value_committed();
+ }
+ const_iterator ci = begin();
+ T my_result = *ci;
+ while(++ci != end())
+ my_result = f_combine( my_result, *ci );
+ return my_result;
+ }
+
+ // CombineFunc takes T by value or by [const] reference, and returns nothing
+ template <typename CombineFunc>
+ void combine_each(CombineFunc f_combine) {
+ for(iterator ci = begin(); ci != end(); ++ci) {
+ f_combine( *ci );
+ }
+ }
+
+}; // enumerable_thread_specific
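+
+// Usage sketch (assumes oneapi/tbb/parallel_for.h and <functional> are also included; names are
+// illustrative): each thread lazily obtains its own copy via local(), and the copies are reduced
+// serially afterwards.
+//
+//     tbb::enumerable_thread_specific<int> counters(0);                  // exemplar-initialized copies
+//     tbb::parallel_for(0, 100000, [&]( int ) { ++counters.local(); });
+//     int total = counters.combine( std::plus<int>{} );                  // sum over all thread-local copies
+//
+// Passing ets_key_per_instance as the third template argument selects the native-TLS-backed
+// lookup declared above, trading one OS TLS key per container for faster local() access.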
+
+template< typename Container >
+class flattened2d {
+ // This intermediate typedef is to address issues with VC7.1 compilers
+ using conval_type = typename Container::value_type;
+
+public:
+ //! Basic types
+ using size_type = typename conval_type::size_type;
+ using difference_type = typename conval_type::difference_type;
+ using allocator_type = typename conval_type::allocator_type;
+ using value_type = typename conval_type::value_type;
+ using reference = typename conval_type::reference;
+ using const_reference = typename conval_type::const_reference;
+ using pointer = typename conval_type::pointer;
+ using const_pointer = typename conval_type::const_pointer;
+
+ using iterator = segmented_iterator<Container, value_type>;
+ using const_iterator = segmented_iterator<Container, const value_type>;
+
+ flattened2d( const Container &c, typename Container::const_iterator b, typename Container::const_iterator e ) :
+ my_container(const_cast<Container*>(&c)), my_begin(b), my_end(e) { }
+
+ explicit flattened2d( const Container &c ) :
+ my_container(const_cast<Container*>(&c)), my_begin(c.begin()), my_end(c.end()) { }
+
+ iterator begin() { return iterator(*my_container) = my_begin; }
+ iterator end() { return iterator(*my_container) = my_end; }
+ const_iterator begin() const { return const_iterator(*my_container) = my_begin; }
+ const_iterator end() const { return const_iterator(*my_container) = my_end; }
+
+ size_type size() const {
+ size_type tot_size = 0;
+ for(typename Container::const_iterator i = my_begin; i != my_end; ++i) {
+ tot_size += i->size();
+ }
+ return tot_size;
+ }
+
+private:
+ Container *my_container;
+ typename Container::const_iterator my_begin;
+ typename Container::const_iterator my_end;
+};
+
+template <typename Container>
+flattened2d<Container> flatten2d(const Container &c, const typename Container::const_iterator b, const typename Container::const_iterator e) {
+ return flattened2d<Container>(c, b, e);
+}
+
+template <typename Container>
+flattened2d<Container> flatten2d(const Container &c) {
+ return flattened2d<Container>(c);
+}
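+
+// Usage sketch (illustrative names): flattening an ETS whose copies are themselves containers
+// lets a single pass visit every element produced by every thread.
+//
+//     tbb::enumerable_thread_specific<std::vector<int>> bins;
+//     // ... each worker appends to bins.local() ...
+//     auto flat = tbb::flatten2d(bins);
+//     long sum = 0;
+//     for (int v : flat) sum += v;  // visits all elements of all thread-local vectors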
+
+} // namespace d1
+} // namespace detail
+
+inline namespace v1 {
+using detail::d1::enumerable_thread_specific;
+using detail::d1::flattened2d;
+using detail::d1::flatten2d;
+// ets enum keys
+using detail::d1::ets_key_usage_type;
+using detail::d1::ets_key_per_instance;
+using detail::d1::ets_no_key;
+#if __TBB_RESUMABLE_TASKS
+using detail::d1::ets_suspend_aware;
+#endif
+} // inline namespace v1
+
+} // namespace tbb
+
+#endif // __TBB_enumerable_thread_specific_H
+
diff --git a/contrib/libs/tbb/include/oneapi/tbb/flow_graph.h b/contrib/libs/tbb/include/oneapi/tbb/flow_graph.h
new file mode 100644
index 0000000000..cc2cc7b605
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/flow_graph.h
@@ -0,0 +1,3221 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_flow_graph_H
+#define __TBB_flow_graph_H
+
+#include <atomic>
+#include <memory>
+#include <type_traits>
+
+#include "detail/_config.h"
+#include "detail/_namespace_injection.h"
+#include "spin_mutex.h"
+#include "null_mutex.h"
+#include "spin_rw_mutex.h"
+#include "null_rw_mutex.h"
+#include "detail/_pipeline_filters.h"
+#include "detail/_task.h"
+#include "detail/_small_object_pool.h"
+#include "cache_aligned_allocator.h"
+#include "detail/_exception.h"
+#include "detail/_template_helpers.h"
+#include "detail/_aggregator.h"
+#include "detail/_allocator_traits.h"
+#include "profiling.h"
+#include "task_arena.h"
+
+#if TBB_USE_PROFILING_TOOLS && ( __linux__ || __APPLE__ )
+ #if __INTEL_COMPILER
+ // Disabled warning "routine is both inline and noinline"
+ #pragma warning (push)
+ #pragma warning( disable: 2196 )
+ #endif
+ #define __TBB_NOINLINE_SYM __attribute__((noinline))
+#else
+ #define __TBB_NOINLINE_SYM
+#endif
+
+#include <tuple>
+#include <list>
+#include <queue>
+
+/** @file
+ \brief The graph related classes and functions
+
+ There are some applications that best express dependencies as messages
+ passed between nodes in a graph. These messages may contain data or
+ simply act as signals that a predecessor has completed. The graph
+ class and its associated node classes can be used to express such
+ applications.
+*/
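+
+// Minimal usage sketch (illustrative values): two function nodes connected by an edge; messages
+// put into the first node flow into the second, and wait_for_all() drains the graph.
+//
+//     tbb::flow::graph g;
+//     tbb::flow::function_node<int, int> doubler( g, tbb::flow::unlimited,
+//                                                 []( int v ) { return 2 * v; } );
+//     tbb::flow::function_node<int, int> sink( g, 1, []( int v ) { /* consume v */ return v; } );
+//     tbb::flow::make_edge( doubler, sink );
+//     doubler.try_put( 21 );
+//     g.wait_for_all();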
+
+namespace tbb {
+namespace detail {
+
+namespace d1 {
+
+//! An enumeration that provides the two most common concurrency levels: unlimited and serial
+enum concurrency { unlimited = 0, serial = 1 };
+
+//! A generic null type
+struct null_type {};
+
+//! An empty class used for messages that mean "I'm done"
+class continue_msg {};
+
+//! Forward declaration section
+template< typename T > class sender;
+template< typename T > class receiver;
+class continue_receiver;
+
+template< typename T, typename U > class limiter_node; // needed for resetting decrementer
+
+template<typename T, typename M> class successor_cache;
+template<typename T, typename M> class broadcast_cache;
+template<typename T, typename M> class round_robin_cache;
+template<typename T, typename M> class predecessor_cache;
+template<typename T, typename M> class reservable_predecessor_cache;
+
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+namespace order {
+struct following;
+struct preceding;
+}
+template<typename Order, typename... Args> struct node_set;
+#endif
+
+
+} // namespace d1
+} // namespace detail
+} // namespace tbb
+
+//! The graph class
+#include "detail/_flow_graph_impl.h"
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+static inline std::pair<graph_task*, graph_task*> order_tasks(graph_task* first, graph_task* second) {
+ if (second->priority > first->priority)
+ return std::make_pair(second, first);
+ return std::make_pair(first, second);
+}
+
+// submit task if necessary. Returns the non-enqueued task if there is one.
+static inline graph_task* combine_tasks(graph& g, graph_task* left, graph_task* right) {
+ // if no RHS task, don't change left.
+ if (right == NULL) return left;
+ // right != NULL
+ if (left == NULL) return right;
+ if (left == SUCCESSFULLY_ENQUEUED) return right;
+ // left contains a task
+ if (right != SUCCESSFULLY_ENQUEUED) {
+ // both are valid tasks
+ auto tasks_pair = order_tasks(left, right);
+ spawn_in_graph_arena(g, *tasks_pair.first);
+ return tasks_pair.second;
+ }
+ return left;
+}
+
+//! Pure virtual template class that defines a sender of messages of type T
+template< typename T >
+class sender {
+public:
+ virtual ~sender() {}
+
+ //! Request an item from the sender
+ virtual bool try_get( T & ) { return false; }
+
+ //! Reserves an item in the sender
+ virtual bool try_reserve( T & ) { return false; }
+
+ //! Releases the reserved item
+ virtual bool try_release( ) { return false; }
+
+ //! Consumes the reserved item
+ virtual bool try_consume( ) { return false; }
+
+protected:
+ //! The output type of this sender
+ typedef T output_type;
+
+ //! The successor type for this node
+ typedef receiver<T> successor_type;
+
+ //! Add a new successor to this node
+ virtual bool register_successor( successor_type &r ) = 0;
+
+ //! Removes a successor from this node
+ virtual bool remove_successor( successor_type &r ) = 0;
+
+ template<typename C>
+ friend bool register_successor(sender<C>& s, receiver<C>& r);
+
+ template<typename C>
+ friend bool remove_successor (sender<C>& s, receiver<C>& r);
+}; // class sender<T>
+
+template<typename C>
+bool register_successor(sender<C>& s, receiver<C>& r) {
+ return s.register_successor(r);
+}
+
+template<typename C>
+bool remove_successor(sender<C>& s, receiver<C>& r) {
+ return s.remove_successor(r);
+}
+
+//! Pure virtual template class that defines a receiver of messages of type T
+template< typename T >
+class receiver {
+public:
+ //! Destructor
+ virtual ~receiver() {}
+
+ //! Put an item to the receiver
+ bool try_put( const T& t ) {
+ graph_task *res = try_put_task(t);
+ if (!res) return false;
+ if (res != SUCCESSFULLY_ENQUEUED) spawn_in_graph_arena(graph_reference(), *res);
+ return true;
+ }
+
+ //! put item to successor; return task to run the successor if possible.
+protected:
+ //! The input type of this receiver
+ typedef T input_type;
+
+ //! The predecessor type for this node
+ typedef sender<T> predecessor_type;
+
+ template< typename R, typename B > friend class run_and_put_task;
+ template< typename X, typename Y > friend class broadcast_cache;
+ template< typename X, typename Y > friend class round_robin_cache;
+ virtual graph_task *try_put_task(const T& t) = 0;
+ virtual graph& graph_reference() const = 0;
+
+ template<typename TT, typename M> friend class successor_cache;
+ virtual bool is_continue_receiver() { return false; }
+
+ // TODO revamp: reconsider the inheritance and move node priority out of receiver
+ virtual node_priority_t priority() const { return no_priority; }
+
+ //! Add a predecessor to the node
+ virtual bool register_predecessor( predecessor_type & ) { return false; }
+
+ //! Remove a predecessor from the node
+ virtual bool remove_predecessor( predecessor_type & ) { return false; }
+
+ template <typename C>
+ friend bool register_predecessor(receiver<C>& r, sender<C>& s);
+ template <typename C>
+ friend bool remove_predecessor (receiver<C>& r, sender<C>& s);
+}; // class receiver<T>
+
+template <typename C>
+bool register_predecessor(receiver<C>& r, sender<C>& s) {
+ return r.register_predecessor(s);
+}
+
+template <typename C>
+bool remove_predecessor(receiver<C>& r, sender<C>& s) {
+ return r.remove_predecessor(s);
+}
+
+//! Base class for receivers of completion messages
+/** These receivers automatically reset, but cannot be explicitly waited on */
+class continue_receiver : public receiver< continue_msg > {
+protected:
+
+ //! Constructor
+ explicit continue_receiver( int number_of_predecessors, node_priority_t a_priority ) {
+ my_predecessor_count = my_initial_predecessor_count = number_of_predecessors;
+ my_current_count = 0;
+ my_priority = a_priority;
+ }
+
+ //! Copy constructor
+ continue_receiver( const continue_receiver& src ) : receiver<continue_msg>() {
+ my_predecessor_count = my_initial_predecessor_count = src.my_initial_predecessor_count;
+ my_current_count = 0;
+ my_priority = src.my_priority;
+ }
+
+ //! Increments the trigger threshold
+ bool register_predecessor( predecessor_type & ) override {
+ spin_mutex::scoped_lock l(my_mutex);
+ ++my_predecessor_count;
+ return true;
+ }
+
+ //! Decrements the trigger threshold
+ /** Does not check to see if the removal of the predecessor now makes the current count
+ exceed the new threshold. So removing a predecessor while the graph is active can cause
+ unexpected results. */
+ bool remove_predecessor( predecessor_type & ) override {
+ spin_mutex::scoped_lock l(my_mutex);
+ --my_predecessor_count;
+ return true;
+ }
+
+ //! The input type
+ typedef continue_msg input_type;
+
+ //! The predecessor type for this node
+ typedef receiver<input_type>::predecessor_type predecessor_type;
+
+ template< typename R, typename B > friend class run_and_put_task;
+ template<typename X, typename Y> friend class broadcast_cache;
+ template<typename X, typename Y> friend class round_robin_cache;
+ // The execute() body is assumed to be too small to be worth creating a task for.
+ graph_task* try_put_task( const input_type & ) override {
+ {
+ spin_mutex::scoped_lock l(my_mutex);
+ if ( ++my_current_count < my_predecessor_count )
+ return SUCCESSFULLY_ENQUEUED;
+ else
+ my_current_count = 0;
+ }
+ graph_task* res = execute();
+ return res? res : SUCCESSFULLY_ENQUEUED;
+ }
+
+ spin_mutex my_mutex;
+ int my_predecessor_count;
+ int my_current_count;
+ int my_initial_predecessor_count;
+ node_priority_t my_priority;
+ // the friend declaration in the base class did not eliminate the "protected class"
+ // error in gcc 4.1.2
+ template<typename U, typename V> friend class limiter_node;
+
+ virtual void reset_receiver( reset_flags f ) {
+ my_current_count = 0;
+ if (f & rf_clear_edges) {
+ my_predecessor_count = my_initial_predecessor_count;
+ }
+ }
+
+ //! Does whatever should happen when the threshold is reached
+ /** This should be very fast or else spawn a task. This is
+ called while the sender is blocked in the try_put(). */
+ virtual graph_task* execute() = 0;
+ template<typename TT, typename M> friend class successor_cache;
+ bool is_continue_receiver() override { return true; }
+
+ node_priority_t priority() const override { return my_priority; }
+}; // class continue_receiver
+
+#if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING
+ template <typename K, typename T>
+ K key_from_message( const T &t ) {
+ return t.key();
+ }
+#endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */
+
+} // d1
+} // detail
+} // tbb
+
+#include "detail/_flow_graph_trace_impl.h"
+#include "detail/_hash_compare.h"
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+#include "detail/_flow_graph_body_impl.h"
+#include "detail/_flow_graph_cache_impl.h"
+#include "detail/_flow_graph_types_impl.h"
+
+using namespace graph_policy_namespace;
+
+template <typename C, typename N>
+graph_iterator<C,N>::graph_iterator(C *g, bool begin) : my_graph(g), current_node(NULL)
+{
+ if (begin) current_node = my_graph->my_nodes;
+ //else it is an end iterator by default
+}
+
+template <typename C, typename N>
+typename graph_iterator<C,N>::reference graph_iterator<C,N>::operator*() const {
+ __TBB_ASSERT(current_node, "graph_iterator at end");
+ return *operator->();
+}
+
+template <typename C, typename N>
+typename graph_iterator<C,N>::pointer graph_iterator<C,N>::operator->() const {
+ return current_node;
+}
+
+template <typename C, typename N>
+void graph_iterator<C,N>::internal_forward() {
+ if (current_node) current_node = current_node->next;
+}
+
+//! Constructs a graph with isolated task_group_context
+inline graph::graph() : my_wait_context(0), my_nodes(NULL), my_nodes_last(NULL), my_task_arena(NULL) {
+ prepare_task_arena();
+ own_context = true;
+ cancelled = false;
+ caught_exception = false;
+ my_context = new (r1::cache_aligned_allocate(sizeof(task_group_context))) task_group_context(FLOW_TASKS);
+ fgt_graph(this);
+ my_is_active = true;
+}
+
+inline graph::graph(task_group_context& use_this_context) :
+ my_wait_context(0), my_context(&use_this_context), my_nodes(NULL), my_nodes_last(NULL), my_task_arena(NULL) {
+ prepare_task_arena();
+ own_context = false;
+ cancelled = false;
+ caught_exception = false;
+ fgt_graph(this);
+ my_is_active = true;
+}
+
+inline graph::~graph() {
+ wait_for_all();
+ if (own_context) {
+ my_context->~task_group_context();
+ r1::cache_aligned_deallocate(my_context);
+ }
+ delete my_task_arena;
+}
+
+inline void graph::reserve_wait() {
+ my_wait_context.reserve();
+ fgt_reserve_wait(this);
+}
+
+inline void graph::release_wait() {
+ fgt_release_wait(this);
+ my_wait_context.release();
+}
+
+inline void graph::register_node(graph_node *n) {
+ n->next = NULL;
+ {
+ spin_mutex::scoped_lock lock(nodelist_mutex);
+ n->prev = my_nodes_last;
+ if (my_nodes_last) my_nodes_last->next = n;
+ my_nodes_last = n;
+ if (!my_nodes) my_nodes = n;
+ }
+}
+
+inline void graph::remove_node(graph_node *n) {
+ {
+ spin_mutex::scoped_lock lock(nodelist_mutex);
+ __TBB_ASSERT(my_nodes && my_nodes_last, "graph::remove_node: Error: no registered nodes");
+ if (n->prev) n->prev->next = n->next;
+ if (n->next) n->next->prev = n->prev;
+ if (my_nodes_last == n) my_nodes_last = n->prev;
+ if (my_nodes == n) my_nodes = n->next;
+ }
+ n->prev = n->next = NULL;
+}
+
+inline void graph::reset( reset_flags f ) {
+ // reset context
+ deactivate_graph(*this);
+
+ my_context->reset();
+ cancelled = false;
+ caught_exception = false;
+ // reset all the nodes comprising the graph
+ for(iterator ii = begin(); ii != end(); ++ii) {
+ graph_node *my_p = &(*ii);
+ my_p->reset_node(f);
+ }
+ // Reattach the arena. Might be useful to run the graph in a particular task_arena
+ // while not limiting graph lifetime to a single task_arena::execute() call.
+ prepare_task_arena( /*reinit=*/true );
+ activate_graph(*this);
+}
+
+inline void graph::cancel() {
+ my_context->cancel_group_execution();
+}
+
+inline graph::iterator graph::begin() { return iterator(this, true); }
+
+inline graph::iterator graph::end() { return iterator(this, false); }
+
+inline graph::const_iterator graph::begin() const { return const_iterator(this, true); }
+
+inline graph::const_iterator graph::end() const { return const_iterator(this, false); }
+
+inline graph::const_iterator graph::cbegin() const { return const_iterator(this, true); }
+
+inline graph::const_iterator graph::cend() const { return const_iterator(this, false); }
+
+inline graph_node::graph_node(graph& g) : my_graph(g) {
+ my_graph.register_node(this);
+}
+
+inline graph_node::~graph_node() {
+ my_graph.remove_node(this);
+}
+
+#include "detail/_flow_graph_node_impl.h"
+
+
+//! An executable node that acts as a source, i.e. it has no predecessors
+
+template < typename Output >
+class input_node : public graph_node, public sender< Output > {
+public:
+ //! The type of the output message, which is complete
+ typedef Output output_type;
+
+ //! The type of successors of this node
+ typedef typename sender<output_type>::successor_type successor_type;
+
+ // Input node has no input type
+ typedef null_type input_type;
+
+ //! Constructor for a node with a successor
+ template< typename Body >
+ __TBB_NOINLINE_SYM input_node( graph &g, Body body )
+ : graph_node(g), my_active(false)
+ , my_body( new input_body_leaf< output_type, Body>(body) )
+ , my_init_body( new input_body_leaf< output_type, Body>(body) )
+ , my_successors(this), my_reserved(false), my_has_cached_item(false)
+ {
+ fgt_node_with_body(CODEPTR(), FLOW_INPUT_NODE, &this->my_graph,
+ static_cast<sender<output_type> *>(this), this->my_body);
+ }
+
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+ template <typename Body, typename... Successors>
+ input_node( const node_set<order::preceding, Successors...>& successors, Body body )
+ : input_node(successors.graph_reference(), body)
+ {
+ make_edges(*this, successors);
+ }
+#endif
+
+ //! Copy constructor
+ __TBB_NOINLINE_SYM input_node( const input_node& src )
+ : graph_node(src.my_graph), sender<Output>()
+ , my_active(false)
+ , my_body(src.my_init_body->clone()), my_init_body(src.my_init_body->clone())
+ , my_successors(this), my_reserved(false), my_has_cached_item(false)
+ {
+ fgt_node_with_body(CODEPTR(), FLOW_INPUT_NODE, &this->my_graph,
+ static_cast<sender<output_type> *>(this), this->my_body);
+ }
+
+ //! The destructor
+ ~input_node() { delete my_body; delete my_init_body; }
+
+ //! Add a new successor to this node
+ bool register_successor( successor_type &r ) override {
+ spin_mutex::scoped_lock lock(my_mutex);
+ my_successors.register_successor(r);
+ if ( my_active )
+ spawn_put();
+ return true;
+ }
+
+ //! Removes a successor from this node
+ bool remove_successor( successor_type &r ) override {
+ spin_mutex::scoped_lock lock(my_mutex);
+ my_successors.remove_successor(r);
+ return true;
+ }
+
+ //! Request an item from the node
+ bool try_get( output_type &v ) override {
+ spin_mutex::scoped_lock lock(my_mutex);
+ if ( my_reserved )
+ return false;
+
+ if ( my_has_cached_item ) {
+ v = my_cached_item;
+ my_has_cached_item = false;
+ return true;
+ }
+ // we've been asked to provide an item, but we have none. enqueue a task to
+ // provide one.
+ if ( my_active )
+ spawn_put();
+ return false;
+ }
+
+ //! Reserves an item.
+ bool try_reserve( output_type &v ) override {
+ spin_mutex::scoped_lock lock(my_mutex);
+ if ( my_reserved ) {
+ return false;
+ }
+
+ if ( my_has_cached_item ) {
+ v = my_cached_item;
+ my_reserved = true;
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ //! Release a reserved item.
+ /** true = item has been released and so remains in sender, dest must request or reserve future items */
+ bool try_release( ) override {
+ spin_mutex::scoped_lock lock(my_mutex);
+ __TBB_ASSERT( my_reserved && my_has_cached_item, "releasing non-existent reservation" );
+ my_reserved = false;
+ if(!my_successors.empty())
+ spawn_put();
+ return true;
+ }
+
+ //! Consumes a reserved item
+ bool try_consume( ) override {
+ spin_mutex::scoped_lock lock(my_mutex);
+ __TBB_ASSERT( my_reserved && my_has_cached_item, "consuming non-existent reservation" );
+ my_reserved = false;
+ my_has_cached_item = false;
+ if ( !my_successors.empty() ) {
+ spawn_put();
+ }
+ return true;
+ }
+
+ //! Activates a node that was created in the inactive state
+ void activate() {
+ spin_mutex::scoped_lock lock(my_mutex);
+ my_active = true;
+ if (!my_successors.empty())
+ spawn_put();
+ }
+
+ template<typename Body>
+ Body copy_function_object() {
+ input_body<output_type> &body_ref = *this->my_body;
+ return dynamic_cast< input_body_leaf<output_type, Body> & >(body_ref).get_body();
+ }
+
+protected:
+
+ //! resets the input_node to its initial state
+ void reset_node( reset_flags f) override {
+ my_active = false;
+ my_reserved = false;
+ my_has_cached_item = false;
+
+ if(f & rf_clear_edges) my_successors.clear();
+ if(f & rf_reset_bodies) {
+ input_body<output_type> *tmp = my_init_body->clone();
+ delete my_body;
+ my_body = tmp;
+ }
+ }
+
+private:
+ spin_mutex my_mutex;
+ bool my_active;
+ input_body<output_type> *my_body;
+ input_body<output_type> *my_init_body;
+ broadcast_cache< output_type > my_successors;
+ bool my_reserved;
+ bool my_has_cached_item;
+ output_type my_cached_item;
+
+ // used by apply_body_bypass, can invoke body of node.
+ bool try_reserve_apply_body(output_type &v) {
+ spin_mutex::scoped_lock lock(my_mutex);
+ if ( my_reserved ) {
+ return false;
+ }
+ if ( !my_has_cached_item ) {
+ flow_control control;
+
+ fgt_begin_body( my_body );
+
+ my_cached_item = (*my_body)(control);
+ my_has_cached_item = !control.is_pipeline_stopped;
+
+ fgt_end_body( my_body );
+ }
+ if ( my_has_cached_item ) {
+ v = my_cached_item;
+ my_reserved = true;
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ graph_task* create_put_task() {
+ small_object_allocator allocator{};
+ typedef input_node_task_bypass< input_node<output_type> > task_type;
+ graph_task* t = allocator.new_object<task_type>(my_graph, allocator, *this);
+ my_graph.reserve_wait();
+ return t;
+ }
+
+ //! Spawns a task that applies the body
+ void spawn_put( ) {
+ if(is_graph_active(this->my_graph)) {
+ spawn_in_graph_arena(this->my_graph, *create_put_task());
+ }
+ }
+
+ friend class input_node_task_bypass< input_node<output_type> >;
+    //! Applies the body. Returning SUCCESSFULLY_ENQUEUED is okay; forward_task_bypass will handle it.
+ graph_task* apply_body_bypass( ) {
+ output_type v;
+ if ( !try_reserve_apply_body(v) )
+ return NULL;
+
+ graph_task *last_task = my_successors.try_put_task(v);
+ if ( last_task )
+ try_consume();
+ else
+ try_release();
+ return last_task;
+ }
+}; // class input_node
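+
+// Usage sketch for illustration only (the example_* helper below is hypothetical and kept
+// under #if 0 so it never compiles). It shows the input_node API defined above: the body
+// takes a flow_control reference and calls stop() to end the stream, and the node emits
+// nothing until activate() is called.
+#if 0
+inline void example_input_node_usage() {
+    graph g;
+    int i = 0;
+    input_node<int> src( g, [&i](flow_control& fc) -> int {
+        if (i >= 10) { fc.stop(); return 0; }   // end of stream after ten items
+        return i++;
+    } );
+    function_node<int> sink( g, unlimited, [](int /*v*/) { /* consume the value */ } );
+    make_edge(src, sink);
+    src.activate();        // input_node is constructed inactive
+    g.wait_for_all();
+}
+#endif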
+
+//! Implements a function node that supports Input -> Output
+template<typename Input, typename Output = continue_msg, typename Policy = queueing>
+class function_node
+ : public graph_node
+ , public function_input< Input, Output, Policy, cache_aligned_allocator<Input> >
+ , public function_output<Output>
+{
+ typedef cache_aligned_allocator<Input> internals_allocator;
+
+public:
+ typedef Input input_type;
+ typedef Output output_type;
+ typedef function_input<input_type,output_type,Policy,internals_allocator> input_impl_type;
+ typedef function_input_queue<input_type, internals_allocator> input_queue_type;
+ typedef function_output<output_type> fOutput_type;
+ typedef typename input_impl_type::predecessor_type predecessor_type;
+ typedef typename fOutput_type::successor_type successor_type;
+
+ using input_impl_type::my_predecessors;
+
+ //! Constructor
+ // input_queue_type is allocated here, but destroyed in the function_input_base.
+ // TODO: pass the graph_buffer_policy to the function_input_base so it can all
+ // be done in one place. This would be an interface-breaking change.
+ template< typename Body >
+ __TBB_NOINLINE_SYM function_node( graph &g, size_t concurrency,
+ Body body, Policy = Policy(), node_priority_t a_priority = no_priority )
+ : graph_node(g), input_impl_type(g, concurrency, body, a_priority),
+ fOutput_type(g) {
+ fgt_node_with_body( CODEPTR(), FLOW_FUNCTION_NODE, &this->my_graph,
+ static_cast<receiver<input_type> *>(this), static_cast<sender<output_type> *>(this), this->my_body );
+ }
+
+ template <typename Body>
+ function_node( graph& g, size_t concurrency, Body body, node_priority_t a_priority )
+ : function_node(g, concurrency, body, Policy(), a_priority) {}
+
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+ template <typename Body, typename... Args>
+ function_node( const node_set<Args...>& nodes, size_t concurrency, Body body,
+ Policy p = Policy(), node_priority_t a_priority = no_priority )
+ : function_node(nodes.graph_reference(), concurrency, body, p, a_priority) {
+ make_edges_in_order(nodes, *this);
+ }
+
+ template <typename Body, typename... Args>
+ function_node( const node_set<Args...>& nodes, size_t concurrency, Body body, node_priority_t a_priority )
+ : function_node(nodes, concurrency, body, Policy(), a_priority) {}
+#endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+
+ //! Copy constructor
+ __TBB_NOINLINE_SYM function_node( const function_node& src ) :
+ graph_node(src.my_graph),
+ input_impl_type(src),
+ fOutput_type(src.my_graph) {
+ fgt_node_with_body( CODEPTR(), FLOW_FUNCTION_NODE, &this->my_graph,
+ static_cast<receiver<input_type> *>(this), static_cast<sender<output_type> *>(this), this->my_body );
+ }
+
+protected:
+ template< typename R, typename B > friend class run_and_put_task;
+ template<typename X, typename Y> friend class broadcast_cache;
+ template<typename X, typename Y> friend class round_robin_cache;
+ using input_impl_type::try_put_task;
+
+ broadcast_cache<output_type> &successors () override { return fOutput_type::my_successors; }
+
+ void reset_node(reset_flags f) override {
+ input_impl_type::reset_function_input(f);
+ // TODO: use clear() instead.
+ if(f & rf_clear_edges) {
+ successors().clear();
+ my_predecessors.clear();
+ }
+ __TBB_ASSERT(!(f & rf_clear_edges) || successors().empty(), "function_node successors not empty");
+ __TBB_ASSERT(this->my_predecessors.empty(), "function_node predecessors not empty");
+ }
+
+}; // class function_node
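+
+// Usage sketch for illustration only (the example_* helper below is hypothetical and kept
+// under #if 0). It shows the function_node API above: the concurrency argument
+// (unlimited/serial) and the Policy parameter control how many bodies run at once and how
+// inputs are buffered.
+#if 0
+inline void example_function_node_usage() {
+    graph g;
+    function_node<int, int> square( g, unlimited, [](int v) { return v * v; } );
+    function_node<int> sink( g, serial, [](int /*v*/) { /* consume one value at a time */ } );
+    make_edge(square, sink);
+    square.try_put(3);
+    g.wait_for_all();
+}
+#endif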
+
+//! implements a function node that supports Input -> (set of outputs)
+// Output is a tuple of output types.
+template<typename Input, typename Output, typename Policy = queueing>
+class multifunction_node :
+ public graph_node,
+ public multifunction_input
+ <
+ Input,
+ typename wrap_tuple_elements<
+ std::tuple_size<Output>::value, // #elements in tuple
+ multifunction_output, // wrap this around each element
+ Output // the tuple providing the types
+ >::type,
+ Policy,
+ cache_aligned_allocator<Input>
+ >
+{
+ typedef cache_aligned_allocator<Input> internals_allocator;
+
+protected:
+ static const int N = std::tuple_size<Output>::value;
+public:
+ typedef Input input_type;
+ typedef null_type output_type;
+ typedef typename wrap_tuple_elements<N,multifunction_output, Output>::type output_ports_type;
+ typedef multifunction_input<
+ input_type, output_ports_type, Policy, internals_allocator> input_impl_type;
+ typedef function_input_queue<input_type, internals_allocator> input_queue_type;
+private:
+ using input_impl_type::my_predecessors;
+public:
+ template<typename Body>
+ __TBB_NOINLINE_SYM multifunction_node(
+ graph &g, size_t concurrency,
+ Body body, Policy = Policy(), node_priority_t a_priority = no_priority
+ ) : graph_node(g), input_impl_type(g, concurrency, body, a_priority) {
+ fgt_multioutput_node_with_body<N>(
+ CODEPTR(), FLOW_MULTIFUNCTION_NODE,
+ &this->my_graph, static_cast<receiver<input_type> *>(this),
+ this->output_ports(), this->my_body
+ );
+ }
+
+ template <typename Body>
+ __TBB_NOINLINE_SYM multifunction_node(graph& g, size_t concurrency, Body body, node_priority_t a_priority)
+ : multifunction_node(g, concurrency, body, Policy(), a_priority) {}
+
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+ template <typename Body, typename... Args>
+ __TBB_NOINLINE_SYM multifunction_node(const node_set<Args...>& nodes, size_t concurrency, Body body,
+ Policy p = Policy(), node_priority_t a_priority = no_priority)
+ : multifunction_node(nodes.graph_reference(), concurrency, body, p, a_priority) {
+ make_edges_in_order(nodes, *this);
+ }
+
+ template <typename Body, typename... Args>
+ __TBB_NOINLINE_SYM multifunction_node(const node_set<Args...>& nodes, size_t concurrency, Body body, node_priority_t a_priority)
+ : multifunction_node(nodes, concurrency, body, Policy(), a_priority) {}
+#endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+
+ __TBB_NOINLINE_SYM multifunction_node( const multifunction_node &other) :
+ graph_node(other.my_graph), input_impl_type(other) {
+ fgt_multioutput_node_with_body<N>( CODEPTR(), FLOW_MULTIFUNCTION_NODE,
+ &this->my_graph, static_cast<receiver<input_type> *>(this),
+ this->output_ports(), this->my_body );
+ }
+
+ // all the guts are in multifunction_input...
+protected:
+ void reset_node(reset_flags f) override { input_impl_type::reset(f); }
+}; // multifunction_node
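+
+// Usage sketch for illustration only (the example_* helper below is hypothetical and kept
+// under #if 0). A multifunction_node body receives the input and a reference to
+// output_ports_type and may put to any subset of its ports.
+#if 0
+inline void example_multifunction_node_usage() {
+    graph g;
+    typedef multifunction_node< int, std::tuple<int, int> > mf_node;
+    mf_node router( g, unlimited, [](const int& v, mf_node::output_ports_type& ports) {
+        if (v % 2 == 0) std::get<0>(ports).try_put(v);   // evens to port 0
+        else            std::get<1>(ports).try_put(v);   // odds to port 1
+    } );
+    queue_node<int> evens(g), odds(g);
+    make_edge(output_port<0>(router), evens);
+    make_edge(output_port<1>(router), odds);
+    router.try_put(1);
+    router.try_put(2);
+    g.wait_for_all();
+}
+#endif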
+
+//! split_node: accepts a tuple as input, forwards each element of the tuple to its
+// successors. The node has unlimited concurrency, so it does not reject inputs.
+template<typename TupleType>
+class split_node : public graph_node, public receiver<TupleType> {
+ static const int N = std::tuple_size<TupleType>::value;
+ typedef receiver<TupleType> base_type;
+public:
+ typedef TupleType input_type;
+ typedef typename wrap_tuple_elements<
+ N, // #elements in tuple
+ multifunction_output, // wrap this around each element
+ TupleType // the tuple providing the types
+ >::type output_ports_type;
+
+ __TBB_NOINLINE_SYM explicit split_node(graph &g)
+ : graph_node(g),
+ my_output_ports(init_output_ports<output_ports_type>::call(g, my_output_ports))
+ {
+ fgt_multioutput_node<N>(CODEPTR(), FLOW_SPLIT_NODE, &this->my_graph,
+ static_cast<receiver<input_type> *>(this), this->output_ports());
+ }
+
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+ template <typename... Args>
+ __TBB_NOINLINE_SYM split_node(const node_set<Args...>& nodes) : split_node(nodes.graph_reference()) {
+ make_edges_in_order(nodes, *this);
+ }
+#endif
+
+ __TBB_NOINLINE_SYM split_node(const split_node& other)
+ : graph_node(other.my_graph), base_type(other),
+ my_output_ports(init_output_ports<output_ports_type>::call(other.my_graph, my_output_ports))
+ {
+ fgt_multioutput_node<N>(CODEPTR(), FLOW_SPLIT_NODE, &this->my_graph,
+ static_cast<receiver<input_type> *>(this), this->output_ports());
+ }
+
+ output_ports_type &output_ports() { return my_output_ports; }
+
+protected:
+ graph_task *try_put_task(const TupleType& t) override {
+        // Sending split messages in parallel is not justified, as overheads would prevail.
+        // Also, we do not have successors here, so we simply report the returned task as successful.
+ return emit_element<N>::emit_this(this->my_graph, t, output_ports());
+ }
+ void reset_node(reset_flags f) override {
+ if (f & rf_clear_edges)
+ clear_element<N>::clear_this(my_output_ports);
+
+ __TBB_ASSERT(!(f & rf_clear_edges) || clear_element<N>::this_empty(my_output_ports), "split_node reset failed");
+ }
+ graph& graph_reference() const override {
+ return my_graph;
+ }
+
+private:
+ output_ports_type my_output_ports;
+};
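+
+// Usage sketch for illustration only (the example_* helper below is hypothetical and kept
+// under #if 0). split_node unpacks each incoming tuple and forwards element i through
+// output port i.
+#if 0
+inline void example_split_node_usage() {
+    graph g;
+    split_node< std::tuple<int, float> > splitter(g);
+    queue_node<int> ints(g);
+    queue_node<float> floats(g);
+    make_edge(output_port<0>(splitter), ints);
+    make_edge(output_port<1>(splitter), floats);
+    splitter.try_put(std::make_tuple(1, 2.5f));
+    g.wait_for_all();
+}
+#endif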
+
+//! Implements an executable node that supports continue_msg -> Output
+template <typename Output, typename Policy = Policy<void> >
+class continue_node : public graph_node, public continue_input<Output, Policy>,
+ public function_output<Output> {
+public:
+ typedef continue_msg input_type;
+ typedef Output output_type;
+ typedef continue_input<Output, Policy> input_impl_type;
+ typedef function_output<output_type> fOutput_type;
+ typedef typename input_impl_type::predecessor_type predecessor_type;
+ typedef typename fOutput_type::successor_type successor_type;
+
+ //! Constructor for executable node with continue_msg -> Output
+ template <typename Body >
+ __TBB_NOINLINE_SYM continue_node(
+ graph &g,
+ Body body, Policy = Policy(), node_priority_t a_priority = no_priority
+ ) : graph_node(g), input_impl_type( g, body, a_priority ),
+ fOutput_type(g) {
+ fgt_node_with_body( CODEPTR(), FLOW_CONTINUE_NODE, &this->my_graph,
+ static_cast<receiver<input_type> *>(this),
+ static_cast<sender<output_type> *>(this), this->my_body );
+ }
+
+ template <typename Body>
+ continue_node( graph& g, Body body, node_priority_t a_priority )
+ : continue_node(g, body, Policy(), a_priority) {}
+
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+ template <typename Body, typename... Args>
+ continue_node( const node_set<Args...>& nodes, Body body,
+ Policy p = Policy(), node_priority_t a_priority = no_priority )
+ : continue_node(nodes.graph_reference(), body, p, a_priority ) {
+ make_edges_in_order(nodes, *this);
+ }
+ template <typename Body, typename... Args>
+ continue_node( const node_set<Args...>& nodes, Body body, node_priority_t a_priority)
+ : continue_node(nodes, body, Policy(), a_priority) {}
+#endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+
+ //! Constructor for executable node with continue_msg -> Output
+ template <typename Body >
+ __TBB_NOINLINE_SYM continue_node(
+ graph &g, int number_of_predecessors,
+ Body body, Policy = Policy(), node_priority_t a_priority = no_priority
+ ) : graph_node(g)
+ , input_impl_type(g, number_of_predecessors, body, a_priority),
+ fOutput_type(g) {
+ fgt_node_with_body( CODEPTR(), FLOW_CONTINUE_NODE, &this->my_graph,
+ static_cast<receiver<input_type> *>(this),
+ static_cast<sender<output_type> *>(this), this->my_body );
+ }
+
+ template <typename Body>
+ continue_node( graph& g, int number_of_predecessors, Body body, node_priority_t a_priority)
+ : continue_node(g, number_of_predecessors, body, Policy(), a_priority) {}
+
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+ template <typename Body, typename... Args>
+ continue_node( const node_set<Args...>& nodes, int number_of_predecessors,
+ Body body, Policy p = Policy(), node_priority_t a_priority = no_priority )
+ : continue_node(nodes.graph_reference(), number_of_predecessors, body, p, a_priority) {
+ make_edges_in_order(nodes, *this);
+ }
+
+ template <typename Body, typename... Args>
+ continue_node( const node_set<Args...>& nodes, int number_of_predecessors,
+ Body body, node_priority_t a_priority )
+ : continue_node(nodes, number_of_predecessors, body, Policy(), a_priority) {}
+#endif
+
+ //! Copy constructor
+ __TBB_NOINLINE_SYM continue_node( const continue_node& src ) :
+ graph_node(src.my_graph), input_impl_type(src),
+ function_output<Output>(src.my_graph) {
+ fgt_node_with_body( CODEPTR(), FLOW_CONTINUE_NODE, &this->my_graph,
+ static_cast<receiver<input_type> *>(this),
+ static_cast<sender<output_type> *>(this), this->my_body );
+ }
+
+protected:
+ template< typename R, typename B > friend class run_and_put_task;
+ template<typename X, typename Y> friend class broadcast_cache;
+ template<typename X, typename Y> friend class round_robin_cache;
+ using input_impl_type::try_put_task;
+ broadcast_cache<output_type> &successors () override { return fOutput_type::my_successors; }
+
+ void reset_node(reset_flags f) override {
+ input_impl_type::reset_receiver(f);
+        if(f & rf_clear_edges) successors().clear();
+ __TBB_ASSERT(!(f & rf_clear_edges) || successors().empty(), "continue_node not reset");
+ }
+}; // continue_node
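+
+// Usage sketch for illustration only (the example_* helper below is hypothetical and kept
+// under #if 0). A continue_node runs its body once it has received a continue_msg from
+// each registered predecessor, which makes it a natural join point for dependency graphs.
+#if 0
+inline void example_continue_node_usage() {
+    graph g;
+    broadcast_node<continue_msg> start(g);
+    continue_node<continue_msg> step( g, [](const continue_msg&) { /* do work */ return continue_msg(); } );
+    make_edge(start, step);
+    start.try_put(continue_msg());
+    g.wait_for_all();
+}
+#endif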
+
+//! Forwards messages of type T to all successors
+template <typename T>
+class broadcast_node : public graph_node, public receiver<T>, public sender<T> {
+public:
+ typedef T input_type;
+ typedef T output_type;
+ typedef typename receiver<input_type>::predecessor_type predecessor_type;
+ typedef typename sender<output_type>::successor_type successor_type;
+private:
+ broadcast_cache<input_type> my_successors;
+public:
+
+ __TBB_NOINLINE_SYM explicit broadcast_node(graph& g) : graph_node(g), my_successors(this) {
+ fgt_node( CODEPTR(), FLOW_BROADCAST_NODE, &this->my_graph,
+ static_cast<receiver<input_type> *>(this), static_cast<sender<output_type> *>(this) );
+ }
+
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+ template <typename... Args>
+ broadcast_node(const node_set<Args...>& nodes) : broadcast_node(nodes.graph_reference()) {
+ make_edges_in_order(nodes, *this);
+ }
+#endif
+
+ // Copy constructor
+ __TBB_NOINLINE_SYM broadcast_node( const broadcast_node& src ) : broadcast_node(src.my_graph) {}
+
+ //! Adds a successor
+ bool register_successor( successor_type &r ) override {
+ my_successors.register_successor( r );
+ return true;
+ }
+
+ //! Removes s as a successor
+ bool remove_successor( successor_type &r ) override {
+ my_successors.remove_successor( r );
+ return true;
+ }
+
+protected:
+ template< typename R, typename B > friend class run_and_put_task;
+ template<typename X, typename Y> friend class broadcast_cache;
+ template<typename X, typename Y> friend class round_robin_cache;
+    //! Builds a task to run the successors if possible; when no task is produced, the put is reported as SUCCESSFULLY_ENQUEUED.
+ graph_task *try_put_task(const T& t) override {
+ graph_task *new_task = my_successors.try_put_task(t);
+ if (!new_task) new_task = SUCCESSFULLY_ENQUEUED;
+ return new_task;
+ }
+
+ graph& graph_reference() const override {
+ return my_graph;
+ }
+
+ void reset_node(reset_flags f) override {
+ if (f&rf_clear_edges) {
+ my_successors.clear();
+ }
+ __TBB_ASSERT(!(f & rf_clear_edges) || my_successors.empty(), "Error resetting broadcast_node");
+ }
+}; // broadcast_node
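+
+// Usage sketch for illustration only (the example_* helper below is hypothetical and kept
+// under #if 0). broadcast_node copies every incoming message to all registered successors
+// and buffers nothing.
+#if 0
+inline void example_broadcast_node_usage() {
+    graph g;
+    broadcast_node<int> fan_out(g);
+    function_node<int> left ( g, unlimited, [](int /*v*/) { /* consume */ } );
+    function_node<int> right( g, unlimited, [](int /*v*/) { /* consume */ } );
+    make_edge(fan_out, left);
+    make_edge(fan_out, right);
+    fan_out.try_put(42);
+    g.wait_for_all();
+}
+#endif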
+
+//! Forwards messages in arbitrary order
+template <typename T>
+class buffer_node
+ : public graph_node
+ , public reservable_item_buffer< T, cache_aligned_allocator<T> >
+ , public receiver<T>, public sender<T>
+{
+ typedef cache_aligned_allocator<T> internals_allocator;
+
+public:
+ typedef T input_type;
+ typedef T output_type;
+ typedef typename receiver<input_type>::predecessor_type predecessor_type;
+ typedef typename sender<output_type>::successor_type successor_type;
+ typedef buffer_node<T> class_type;
+
+protected:
+ typedef size_t size_type;
+ round_robin_cache< T, null_rw_mutex > my_successors;
+
+ friend class forward_task_bypass< class_type >;
+
+    enum op_type { reg_succ, rem_succ, req_item, res_item, rel_res, con_res, put_item, try_fwd_task };
+
+ // implements the aggregator_operation concept
+ class buffer_operation : public aggregated_operation< buffer_operation > {
+ public:
+ char type;
+ T* elem;
+ graph_task* ltask;
+ successor_type *r;
+
+ buffer_operation(const T& e, op_type t) : type(char(t))
+ , elem(const_cast<T*>(&e)) , ltask(NULL)
+ {}
+ buffer_operation(op_type t) : type(char(t)), ltask(NULL) {}
+ };
+
+ bool forwarder_busy;
+ typedef aggregating_functor<class_type, buffer_operation> handler_type;
+ friend class aggregating_functor<class_type, buffer_operation>;
+ aggregator< handler_type, buffer_operation> my_aggregator;
+
+ virtual void handle_operations(buffer_operation *op_list) {
+ handle_operations_impl(op_list, this);
+ }
+
+ template<typename derived_type>
+ void handle_operations_impl(buffer_operation *op_list, derived_type* derived) {
+ __TBB_ASSERT(static_cast<class_type*>(derived) == this, "'this' is not a base class for derived");
+
+ buffer_operation *tmp = NULL;
+ bool try_forwarding = false;
+ while (op_list) {
+ tmp = op_list;
+ op_list = op_list->next;
+ switch (tmp->type) {
+ case reg_succ: internal_reg_succ(tmp); try_forwarding = true; break;
+ case rem_succ: internal_rem_succ(tmp); break;
+ case req_item: internal_pop(tmp); break;
+ case res_item: internal_reserve(tmp); break;
+ case rel_res: internal_release(tmp); try_forwarding = true; break;
+ case con_res: internal_consume(tmp); try_forwarding = true; break;
+ case put_item: try_forwarding = internal_push(tmp); break;
+ case try_fwd_task: internal_forward_task(tmp); break;
+ }
+ }
+
+ derived->order();
+
+ if (try_forwarding && !forwarder_busy) {
+ if(is_graph_active(this->my_graph)) {
+ forwarder_busy = true;
+ typedef forward_task_bypass<class_type> task_type;
+ small_object_allocator allocator{};
+ graph_task* new_task = allocator.new_object<task_type>(graph_reference(), allocator, *this);
+ my_graph.reserve_wait();
+ // tmp should point to the last item handled by the aggregator. This is the operation
+ // the handling thread enqueued. So modifying that record will be okay.
+ // TODO revamp: check that the issue is still present
+ // workaround for icc bug (at least 12.0 and 13.0)
+ // error: function "tbb::flow::interfaceX::combine_tasks" cannot be called with the given argument list
+ // argument types are: (graph, graph_task *, graph_task *)
+ graph_task *z = tmp->ltask;
+ graph &g = this->my_graph;
+ tmp->ltask = combine_tasks(g, z, new_task); // in case the op generated a task
+ }
+ }
+ } // handle_operations
+
+ inline graph_task *grab_forwarding_task( buffer_operation &op_data) {
+ return op_data.ltask;
+ }
+
+ inline bool enqueue_forwarding_task(buffer_operation &op_data) {
+ graph_task *ft = grab_forwarding_task(op_data);
+ if(ft) {
+ spawn_in_graph_arena(graph_reference(), *ft);
+ return true;
+ }
+ return false;
+ }
+
+ //! This is executed by an enqueued task, the "forwarder"
+ virtual graph_task *forward_task() {
+ buffer_operation op_data(try_fwd_task);
+ graph_task *last_task = NULL;
+ do {
+ op_data.status = WAIT;
+ op_data.ltask = NULL;
+ my_aggregator.execute(&op_data);
+
+ // workaround for icc bug
+ graph_task *xtask = op_data.ltask;
+ graph& g = this->my_graph;
+ last_task = combine_tasks(g, last_task, xtask);
+        } while (op_data.status == SUCCEEDED);
+ return last_task;
+ }
+
+ //! Register successor
+ virtual void internal_reg_succ(buffer_operation *op) {
+ my_successors.register_successor(*(op->r));
+ op->status.store(SUCCEEDED, std::memory_order_release);
+ }
+
+ //! Remove successor
+ virtual void internal_rem_succ(buffer_operation *op) {
+ my_successors.remove_successor(*(op->r));
+ op->status.store(SUCCEEDED, std::memory_order_release);
+ }
+
+private:
+ void order() {}
+
+ bool is_item_valid() {
+ return this->my_item_valid(this->my_tail - 1);
+ }
+
+ void try_put_and_add_task(graph_task*& last_task) {
+ graph_task *new_task = my_successors.try_put_task(this->back());
+ if (new_task) {
+ // workaround for icc bug
+ graph& g = this->my_graph;
+ last_task = combine_tasks(g, last_task, new_task);
+ this->destroy_back();
+ }
+ }
+
+protected:
+ //! Tries to forward valid items to successors
+ virtual void internal_forward_task(buffer_operation *op) {
+ internal_forward_task_impl(op, this);
+ }
+
+ template<typename derived_type>
+ void internal_forward_task_impl(buffer_operation *op, derived_type* derived) {
+ __TBB_ASSERT(static_cast<class_type*>(derived) == this, "'this' is not a base class for derived");
+
+ if (this->my_reserved || !derived->is_item_valid()) {
+ op->status.store(FAILED, std::memory_order_release);
+ this->forwarder_busy = false;
+ return;
+ }
+ // Try forwarding, giving each successor a chance
+ graph_task* last_task = NULL;
+ size_type counter = my_successors.size();
+ for (; counter > 0 && derived->is_item_valid(); --counter)
+ derived->try_put_and_add_task(last_task);
+
+ op->ltask = last_task; // return task
+ if (last_task && !counter) {
+ op->status.store(SUCCEEDED, std::memory_order_release);
+ }
+ else {
+ op->status.store(FAILED, std::memory_order_release);
+ forwarder_busy = false;
+ }
+ }
+
+ virtual bool internal_push(buffer_operation *op) {
+ this->push_back(*(op->elem));
+ op->status.store(SUCCEEDED, std::memory_order_release);
+ return true;
+ }
+
+ virtual void internal_pop(buffer_operation *op) {
+ if(this->pop_back(*(op->elem))) {
+ op->status.store(SUCCEEDED, std::memory_order_release);
+ }
+ else {
+ op->status.store(FAILED, std::memory_order_release);
+ }
+ }
+
+ virtual void internal_reserve(buffer_operation *op) {
+ if(this->reserve_front(*(op->elem))) {
+ op->status.store(SUCCEEDED, std::memory_order_release);
+ }
+ else {
+ op->status.store(FAILED, std::memory_order_release);
+ }
+ }
+
+ virtual void internal_consume(buffer_operation *op) {
+ this->consume_front();
+ op->status.store(SUCCEEDED, std::memory_order_release);
+ }
+
+ virtual void internal_release(buffer_operation *op) {
+ this->release_front();
+ op->status.store(SUCCEEDED, std::memory_order_release);
+ }
+
+public:
+ //! Constructor
+ __TBB_NOINLINE_SYM explicit buffer_node( graph &g )
+ : graph_node(g), reservable_item_buffer<T, internals_allocator>(), receiver<T>(),
+ sender<T>(), my_successors(this), forwarder_busy(false)
+ {
+ my_aggregator.initialize_handler(handler_type(this));
+ fgt_node( CODEPTR(), FLOW_BUFFER_NODE, &this->my_graph,
+ static_cast<receiver<input_type> *>(this), static_cast<sender<output_type> *>(this) );
+ }
+
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+ template <typename... Args>
+ buffer_node(const node_set<Args...>& nodes) : buffer_node(nodes.graph_reference()) {
+ make_edges_in_order(nodes, *this);
+ }
+#endif
+
+ //! Copy constructor
+ __TBB_NOINLINE_SYM buffer_node( const buffer_node& src ) : buffer_node(src.my_graph) {}
+
+ //
+ // message sender implementation
+ //
+
+ //! Adds a new successor.
+ /** Adds successor r to the list of successors; may forward tasks. */
+ bool register_successor( successor_type &r ) override {
+ buffer_operation op_data(reg_succ);
+ op_data.r = &r;
+ my_aggregator.execute(&op_data);
+ (void)enqueue_forwarding_task(op_data);
+ return true;
+ }
+
+ //! Removes a successor.
+ /** Removes successor r from the list of successors.
+ It also calls r.remove_predecessor(*this) to remove this node as a predecessor. */
+ bool remove_successor( successor_type &r ) override {
+ // TODO revamp: investigate why full qualification is necessary here
+ tbb::detail::d1::remove_predecessor(r, *this);
+ buffer_operation op_data(rem_succ);
+ op_data.r = &r;
+ my_aggregator.execute(&op_data);
+ // even though this operation does not cause a forward, if we are the handler, and
+ // a forward is scheduled, we may be the first to reach this point after the aggregator,
+ // and so should check for the task.
+ (void)enqueue_forwarding_task(op_data);
+ return true;
+ }
+
+ //! Request an item from the buffer_node
+ /** true = v contains the returned item<BR>
+ false = no item has been returned */
+ bool try_get( T &v ) override {
+ buffer_operation op_data(req_item);
+ op_data.elem = &v;
+ my_aggregator.execute(&op_data);
+ (void)enqueue_forwarding_task(op_data);
+ return (op_data.status==SUCCEEDED);
+ }
+
+ //! Reserves an item.
+ /** false = no item can be reserved<BR>
+ true = an item is reserved */
+ bool try_reserve( T &v ) override {
+ buffer_operation op_data(res_item);
+ op_data.elem = &v;
+ my_aggregator.execute(&op_data);
+ (void)enqueue_forwarding_task(op_data);
+ return (op_data.status==SUCCEEDED);
+ }
+
+ //! Release a reserved item.
+ /** true = item has been released and so remains in sender */
+ bool try_release() override {
+ buffer_operation op_data(rel_res);
+ my_aggregator.execute(&op_data);
+ (void)enqueue_forwarding_task(op_data);
+ return true;
+ }
+
+ //! Consumes a reserved item.
+ /** true = item is removed from sender and reservation removed */
+ bool try_consume() override {
+ buffer_operation op_data(con_res);
+ my_aggregator.execute(&op_data);
+ (void)enqueue_forwarding_task(op_data);
+ return true;
+ }
+
+protected:
+
+ template< typename R, typename B > friend class run_and_put_task;
+ template<typename X, typename Y> friend class broadcast_cache;
+ template<typename X, typename Y> friend class round_robin_cache;
+ //! receive an item, return a task *if possible
+ graph_task *try_put_task(const T &t) override {
+ buffer_operation op_data(t, put_item);
+ my_aggregator.execute(&op_data);
+ graph_task *ft = grab_forwarding_task(op_data);
+        // sequencer_nodes can return failure (if an item has been previously inserted).
+        // We have to spawn the returned task if our own operation fails.
+
+        if(ft && op_data.status == FAILED) {
+            // We have not succeeded in queueing the item, but the call returned a task
+            // (this can happen if another request resulted in a successful forward).
+            // Queue the task and reset the pointer.
+            spawn_in_graph_arena(graph_reference(), *ft); ft = NULL;
+        }
+        else if(!ft && op_data.status == SUCCEEDED) {
+ ft = SUCCESSFULLY_ENQUEUED;
+ }
+ return ft;
+ }
+
+ graph& graph_reference() const override {
+ return my_graph;
+ }
+
+protected:
+ void reset_node( reset_flags f) override {
+ reservable_item_buffer<T, internals_allocator>::reset();
+ // TODO: just clear structures
+ if (f&rf_clear_edges) {
+ my_successors.clear();
+ }
+ forwarder_busy = false;
+ }
+}; // buffer_node
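+
+// Usage sketch for illustration only (the example_* helper below is hypothetical and kept
+// under #if 0). buffer_node stores items and hands them out in arbitrary order, either by
+// forwarding to successors or through explicit try_get calls; all operations above are
+// funneled through the aggregator, so callers never take the lock themselves.
+#if 0
+inline void example_buffer_node_usage() {
+    graph g;
+    buffer_node<int> buf(g);
+    buf.try_put(1);
+    buf.try_put(2);
+    g.wait_for_all();
+    int v = 0;
+    while (buf.try_get(v)) {
+        // v is delivered in no particular order
+    }
+}
+#endif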
+
+//! Forwards messages in FIFO order
+template <typename T>
+class queue_node : public buffer_node<T> {
+protected:
+ typedef buffer_node<T> base_type;
+ typedef typename base_type::size_type size_type;
+ typedef typename base_type::buffer_operation queue_operation;
+ typedef queue_node class_type;
+
+private:
+ template<typename> friend class buffer_node;
+
+ bool is_item_valid() {
+ return this->my_item_valid(this->my_head);
+ }
+
+ void try_put_and_add_task(graph_task*& last_task) {
+ graph_task *new_task = this->my_successors.try_put_task(this->front());
+ if (new_task) {
+ // workaround for icc bug
+ graph& graph_ref = this->graph_reference();
+ last_task = combine_tasks(graph_ref, last_task, new_task);
+ this->destroy_front();
+ }
+ }
+
+protected:
+ void internal_forward_task(queue_operation *op) override {
+ this->internal_forward_task_impl(op, this);
+ }
+
+ void internal_pop(queue_operation *op) override {
+ if ( this->my_reserved || !this->my_item_valid(this->my_head)){
+ op->status.store(FAILED, std::memory_order_release);
+ }
+ else {
+ this->pop_front(*(op->elem));
+ op->status.store(SUCCEEDED, std::memory_order_release);
+ }
+ }
+ void internal_reserve(queue_operation *op) override {
+ if (this->my_reserved || !this->my_item_valid(this->my_head)) {
+ op->status.store(FAILED, std::memory_order_release);
+ }
+ else {
+ this->reserve_front(*(op->elem));
+ op->status.store(SUCCEEDED, std::memory_order_release);
+ }
+ }
+ void internal_consume(queue_operation *op) override {
+ this->consume_front();
+ op->status.store(SUCCEEDED, std::memory_order_release);
+ }
+
+public:
+ typedef T input_type;
+ typedef T output_type;
+ typedef typename receiver<input_type>::predecessor_type predecessor_type;
+ typedef typename sender<output_type>::successor_type successor_type;
+
+ //! Constructor
+ __TBB_NOINLINE_SYM explicit queue_node( graph &g ) : base_type(g) {
+ fgt_node( CODEPTR(), FLOW_QUEUE_NODE, &(this->my_graph),
+ static_cast<receiver<input_type> *>(this),
+ static_cast<sender<output_type> *>(this) );
+ }
+
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+ template <typename... Args>
+ queue_node( const node_set<Args...>& nodes) : queue_node(nodes.graph_reference()) {
+ make_edges_in_order(nodes, *this);
+ }
+#endif
+
+ //! Copy constructor
+ __TBB_NOINLINE_SYM queue_node( const queue_node& src) : base_type(src) {
+ fgt_node( CODEPTR(), FLOW_QUEUE_NODE, &(this->my_graph),
+ static_cast<receiver<input_type> *>(this),
+ static_cast<sender<output_type> *>(this) );
+ }
+
+
+protected:
+ void reset_node( reset_flags f) override {
+ base_type::reset_node(f);
+ }
+}; // queue_node
+
+//! Forwards messages in sequence order
+template <typename T>
+class sequencer_node : public queue_node<T> {
+ function_body< T, size_t > *my_sequencer;
+ // my_sequencer should be a benign function and must be callable
+ // from a parallel context. Does this mean it needn't be reset?
+public:
+ typedef T input_type;
+ typedef T output_type;
+ typedef typename receiver<input_type>::predecessor_type predecessor_type;
+ typedef typename sender<output_type>::successor_type successor_type;
+
+ //! Constructor
+ template< typename Sequencer >
+ __TBB_NOINLINE_SYM sequencer_node( graph &g, const Sequencer& s ) : queue_node<T>(g),
+ my_sequencer(new function_body_leaf< T, size_t, Sequencer>(s) ) {
+ fgt_node( CODEPTR(), FLOW_SEQUENCER_NODE, &(this->my_graph),
+ static_cast<receiver<input_type> *>(this),
+ static_cast<sender<output_type> *>(this) );
+ }
+
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+ template <typename Sequencer, typename... Args>
+ sequencer_node( const node_set<Args...>& nodes, const Sequencer& s)
+ : sequencer_node(nodes.graph_reference(), s) {
+ make_edges_in_order(nodes, *this);
+ }
+#endif
+
+ //! Copy constructor
+ __TBB_NOINLINE_SYM sequencer_node( const sequencer_node& src ) : queue_node<T>(src),
+ my_sequencer( src.my_sequencer->clone() ) {
+ fgt_node( CODEPTR(), FLOW_SEQUENCER_NODE, &(this->my_graph),
+ static_cast<receiver<input_type> *>(this),
+ static_cast<sender<output_type> *>(this) );
+ }
+
+ //! Destructor
+ ~sequencer_node() { delete my_sequencer; }
+
+protected:
+ typedef typename buffer_node<T>::size_type size_type;
+ typedef typename buffer_node<T>::buffer_operation sequencer_operation;
+
+private:
+ bool internal_push(sequencer_operation *op) override {
+ size_type tag = (*my_sequencer)(*(op->elem));
+#if !TBB_DEPRECATED_SEQUENCER_DUPLICATES
+ if (tag < this->my_head) {
+ // have already emitted a message with this tag
+ op->status.store(FAILED, std::memory_order_release);
+ return false;
+ }
+#endif
+ // cannot modify this->my_tail now; the buffer would be inconsistent.
+ size_t new_tail = (tag+1 > this->my_tail) ? tag+1 : this->my_tail;
+
+ if (this->size(new_tail) > this->capacity()) {
+ this->grow_my_array(this->size(new_tail));
+ }
+ this->my_tail = new_tail;
+
+ const op_stat res = this->place_item(tag, *(op->elem)) ? SUCCEEDED : FAILED;
+ op->status.store(res, std::memory_order_release);
+        return res == SUCCEEDED;
+ }
+}; // sequencer_node
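+
+// Usage sketch for illustration only (the example_* names below are hypothetical and kept
+// under #if 0). The sequencer body maps each item to its position in the output sequence;
+// items are released strictly in order 0, 1, 2, ... even if they arrive late.
+#if 0
+struct example_work_item { std::size_t index; int payload; };
+
+inline void example_sequencer_node_usage() {
+    graph g;
+    sequencer_node<example_work_item> seq( g,
+        [](const example_work_item& w) -> std::size_t { return w.index; } );
+    function_node<example_work_item> sink( g, serial,
+        [](const example_work_item& /*w*/) { /* consume in sequence order */ } );
+    make_edge(seq, sink);
+    seq.try_put(example_work_item{1, 20});   // arrives out of order
+    seq.try_put(example_work_item{0, 10});   // released first
+    g.wait_for_all();
+}
+#endif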
+
+//! Forwards messages in priority order
+template<typename T, typename Compare = std::less<T>>
+class priority_queue_node : public buffer_node<T> {
+public:
+ typedef T input_type;
+ typedef T output_type;
+ typedef buffer_node<T> base_type;
+ typedef priority_queue_node class_type;
+ typedef typename receiver<input_type>::predecessor_type predecessor_type;
+ typedef typename sender<output_type>::successor_type successor_type;
+
+ //! Constructor
+ __TBB_NOINLINE_SYM explicit priority_queue_node( graph &g, const Compare& comp = Compare() )
+ : buffer_node<T>(g), compare(comp), mark(0) {
+ fgt_node( CODEPTR(), FLOW_PRIORITY_QUEUE_NODE, &(this->my_graph),
+ static_cast<receiver<input_type> *>(this),
+ static_cast<sender<output_type> *>(this) );
+ }
+
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+ template <typename... Args>
+ priority_queue_node(const node_set<Args...>& nodes, const Compare& comp = Compare())
+ : priority_queue_node(nodes.graph_reference(), comp) {
+ make_edges_in_order(nodes, *this);
+ }
+#endif
+
+ //! Copy constructor
+ __TBB_NOINLINE_SYM priority_queue_node( const priority_queue_node &src )
+ : buffer_node<T>(src), mark(0)
+ {
+ fgt_node( CODEPTR(), FLOW_PRIORITY_QUEUE_NODE, &(this->my_graph),
+ static_cast<receiver<input_type> *>(this),
+ static_cast<sender<output_type> *>(this) );
+ }
+
+protected:
+
+ void reset_node( reset_flags f) override {
+ mark = 0;
+ base_type::reset_node(f);
+ }
+
+ typedef typename buffer_node<T>::size_type size_type;
+ typedef typename buffer_node<T>::item_type item_type;
+ typedef typename buffer_node<T>::buffer_operation prio_operation;
+
+ //! Tries to forward valid items to successors
+ void internal_forward_task(prio_operation *op) override {
+ this->internal_forward_task_impl(op, this);
+ }
+
+ void handle_operations(prio_operation *op_list) override {
+ this->handle_operations_impl(op_list, this);
+ }
+
+ bool internal_push(prio_operation *op) override {
+ prio_push(*(op->elem));
+ op->status.store(SUCCEEDED, std::memory_order_release);
+ return true;
+ }
+
+ void internal_pop(prio_operation *op) override {
+ // if empty or already reserved, don't pop
+ if ( this->my_reserved == true || this->my_tail == 0 ) {
+ op->status.store(FAILED, std::memory_order_release);
+ return;
+ }
+
+ *(op->elem) = prio();
+ op->status.store(SUCCEEDED, std::memory_order_release);
+ prio_pop();
+
+ }
+
+ // pops the highest-priority item, saves copy
+ void internal_reserve(prio_operation *op) override {
+ if (this->my_reserved == true || this->my_tail == 0) {
+ op->status.store(FAILED, std::memory_order_release);
+ return;
+ }
+ this->my_reserved = true;
+ *(op->elem) = prio();
+ reserved_item = *(op->elem);
+ op->status.store(SUCCEEDED, std::memory_order_release);
+ prio_pop();
+ }
+
+ void internal_consume(prio_operation *op) override {
+ op->status.store(SUCCEEDED, std::memory_order_release);
+ this->my_reserved = false;
+ reserved_item = input_type();
+ }
+
+ void internal_release(prio_operation *op) override {
+ op->status.store(SUCCEEDED, std::memory_order_release);
+ prio_push(reserved_item);
+ this->my_reserved = false;
+ reserved_item = input_type();
+ }
+
+private:
+ template<typename> friend class buffer_node;
+
+ void order() {
+ if (mark < this->my_tail) heapify();
+ __TBB_ASSERT(mark == this->my_tail, "mark unequal after heapify");
+ }
+
+ bool is_item_valid() {
+ return this->my_tail > 0;
+ }
+
+ void try_put_and_add_task(graph_task*& last_task) {
+ graph_task * new_task = this->my_successors.try_put_task(this->prio());
+ if (new_task) {
+ // workaround for icc bug
+ graph& graph_ref = this->graph_reference();
+ last_task = combine_tasks(graph_ref, last_task, new_task);
+ prio_pop();
+ }
+ }
+
+private:
+ Compare compare;
+ size_type mark;
+
+ input_type reserved_item;
+
+    // In case a reheap has not been done after a push, check whether the most recently pushed (tail) item has higher priority than item 0.
+ bool prio_use_tail() {
+ __TBB_ASSERT(mark <= this->my_tail, "mark outside bounds before test");
+ return mark < this->my_tail && compare(this->get_my_item(0), this->get_my_item(this->my_tail - 1));
+ }
+
+ // prio_push: checks that the item will fit, expand array if necessary, put at end
+ void prio_push(const T &src) {
+ if ( this->my_tail >= this->my_array_size )
+ this->grow_my_array( this->my_tail + 1 );
+ (void) this->place_item(this->my_tail, src);
+ ++(this->my_tail);
+ __TBB_ASSERT(mark < this->my_tail, "mark outside bounds after push");
+ }
+
+    // prio_pop: deletes the highest-priority item from the array. If that item is item 0,
+    // moves the last item to position 0 and reheaps; if it is the last item in the array,
+    // just destroys it and decrements tail and mark. Assumes the array has already been
+    // tested for emptiness; cannot fail.
+ void prio_pop() {
+ if (prio_use_tail()) {
+ // there are newly pushed elements; last one higher than top
+ // copy the data
+ this->destroy_item(this->my_tail-1);
+ --(this->my_tail);
+ __TBB_ASSERT(mark <= this->my_tail, "mark outside bounds after pop");
+ return;
+ }
+ this->destroy_item(0);
+ if(this->my_tail > 1) {
+ // push the last element down heap
+ __TBB_ASSERT(this->my_item_valid(this->my_tail - 1), NULL);
+ this->move_item(0,this->my_tail - 1);
+ }
+ --(this->my_tail);
+ if(mark > this->my_tail) --mark;
+ if (this->my_tail > 1) // don't reheap for heap of size 1
+ reheap();
+ __TBB_ASSERT(mark <= this->my_tail, "mark outside bounds after pop");
+ }
+
+ const T& prio() {
+ return this->get_my_item(prio_use_tail() ? this->my_tail-1 : 0);
+ }
+
+ // turn array into heap
+ void heapify() {
+ if(this->my_tail == 0) {
+ mark = 0;
+ return;
+ }
+ if (!mark) mark = 1;
+ for (; mark<this->my_tail; ++mark) { // for each unheaped element
+ size_type cur_pos = mark;
+ input_type to_place;
+ this->fetch_item(mark,to_place);
+ do { // push to_place up the heap
+ size_type parent = (cur_pos-1)>>1;
+ if (!compare(this->get_my_item(parent), to_place))
+ break;
+ this->move_item(cur_pos, parent);
+ cur_pos = parent;
+ } while( cur_pos );
+ (void) this->place_item(cur_pos, to_place);
+ }
+ }
+
+    // Given an otherwise-heapified array with a new root element, sift the root down to restore the heap.
+ void reheap() {
+ size_type cur_pos=0, child=1;
+ while (child < mark) {
+ size_type target = child;
+ if (child+1<mark &&
+ compare(this->get_my_item(child),
+ this->get_my_item(child+1)))
+ ++target;
+ // target now has the higher priority child
+ if (compare(this->get_my_item(target),
+ this->get_my_item(cur_pos)))
+ break;
+ // swap
+ this->swap_items(cur_pos, target);
+ cur_pos = target;
+ child = (cur_pos<<1)+1;
+ }
+ }
+}; // priority_queue_node
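+
+// Usage sketch for illustration only (the example_* helper below is hypothetical and kept
+// under #if 0). With the default std::less comparator the largest item is forwarded first;
+// supplying a different Compare (here std::greater) reverses that order.
+#if 0
+inline void example_priority_queue_node_usage() {
+    graph g;
+    priority_queue_node< int, std::greater<int> > min_first(g);   // smallest item first
+    function_node<int> sink( g, serial, [](int /*v*/) { /* consume */ } );
+    make_edge(min_first, sink);
+    min_first.try_put(3);
+    min_first.try_put(1);
+    g.wait_for_all();
+}
+#endif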
+
+//! Forwards messages only if the threshold has not been reached
+/** This node forwards items until its threshold is reached.
+ It contains no buffering. If the downstream node rejects, the
+ message is dropped. */
+template< typename T, typename DecrementType=continue_msg >
+class limiter_node : public graph_node, public receiver< T >, public sender< T > {
+public:
+ typedef T input_type;
+ typedef T output_type;
+ typedef typename receiver<input_type>::predecessor_type predecessor_type;
+ typedef typename sender<output_type>::successor_type successor_type;
+    // TODO: Predefined types for the controlling "decrementer" port are lacking; this should be fixed later.
+
+private:
+ size_t my_threshold;
+ size_t my_count; // number of successful puts
+ size_t my_tries; // number of active put attempts
+ reservable_predecessor_cache< T, spin_mutex > my_predecessors;
+ spin_mutex my_mutex;
+ broadcast_cache< T > my_successors;
+
+ //! The internal receiver< DecrementType > that adjusts the count
+ threshold_regulator< limiter_node<T, DecrementType>, DecrementType > decrement;
+
+ graph_task* decrement_counter( long long delta ) {
+ {
+ spin_mutex::scoped_lock lock(my_mutex);
+ if( delta > 0 && size_t(delta) > my_count )
+ my_count = 0;
+ else if( delta < 0 && size_t(delta) > my_threshold - my_count )
+ my_count = my_threshold;
+ else
+ my_count -= size_t(delta); // absolute value of delta is sufficiently small
+ }
+ return forward_task();
+ }
+
+ // Let threshold_regulator call decrement_counter()
+ friend class threshold_regulator< limiter_node<T, DecrementType>, DecrementType >;
+
+ friend class forward_task_bypass< limiter_node<T,DecrementType> >;
+
+ bool check_conditions() { // always called under lock
+ return ( my_count + my_tries < my_threshold && !my_predecessors.empty() && !my_successors.empty() );
+ }
+
+ // only returns a valid task pointer or NULL, never SUCCESSFULLY_ENQUEUED
+ graph_task* forward_task() {
+ input_type v;
+ graph_task* rval = NULL;
+ bool reserved = false;
+ {
+ spin_mutex::scoped_lock lock(my_mutex);
+ if ( check_conditions() )
+ ++my_tries;
+ else
+ return NULL;
+ }
+
+        // SUCCESS
+        // If we can reserve and can put, we consume the reservation,
+        // increment the count, and decrement the tries.
+ if ( (my_predecessors.try_reserve(v)) == true ){
+ reserved=true;
+ if ( (rval = my_successors.try_put_task(v)) != NULL ){
+ {
+ spin_mutex::scoped_lock lock(my_mutex);
+ ++my_count;
+ --my_tries;
+ my_predecessors.try_consume();
+ if ( check_conditions() ) {
+ if ( is_graph_active(this->my_graph) ) {
+ typedef forward_task_bypass<limiter_node<T, DecrementType>> task_type;
+ small_object_allocator allocator{};
+ graph_task* rtask = allocator.new_object<task_type>( my_graph, allocator, *this );
+ my_graph.reserve_wait();
+ spawn_in_graph_arena(graph_reference(), *rtask);
+ }
+ }
+ }
+ return rval;
+ }
+ }
+        // FAILURE
+        // If we can't reserve, we decrement the tries.
+        // If we can reserve but can't put, we decrement the tries and release the reservation.
+ {
+ spin_mutex::scoped_lock lock(my_mutex);
+ --my_tries;
+ if (reserved) my_predecessors.try_release();
+ if ( check_conditions() ) {
+ if ( is_graph_active(this->my_graph) ) {
+ small_object_allocator allocator{};
+ typedef forward_task_bypass<limiter_node<T, DecrementType>> task_type;
+ graph_task* t = allocator.new_object<task_type>(my_graph, allocator, *this);
+ my_graph.reserve_wait();
+ __TBB_ASSERT(!rval, "Have two tasks to handle");
+ return t;
+ }
+ }
+ return rval;
+ }
+ }
+
+ void initialize() {
+ fgt_node(
+ CODEPTR(), FLOW_LIMITER_NODE, &this->my_graph,
+ static_cast<receiver<input_type> *>(this), static_cast<receiver<DecrementType> *>(&decrement),
+ static_cast<sender<output_type> *>(this)
+ );
+ }
+
+public:
+ //! Constructor
+ limiter_node(graph &g, size_t threshold)
+ : graph_node(g), my_threshold(threshold), my_count(0), my_tries(0), my_predecessors(this)
+ , my_successors(this), decrement(this)
+ {
+ initialize();
+ }
+
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+ template <typename... Args>
+ limiter_node(const node_set<Args...>& nodes, size_t threshold)
+ : limiter_node(nodes.graph_reference(), threshold) {
+ make_edges_in_order(nodes, *this);
+ }
+#endif
+
+ //! Copy constructor
+ limiter_node( const limiter_node& src ) : limiter_node(src.my_graph, src.my_threshold) {}
+
+ //! The interface for accessing internal receiver< DecrementType > that adjusts the count
+ receiver<DecrementType>& decrementer() { return decrement; }
+
+    //! Adds a successor to this node; may spawn a forwarding task if capacity is available
+ bool register_successor( successor_type &r ) override {
+ spin_mutex::scoped_lock lock(my_mutex);
+ bool was_empty = my_successors.empty();
+ my_successors.register_successor(r);
+        // spawn a forward task if this is the only successor
+ if ( was_empty && !my_predecessors.empty() && my_count + my_tries < my_threshold ) {
+ if ( is_graph_active(this->my_graph) ) {
+ small_object_allocator allocator{};
+ typedef forward_task_bypass<limiter_node<T, DecrementType>> task_type;
+ graph_task* t = allocator.new_object<task_type>(my_graph, allocator, *this);
+ my_graph.reserve_wait();
+ spawn_in_graph_arena(graph_reference(), *t);
+ }
+ }
+ return true;
+ }
+
+ //! Removes a successor from this node
+ /** r.remove_predecessor(*this) is also called. */
+ bool remove_successor( successor_type &r ) override {
+ // TODO revamp: investigate why qualification is needed for remove_predecessor() call
+ tbb::detail::d1::remove_predecessor(r, *this);
+ my_successors.remove_successor(r);
+ return true;
+ }
+
+ //! Adds src to the list of cached predecessors.
+ bool register_predecessor( predecessor_type &src ) override {
+ spin_mutex::scoped_lock lock(my_mutex);
+ my_predecessors.add( src );
+ if ( my_count + my_tries < my_threshold && !my_successors.empty() && is_graph_active(this->my_graph) ) {
+ small_object_allocator allocator{};
+ typedef forward_task_bypass<limiter_node<T, DecrementType>> task_type;
+ graph_task* t = allocator.new_object<task_type>(my_graph, allocator, *this);
+ my_graph.reserve_wait();
+ spawn_in_graph_arena(graph_reference(), *t);
+ }
+ return true;
+ }
+
+ //! Removes src from the list of cached predecessors.
+ bool remove_predecessor( predecessor_type &src ) override {
+ my_predecessors.remove( src );
+ return true;
+ }
+
+protected:
+
+ template< typename R, typename B > friend class run_and_put_task;
+ template<typename X, typename Y> friend class broadcast_cache;
+ template<typename X, typename Y> friend class round_robin_cache;
+ //! Puts an item to this receiver
+ graph_task* try_put_task( const T &t ) override {
+ {
+ spin_mutex::scoped_lock lock(my_mutex);
+ if ( my_count + my_tries >= my_threshold )
+ return NULL;
+ else
+ ++my_tries;
+ }
+
+ graph_task* rtask = my_successors.try_put_task(t);
+
+ if ( !rtask ) { // try_put_task failed.
+ spin_mutex::scoped_lock lock(my_mutex);
+ --my_tries;
+ if (check_conditions() && is_graph_active(this->my_graph)) {
+ small_object_allocator allocator{};
+ typedef forward_task_bypass<limiter_node<T, DecrementType>> task_type;
+ rtask = allocator.new_object<task_type>(my_graph, allocator, *this);
+ my_graph.reserve_wait();
+ }
+ }
+ else {
+ spin_mutex::scoped_lock lock(my_mutex);
+ ++my_count;
+ --my_tries;
+ }
+ return rtask;
+ }
+
+ graph& graph_reference() const override { return my_graph; }
+
+ void reset_node( reset_flags f) override {
+ my_count = 0;
+ if(f & rf_clear_edges) {
+ my_predecessors.clear();
+ my_successors.clear();
+ }
+ else
+ {
+ my_predecessors.reset( );
+ }
+ decrement.reset_receiver(f);
+ }
+}; // limiter_node
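+
+// Usage sketch for illustration only (the example_* helper below is hypothetical and kept
+// under #if 0). limiter_node passes at most `threshold` messages downstream; sending a
+// continue_msg to decrementer() frees one unit of capacity, typically from the node that
+// finishes the work.
+#if 0
+inline void example_limiter_node_usage() {
+    graph g;
+    buffer_node<int> source(g);
+    limiter_node<int> limiter(g, 2);                   // at most two items in flight
+    function_node<int, continue_msg> worker( g, unlimited,
+        [](int /*v*/) { /* process */ return continue_msg(); } );
+    make_edge(source, limiter);
+    make_edge(limiter, worker);
+    make_edge(worker, limiter.decrementer());          // completion restores capacity
+    for (int i = 0; i < 10; ++i) source.try_put(i);
+    g.wait_for_all();
+}
+#endif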
+
+#include "detail/_flow_graph_join_impl.h"
+
+template<typename OutputTuple, typename JP=queueing> class join_node;
+
+template<typename OutputTuple>
+class join_node<OutputTuple,reserving>: public unfolded_join_node<std::tuple_size<OutputTuple>::value, reserving_port, OutputTuple, reserving> {
+private:
+ static const int N = std::tuple_size<OutputTuple>::value;
+ typedef unfolded_join_node<N, reserving_port, OutputTuple, reserving> unfolded_type;
+public:
+ typedef OutputTuple output_type;
+ typedef typename unfolded_type::input_ports_type input_ports_type;
+ __TBB_NOINLINE_SYM explicit join_node(graph &g) : unfolded_type(g) {
+ fgt_multiinput_node<N>( CODEPTR(), FLOW_JOIN_NODE_RESERVING, &this->my_graph,
+ this->input_ports(), static_cast< sender< output_type > *>(this) );
+ }
+
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+ template <typename... Args>
+ __TBB_NOINLINE_SYM join_node(const node_set<Args...>& nodes, reserving = reserving()) : join_node(nodes.graph_reference()) {
+ make_edges_in_order(nodes, *this);
+ }
+#endif
+
+ __TBB_NOINLINE_SYM join_node(const join_node &other) : unfolded_type(other) {
+ fgt_multiinput_node<N>( CODEPTR(), FLOW_JOIN_NODE_RESERVING, &this->my_graph,
+ this->input_ports(), static_cast< sender< output_type > *>(this) );
+ }
+
+};
+
+template<typename OutputTuple>
+class join_node<OutputTuple,queueing>: public unfolded_join_node<std::tuple_size<OutputTuple>::value, queueing_port, OutputTuple, queueing> {
+private:
+ static const int N = std::tuple_size<OutputTuple>::value;
+ typedef unfolded_join_node<N, queueing_port, OutputTuple, queueing> unfolded_type;
+public:
+ typedef OutputTuple output_type;
+ typedef typename unfolded_type::input_ports_type input_ports_type;
+ __TBB_NOINLINE_SYM explicit join_node(graph &g) : unfolded_type(g) {
+ fgt_multiinput_node<N>( CODEPTR(), FLOW_JOIN_NODE_QUEUEING, &this->my_graph,
+ this->input_ports(), static_cast< sender< output_type > *>(this) );
+ }
+
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+ template <typename... Args>
+ __TBB_NOINLINE_SYM join_node(const node_set<Args...>& nodes, queueing = queueing()) : join_node(nodes.graph_reference()) {
+ make_edges_in_order(nodes, *this);
+ }
+#endif
+
+ __TBB_NOINLINE_SYM join_node(const join_node &other) : unfolded_type(other) {
+ fgt_multiinput_node<N>( CODEPTR(), FLOW_JOIN_NODE_QUEUEING, &this->my_graph,
+ this->input_ports(), static_cast< sender< output_type > *>(this) );
+ }
+
+};
+
+// template for key_matching join_node
+// tag_matching join_node is a specialization of key_matching, and is source-compatible.
+template<typename OutputTuple, typename K, typename KHash>
+class join_node<OutputTuple, key_matching<K, KHash> > : public unfolded_join_node<std::tuple_size<OutputTuple>::value,
+ key_matching_port, OutputTuple, key_matching<K,KHash> > {
+private:
+ static const int N = std::tuple_size<OutputTuple>::value;
+ typedef unfolded_join_node<N, key_matching_port, OutputTuple, key_matching<K,KHash> > unfolded_type;
+public:
+ typedef OutputTuple output_type;
+ typedef typename unfolded_type::input_ports_type input_ports_type;
+
+#if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING
+ join_node(graph &g) : unfolded_type(g) {}
+#endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */
+
+ template<typename __TBB_B0, typename __TBB_B1>
+ __TBB_NOINLINE_SYM join_node(graph &g, __TBB_B0 b0, __TBB_B1 b1) : unfolded_type(g, b0, b1) {
+ fgt_multiinput_node<N>( CODEPTR(), FLOW_JOIN_NODE_TAG_MATCHING, &this->my_graph,
+ this->input_ports(), static_cast< sender< output_type > *>(this) );
+ }
+ template<typename __TBB_B0, typename __TBB_B1, typename __TBB_B2>
+ __TBB_NOINLINE_SYM join_node(graph &g, __TBB_B0 b0, __TBB_B1 b1, __TBB_B2 b2) : unfolded_type(g, b0, b1, b2) {
+ fgt_multiinput_node<N>( CODEPTR(), FLOW_JOIN_NODE_TAG_MATCHING, &this->my_graph,
+ this->input_ports(), static_cast< sender< output_type > *>(this) );
+ }
+ template<typename __TBB_B0, typename __TBB_B1, typename __TBB_B2, typename __TBB_B3>
+ __TBB_NOINLINE_SYM join_node(graph &g, __TBB_B0 b0, __TBB_B1 b1, __TBB_B2 b2, __TBB_B3 b3) : unfolded_type(g, b0, b1, b2, b3) {
+ fgt_multiinput_node<N>( CODEPTR(), FLOW_JOIN_NODE_TAG_MATCHING, &this->my_graph,
+ this->input_ports(), static_cast< sender< output_type > *>(this) );
+ }
+ template<typename __TBB_B0, typename __TBB_B1, typename __TBB_B2, typename __TBB_B3, typename __TBB_B4>
+ __TBB_NOINLINE_SYM join_node(graph &g, __TBB_B0 b0, __TBB_B1 b1, __TBB_B2 b2, __TBB_B3 b3, __TBB_B4 b4) :
+ unfolded_type(g, b0, b1, b2, b3, b4) {
+ fgt_multiinput_node<N>( CODEPTR(), FLOW_JOIN_NODE_TAG_MATCHING, &this->my_graph,
+ this->input_ports(), static_cast< sender< output_type > *>(this) );
+ }
+#if __TBB_VARIADIC_MAX >= 6
+ template<typename __TBB_B0, typename __TBB_B1, typename __TBB_B2, typename __TBB_B3, typename __TBB_B4,
+ typename __TBB_B5>
+ __TBB_NOINLINE_SYM join_node(graph &g, __TBB_B0 b0, __TBB_B1 b1, __TBB_B2 b2, __TBB_B3 b3, __TBB_B4 b4, __TBB_B5 b5) :
+ unfolded_type(g, b0, b1, b2, b3, b4, b5) {
+ fgt_multiinput_node<N>( CODEPTR(), FLOW_JOIN_NODE_TAG_MATCHING, &this->my_graph,
+ this->input_ports(), static_cast< sender< output_type > *>(this) );
+ }
+#endif
+#if __TBB_VARIADIC_MAX >= 7
+ template<typename __TBB_B0, typename __TBB_B1, typename __TBB_B2, typename __TBB_B3, typename __TBB_B4,
+ typename __TBB_B5, typename __TBB_B6>
+ __TBB_NOINLINE_SYM join_node(graph &g, __TBB_B0 b0, __TBB_B1 b1, __TBB_B2 b2, __TBB_B3 b3, __TBB_B4 b4, __TBB_B5 b5, __TBB_B6 b6) :
+ unfolded_type(g, b0, b1, b2, b3, b4, b5, b6) {
+ fgt_multiinput_node<N>( CODEPTR(), FLOW_JOIN_NODE_TAG_MATCHING, &this->my_graph,
+ this->input_ports(), static_cast< sender< output_type > *>(this) );
+ }
+#endif
+#if __TBB_VARIADIC_MAX >= 8
+ template<typename __TBB_B0, typename __TBB_B1, typename __TBB_B2, typename __TBB_B3, typename __TBB_B4,
+ typename __TBB_B5, typename __TBB_B6, typename __TBB_B7>
+ __TBB_NOINLINE_SYM join_node(graph &g, __TBB_B0 b0, __TBB_B1 b1, __TBB_B2 b2, __TBB_B3 b3, __TBB_B4 b4, __TBB_B5 b5, __TBB_B6 b6,
+ __TBB_B7 b7) : unfolded_type(g, b0, b1, b2, b3, b4, b5, b6, b7) {
+ fgt_multiinput_node<N>( CODEPTR(), FLOW_JOIN_NODE_TAG_MATCHING, &this->my_graph,
+ this->input_ports(), static_cast< sender< output_type > *>(this) );
+ }
+#endif
+#if __TBB_VARIADIC_MAX >= 9
+ template<typename __TBB_B0, typename __TBB_B1, typename __TBB_B2, typename __TBB_B3, typename __TBB_B4,
+ typename __TBB_B5, typename __TBB_B6, typename __TBB_B7, typename __TBB_B8>
+ __TBB_NOINLINE_SYM join_node(graph &g, __TBB_B0 b0, __TBB_B1 b1, __TBB_B2 b2, __TBB_B3 b3, __TBB_B4 b4, __TBB_B5 b5, __TBB_B6 b6,
+ __TBB_B7 b7, __TBB_B8 b8) : unfolded_type(g, b0, b1, b2, b3, b4, b5, b6, b7, b8) {
+ fgt_multiinput_node<N>( CODEPTR(), FLOW_JOIN_NODE_TAG_MATCHING, &this->my_graph,
+ this->input_ports(), static_cast< sender< output_type > *>(this) );
+ }
+#endif
+#if __TBB_VARIADIC_MAX >= 10
+ template<typename __TBB_B0, typename __TBB_B1, typename __TBB_B2, typename __TBB_B3, typename __TBB_B4,
+ typename __TBB_B5, typename __TBB_B6, typename __TBB_B7, typename __TBB_B8, typename __TBB_B9>
+ __TBB_NOINLINE_SYM join_node(graph &g, __TBB_B0 b0, __TBB_B1 b1, __TBB_B2 b2, __TBB_B3 b3, __TBB_B4 b4, __TBB_B5 b5, __TBB_B6 b6,
+ __TBB_B7 b7, __TBB_B8 b8, __TBB_B9 b9) : unfolded_type(g, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9) {
+ fgt_multiinput_node<N>( CODEPTR(), FLOW_JOIN_NODE_TAG_MATCHING, &this->my_graph,
+ this->input_ports(), static_cast< sender< output_type > *>(this) );
+ }
+#endif
+
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+ template <
+#if (__clang_major__ == 3 && __clang_minor__ == 4)
+        // clang 3.4 fails to deduce 'Args...' for 'node_set', but copes with an explicit template template parameter.
+ template<typename...> class node_set,
+#endif
+ typename... Args, typename... Bodies
+ >
+ __TBB_NOINLINE_SYM join_node(const node_set<Args...>& nodes, Bodies... bodies)
+ : join_node(nodes.graph_reference(), bodies...) {
+ make_edges_in_order(nodes, *this);
+ }
+#endif
+
+ __TBB_NOINLINE_SYM join_node(const join_node &other) : unfolded_type(other) {
+ fgt_multiinput_node<N>( CODEPTR(), FLOW_JOIN_NODE_TAG_MATCHING, &this->my_graph,
+ this->input_ports(), static_cast< sender< output_type > *>(this) );
+ }
+
+};
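+
+// Illustrative usage sketch (names below resolve in this namespace; user code spells them
+// tbb::flow::...). Each constructor argument of a key_matching join_node is a body that
+// extracts the matching key from the corresponding input; 'record' is a made-up type used
+// only for this example.
+inline void key_matching_usage_sketch() {
+    struct record { int id; double payload; };
+    graph g;
+    join_node< std::tuple<record, record>, key_matching<int> >
+        matcher(g, [](const record& r) { return r.id; },    // key body for port 0
+                   [](const record& r) { return r.id; });   // key body for port 1
+    // items arriving on the two input ports are combined when their keys compare equal
+    input_port<0>(matcher).try_put(record{1, 0.5});
+    input_port<1>(matcher).try_put(record{1, 1.5});
+    g.wait_for_all();
+}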
+
+// indexer node
+#include "detail/_flow_graph_indexer_impl.h"
+
+// TODO: Implement interface with variadic template or tuple
+template<typename T0, typename T1=null_type, typename T2=null_type, typename T3=null_type,
+ typename T4=null_type, typename T5=null_type, typename T6=null_type,
+ typename T7=null_type, typename T8=null_type, typename T9=null_type> class indexer_node;
+
+//indexer node specializations
+template<typename T0>
+class indexer_node<T0> : public unfolded_indexer_node<std::tuple<T0> > {
+private:
+ static const int N = 1;
+public:
+ typedef std::tuple<T0> InputTuple;
+ typedef tagged_msg<size_t, T0> output_type;
+ typedef unfolded_indexer_node<InputTuple> unfolded_type;
+ __TBB_NOINLINE_SYM indexer_node(graph& g) : unfolded_type(g) {
+ fgt_multiinput_node<N>( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph,
+ this->input_ports(), static_cast< sender< output_type > *>(this) );
+ }
+
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+ template <typename... Args>
+ indexer_node(const node_set<Args...>& nodes) : indexer_node(nodes.graph_reference()) {
+ make_edges_in_order(nodes, *this);
+ }
+#endif
+
+ // Copy constructor
+ __TBB_NOINLINE_SYM indexer_node( const indexer_node& other ) : unfolded_type(other) {
+ fgt_multiinput_node<N>( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph,
+ this->input_ports(), static_cast< sender< output_type > *>(this) );
+ }
+};
+
+template<typename T0, typename T1>
+class indexer_node<T0, T1> : public unfolded_indexer_node<std::tuple<T0, T1> > {
+private:
+ static const int N = 2;
+public:
+ typedef std::tuple<T0, T1> InputTuple;
+ typedef tagged_msg<size_t, T0, T1> output_type;
+ typedef unfolded_indexer_node<InputTuple> unfolded_type;
+ __TBB_NOINLINE_SYM indexer_node(graph& g) : unfolded_type(g) {
+ fgt_multiinput_node<N>( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph,
+ this->input_ports(), static_cast< sender< output_type > *>(this) );
+ }
+
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+ template <typename... Args>
+ indexer_node(const node_set<Args...>& nodes) : indexer_node(nodes.graph_reference()) {
+ make_edges_in_order(nodes, *this);
+ }
+#endif
+
+ // Copy constructor
+ __TBB_NOINLINE_SYM indexer_node( const indexer_node& other ) : unfolded_type(other) {
+ fgt_multiinput_node<N>( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph,
+ this->input_ports(), static_cast< sender< output_type > *>(this) );
+ }
+
+};
+
+template<typename T0, typename T1, typename T2>
+class indexer_node<T0, T1, T2> : public unfolded_indexer_node<std::tuple<T0, T1, T2> > {
+private:
+ static const int N = 3;
+public:
+ typedef std::tuple<T0, T1, T2> InputTuple;
+ typedef tagged_msg<size_t, T0, T1, T2> output_type;
+ typedef unfolded_indexer_node<InputTuple> unfolded_type;
+ __TBB_NOINLINE_SYM indexer_node(graph& g) : unfolded_type(g) {
+ fgt_multiinput_node<N>( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph,
+ this->input_ports(), static_cast< sender< output_type > *>(this) );
+ }
+
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+ template <typename... Args>
+ indexer_node(const node_set<Args...>& nodes) : indexer_node(nodes.graph_reference()) {
+ make_edges_in_order(nodes, *this);
+ }
+#endif
+
+ // Copy constructor
+ __TBB_NOINLINE_SYM indexer_node( const indexer_node& other ) : unfolded_type(other) {
+ fgt_multiinput_node<N>( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph,
+ this->input_ports(), static_cast< sender< output_type > *>(this) );
+ }
+
+};
+
+template<typename T0, typename T1, typename T2, typename T3>
+class indexer_node<T0, T1, T2, T3> : public unfolded_indexer_node<std::tuple<T0, T1, T2, T3> > {
+private:
+ static const int N = 4;
+public:
+ typedef std::tuple<T0, T1, T2, T3> InputTuple;
+ typedef tagged_msg<size_t, T0, T1, T2, T3> output_type;
+ typedef unfolded_indexer_node<InputTuple> unfolded_type;
+ __TBB_NOINLINE_SYM indexer_node(graph& g) : unfolded_type(g) {
+ fgt_multiinput_node<N>( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph,
+ this->input_ports(), static_cast< sender< output_type > *>(this) );
+ }
+
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+ template <typename... Args>
+ indexer_node(const node_set<Args...>& nodes) : indexer_node(nodes.graph_reference()) {
+ make_edges_in_order(nodes, *this);
+ }
+#endif
+
+ // Copy constructor
+ __TBB_NOINLINE_SYM indexer_node( const indexer_node& other ) : unfolded_type(other) {
+ fgt_multiinput_node<N>( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph,
+ this->input_ports(), static_cast< sender< output_type > *>(this) );
+ }
+
+};
+
+template<typename T0, typename T1, typename T2, typename T3, typename T4>
+class indexer_node<T0, T1, T2, T3, T4> : public unfolded_indexer_node<std::tuple<T0, T1, T2, T3, T4> > {
+private:
+ static const int N = 5;
+public:
+ typedef std::tuple<T0, T1, T2, T3, T4> InputTuple;
+ typedef tagged_msg<size_t, T0, T1, T2, T3, T4> output_type;
+ typedef unfolded_indexer_node<InputTuple> unfolded_type;
+ __TBB_NOINLINE_SYM indexer_node(graph& g) : unfolded_type(g) {
+ fgt_multiinput_node<N>( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph,
+ this->input_ports(), static_cast< sender< output_type > *>(this) );
+ }
+
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+ template <typename... Args>
+ indexer_node(const node_set<Args...>& nodes) : indexer_node(nodes.graph_reference()) {
+ make_edges_in_order(nodes, *this);
+ }
+#endif
+
+ // Copy constructor
+ __TBB_NOINLINE_SYM indexer_node( const indexer_node& other ) : unfolded_type(other) {
+ fgt_multiinput_node<N>( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph,
+ this->input_ports(), static_cast< sender< output_type > *>(this) );
+ }
+
+};
+
+#if __TBB_VARIADIC_MAX >= 6
+template<typename T0, typename T1, typename T2, typename T3, typename T4, typename T5>
+class indexer_node<T0, T1, T2, T3, T4, T5> : public unfolded_indexer_node<std::tuple<T0, T1, T2, T3, T4, T5> > {
+private:
+ static const int N = 6;
+public:
+ typedef std::tuple<T0, T1, T2, T3, T4, T5> InputTuple;
+ typedef tagged_msg<size_t, T0, T1, T2, T3, T4, T5> output_type;
+ typedef unfolded_indexer_node<InputTuple> unfolded_type;
+ __TBB_NOINLINE_SYM indexer_node(graph& g) : unfolded_type(g) {
+ fgt_multiinput_node<N>( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph,
+ this->input_ports(), static_cast< sender< output_type > *>(this) );
+ }
+
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+ template <typename... Args>
+ indexer_node(const node_set<Args...>& nodes) : indexer_node(nodes.graph_reference()) {
+ make_edges_in_order(nodes, *this);
+ }
+#endif
+
+ // Copy constructor
+ __TBB_NOINLINE_SYM indexer_node( const indexer_node& other ) : unfolded_type(other) {
+ fgt_multiinput_node<N>( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph,
+ this->input_ports(), static_cast< sender< output_type > *>(this) );
+ }
+
+};
+#endif //variadic max 6
+
+#if __TBB_VARIADIC_MAX >= 7
+template<typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6>
+class indexer_node<T0, T1, T2, T3, T4, T5, T6> : public unfolded_indexer_node<std::tuple<T0, T1, T2, T3, T4, T5, T6> > {
+private:
+ static const int N = 7;
+public:
+ typedef std::tuple<T0, T1, T2, T3, T4, T5, T6> InputTuple;
+ typedef tagged_msg<size_t, T0, T1, T2, T3, T4, T5, T6> output_type;
+ typedef unfolded_indexer_node<InputTuple> unfolded_type;
+ __TBB_NOINLINE_SYM indexer_node(graph& g) : unfolded_type(g) {
+ fgt_multiinput_node<N>( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph,
+ this->input_ports(), static_cast< sender< output_type > *>(this) );
+ }
+
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+ template <typename... Args>
+ indexer_node(const node_set<Args...>& nodes) : indexer_node(nodes.graph_reference()) {
+ make_edges_in_order(nodes, *this);
+ }
+#endif
+
+ // Copy constructor
+ __TBB_NOINLINE_SYM indexer_node( const indexer_node& other ) : unfolded_type(other) {
+ fgt_multiinput_node<N>( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph,
+ this->input_ports(), static_cast< sender< output_type > *>(this) );
+ }
+
+};
+#endif //variadic max 7
+
+#if __TBB_VARIADIC_MAX >= 8
+template<typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7>
+class indexer_node<T0, T1, T2, T3, T4, T5, T6, T7> : public unfolded_indexer_node<std::tuple<T0, T1, T2, T3, T4, T5, T6, T7> > {
+private:
+ static const int N = 8;
+public:
+ typedef std::tuple<T0, T1, T2, T3, T4, T5, T6, T7> InputTuple;
+ typedef tagged_msg<size_t, T0, T1, T2, T3, T4, T5, T6, T7> output_type;
+ typedef unfolded_indexer_node<InputTuple> unfolded_type;
+ indexer_node(graph& g) : unfolded_type(g) {
+ fgt_multiinput_node<N>( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph,
+ this->input_ports(), static_cast< sender< output_type > *>(this) );
+ }
+
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+ template <typename... Args>
+ indexer_node(const node_set<Args...>& nodes) : indexer_node(nodes.graph_reference()) {
+ make_edges_in_order(nodes, *this);
+ }
+#endif
+
+ // Copy constructor
+ indexer_node( const indexer_node& other ) : unfolded_type(other) {
+ fgt_multiinput_node<N>( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph,
+ this->input_ports(), static_cast< sender< output_type > *>(this) );
+ }
+
+};
+#endif //variadic max 8
+
+#if __TBB_VARIADIC_MAX >= 9
+template<typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8>
+class indexer_node<T0, T1, T2, T3, T4, T5, T6, T7, T8> : public unfolded_indexer_node<std::tuple<T0, T1, T2, T3, T4, T5, T6, T7, T8> > {
+private:
+ static const int N = 9;
+public:
+ typedef std::tuple<T0, T1, T2, T3, T4, T5, T6, T7, T8> InputTuple;
+ typedef tagged_msg<size_t, T0, T1, T2, T3, T4, T5, T6, T7, T8> output_type;
+ typedef unfolded_indexer_node<InputTuple> unfolded_type;
+ __TBB_NOINLINE_SYM indexer_node(graph& g) : unfolded_type(g) {
+ fgt_multiinput_node<N>( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph,
+ this->input_ports(), static_cast< sender< output_type > *>(this) );
+ }
+
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+ template <typename... Args>
+ indexer_node(const node_set<Args...>& nodes) : indexer_node(nodes.graph_reference()) {
+ make_edges_in_order(nodes, *this);
+ }
+#endif
+
+ // Copy constructor
+ __TBB_NOINLINE_SYM indexer_node( const indexer_node& other ) : unfolded_type(other) {
+ fgt_multiinput_node<N>( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph,
+ this->input_ports(), static_cast< sender< output_type > *>(this) );
+ }
+
+};
+#endif //variadic max 9
+
+#if __TBB_VARIADIC_MAX >= 10
+template<typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9>
+class indexer_node/*default*/ : public unfolded_indexer_node<std::tuple<T0, T1, T2, T3, T4, T5, T6, T7, T8, T9> > {
+private:
+ static const int N = 10;
+public:
+ typedef std::tuple<T0, T1, T2, T3, T4, T5, T6, T7, T8, T9> InputTuple;
+ typedef tagged_msg<size_t, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9> output_type;
+ typedef unfolded_indexer_node<InputTuple> unfolded_type;
+ __TBB_NOINLINE_SYM indexer_node(graph& g) : unfolded_type(g) {
+ fgt_multiinput_node<N>( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph,
+ this->input_ports(), static_cast< sender< output_type > *>(this) );
+ }
+
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+ template <typename... Args>
+ indexer_node(const node_set<Args...>& nodes) : indexer_node(nodes.graph_reference()) {
+ make_edges_in_order(nodes, *this);
+ }
+#endif
+
+ // Copy constructor
+ __TBB_NOINLINE_SYM indexer_node( const indexer_node& other ) : unfolded_type(other) {
+ fgt_multiinput_node<N>( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph,
+ this->input_ports(), static_cast< sender< output_type > *>(this) );
+ }
+
+};
+#endif //variadic max 10
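+
+// Illustrative sketch of the indexer specializations above: every incoming message is
+// tagged with its input port number, and a downstream body dispatches on tag() and
+// extracts the value with cast_to. (Example only; user code reaches these names via
+// tbb::flow.)
+inline void indexer_usage_sketch() {
+    typedef indexer_node<int, float> indexer_t;
+    graph g;
+    indexer_t idx(g);
+    function_node<indexer_t::output_type, int> consumer(g, unlimited,
+        [](const indexer_t::output_type& msg) -> int {
+            if (msg.tag() == 0) return cast_to<int>(msg);     // came from port 0
+            return static_cast<int>(cast_to<float>(msg));     // came from port 1
+        });
+    make_edge(idx, consumer);
+    input_port<0>(idx).try_put(42);
+    input_port<1>(idx).try_put(1.5f);
+    g.wait_for_all();
+}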
+
+template< typename T >
+inline void internal_make_edge( sender<T> &p, receiver<T> &s ) {
+ register_successor(p, s);
+ fgt_make_edge( &p, &s );
+}
+
+//! Makes an edge between a single predecessor and a single successor
+template< typename T >
+inline void make_edge( sender<T> &p, receiver<T> &s ) {
+ internal_make_edge( p, s );
+}
+
+//Makes an edge from port 0 of a multi-output predecessor to port 0 of a multi-input successor.
+template< typename T, typename V,
+ typename = typename T::output_ports_type, typename = typename V::input_ports_type >
+inline void make_edge( T& output, V& input) {
+ make_edge(std::get<0>(output.output_ports()), std::get<0>(input.input_ports()));
+}
+
+//Makes an edge from port 0 of a multi-output predecessor to a receiver.
+template< typename T, typename R,
+ typename = typename T::output_ports_type >
+inline void make_edge( T& output, receiver<R>& input) {
+ make_edge(std::get<0>(output.output_ports()), input);
+}
+
+//Makes an edge from a sender to port 0 of a multi-input successor.
+template< typename S, typename V,
+ typename = typename V::input_ports_type >
+inline void make_edge( sender<S>& output, V& input) {
+ make_edge(output, std::get<0>(input.input_ports()));
+}
+
+template< typename T >
+inline void internal_remove_edge( sender<T> &p, receiver<T> &s ) {
+ remove_successor( p, s );
+ fgt_remove_edge( &p, &s );
+}
+
+//! Removes an edge between a single predecessor and a single successor
+template< typename T >
+inline void remove_edge( sender<T> &p, receiver<T> &s ) {
+ internal_remove_edge( p, s );
+}
+
+//Removes an edge between port 0 of a multi-output predecessor and port 0 of a multi-input successor.
+template< typename T, typename V,
+ typename = typename T::output_ports_type, typename = typename V::input_ports_type >
+inline void remove_edge( T& output, V& input) {
+ remove_edge(std::get<0>(output.output_ports()), std::get<0>(input.input_ports()));
+}
+
+//Removes an edge between port 0 of a multi-output predecessor and a receiver.
+template< typename T, typename R,
+ typename = typename T::output_ports_type >
+inline void remove_edge( T& output, receiver<R>& input) {
+ remove_edge(std::get<0>(output.output_ports()), input);
+}
+//Removes an edge between a sender and port 0 of a multi-input successor.
+template< typename S, typename V,
+ typename = typename V::input_ports_type >
+inline void remove_edge( sender<S>& output, V& input) {
+ remove_edge(output, std::get<0>(input.input_ports()));
+}
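+
+// Illustrative sketch of the convenience overloads above: port 0 is implied when a
+// multi-port node is passed directly, and an explicit port can still be selected with
+// output_port<N>/input_port<N>. removal mirrors the same overload set.
+inline void edge_helpers_usage_sketch() {
+    graph g;
+    broadcast_node< std::tuple<int, float> > src(g);
+    split_node< std::tuple<int, float> > splitter(g);
+    queue_node<int> ints(g);
+    queue_node<float> floats(g);
+
+    make_edge(src, splitter);                      // plain sender -> receiver
+    make_edge(splitter, ints);                     // output port 0 of splitter implied
+    make_edge(output_port<1>(splitter), floats);   // explicit port selection
+    remove_edge(splitter, ints);                   // symmetric removal
+}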
+
+//! Returns a copy of the body from a function or continue node
+template< typename Body, typename Node >
+Body copy_body( Node &n ) {
+ return n.template copy_function_object<Body>();
+}
+
+//composite_node
+template< typename InputTuple, typename OutputTuple > class composite_node;
+
+template< typename... InputTypes, typename... OutputTypes>
+class composite_node <std::tuple<InputTypes...>, std::tuple<OutputTypes...> > : public graph_node {
+
+public:
+ typedef std::tuple< receiver<InputTypes>&... > input_ports_type;
+ typedef std::tuple< sender<OutputTypes>&... > output_ports_type;
+
+private:
+ std::unique_ptr<input_ports_type> my_input_ports;
+ std::unique_ptr<output_ports_type> my_output_ports;
+
+ static const size_t NUM_INPUTS = sizeof...(InputTypes);
+ static const size_t NUM_OUTPUTS = sizeof...(OutputTypes);
+
+protected:
+ void reset_node(reset_flags) override {}
+
+public:
+ composite_node( graph &g ) : graph_node(g) {
+ fgt_multiinput_multioutput_node( CODEPTR(), FLOW_COMPOSITE_NODE, this, &this->my_graph );
+ }
+
+ template<typename T1, typename T2>
+ void set_external_ports(T1&& input_ports_tuple, T2&& output_ports_tuple) {
+ static_assert(NUM_INPUTS == std::tuple_size<input_ports_type>::value, "number of arguments does not match number of input ports");
+ static_assert(NUM_OUTPUTS == std::tuple_size<output_ports_type>::value, "number of arguments does not match number of output ports");
+
+ fgt_internal_input_alias_helper<T1, NUM_INPUTS>::alias_port( this, input_ports_tuple);
+ fgt_internal_output_alias_helper<T2, NUM_OUTPUTS>::alias_port( this, output_ports_tuple);
+
+ my_input_ports.reset( new input_ports_type(std::forward<T1>(input_ports_tuple)) );
+ my_output_ports.reset( new output_ports_type(std::forward<T2>(output_ports_tuple)) );
+ }
+
+ template< typename... NodeTypes >
+ void add_visible_nodes(const NodeTypes&... n) { add_nodes_impl(this, true, n...); }
+
+ template< typename... NodeTypes >
+ void add_nodes(const NodeTypes&... n) { add_nodes_impl(this, false, n...); }
+
+
+ input_ports_type& input_ports() {
+ __TBB_ASSERT(my_input_ports, "input ports not set, call set_external_ports to set input ports");
+ return *my_input_ports;
+ }
+
+ output_ports_type& output_ports() {
+ __TBB_ASSERT(my_output_ports, "output ports not set, call set_external_ports to set output ports");
+ return *my_output_ports;
+ }
+}; // class composite_node
+
+//composite_node with only input ports
+template< typename... InputTypes>
+class composite_node <std::tuple<InputTypes...>, std::tuple<> > : public graph_node {
+public:
+ typedef std::tuple< receiver<InputTypes>&... > input_ports_type;
+
+private:
+ std::unique_ptr<input_ports_type> my_input_ports;
+ static const size_t NUM_INPUTS = sizeof...(InputTypes);
+
+protected:
+ void reset_node(reset_flags) override {}
+
+public:
+ composite_node( graph &g ) : graph_node(g) {
+ fgt_composite( CODEPTR(), this, &g );
+ }
+
+ template<typename T>
+ void set_external_ports(T&& input_ports_tuple) {
+ static_assert(NUM_INPUTS == std::tuple_size<input_ports_type>::value, "number of arguments does not match number of input ports");
+
+ fgt_internal_input_alias_helper<T, NUM_INPUTS>::alias_port( this, input_ports_tuple);
+
+ my_input_ports.reset( new input_ports_type(std::forward<T>(input_ports_tuple)) );
+ }
+
+ template< typename... NodeTypes >
+ void add_visible_nodes(const NodeTypes&... n) { add_nodes_impl(this, true, n...); }
+
+ template< typename... NodeTypes >
+ void add_nodes( const NodeTypes&... n) { add_nodes_impl(this, false, n...); }
+
+
+ input_ports_type& input_ports() {
+ __TBB_ASSERT(my_input_ports, "input ports not set, call set_external_ports to set input ports");
+ return *my_input_ports;
+ }
+
+}; // class composite_node
+
+//composite_nodes with only output_ports
+template<typename... OutputTypes>
+class composite_node <std::tuple<>, std::tuple<OutputTypes...> > : public graph_node {
+public:
+ typedef std::tuple< sender<OutputTypes>&... > output_ports_type;
+
+private:
+ std::unique_ptr<output_ports_type> my_output_ports;
+ static const size_t NUM_OUTPUTS = sizeof...(OutputTypes);
+
+protected:
+ void reset_node(reset_flags) override {}
+
+public:
+ __TBB_NOINLINE_SYM composite_node( graph &g ) : graph_node(g) {
+ fgt_composite( CODEPTR(), this, &g );
+ }
+
+ template<typename T>
+ void set_external_ports(T&& output_ports_tuple) {
+ static_assert(NUM_OUTPUTS == std::tuple_size<output_ports_type>::value, "number of arguments does not match number of output ports");
+
+ fgt_internal_output_alias_helper<T, NUM_OUTPUTS>::alias_port( this, output_ports_tuple);
+
+ my_output_ports.reset( new output_ports_type(std::forward<T>(output_ports_tuple)) );
+ }
+
+ template<typename... NodeTypes >
+ void add_visible_nodes(const NodeTypes&... n) { add_nodes_impl(this, true, n...); }
+
+ template<typename... NodeTypes >
+ void add_nodes(const NodeTypes&... n) { add_nodes_impl(this, false, n...); }
+
+
+ output_ports_type& output_ports() {
+ __TBB_ASSERT(my_output_ports, "output ports not set, call set_external_ports to set output ports");
+ return *my_output_ports;
+ }
+
+}; // class composite_node
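+
+// Illustrative sketch: a composite_node wraps internal nodes and re-exports selected
+// ports through set_external_ports, so the whole assembly can be wired up like a single
+// node. 'adder_sketch' is an example type, not part of the library.
+class adder_sketch : public composite_node< std::tuple<int, int>, std::tuple<int> > {
+    typedef composite_node< std::tuple<int, int>, std::tuple<int> > base_type;
+    join_node< std::tuple<int, int> > my_join;
+    function_node< std::tuple<int, int>, int > my_add;
+public:
+    adder_sketch(graph& g)
+        : base_type(g), my_join(g),
+          my_add(g, unlimited, [](const std::tuple<int, int>& v) {
+              return std::get<0>(v) + std::get<1>(v);
+          })
+    {
+        make_edge(my_join, my_add);
+        // expose the join's inputs and the adder's output as the composite's ports
+        set_external_ports(
+            input_ports_type(input_port<0>(my_join), input_port<1>(my_join)),
+            output_ports_type(my_add));
+    }
+};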
+
+template<typename Gateway>
+class async_body_base: no_assign {
+public:
+ typedef Gateway gateway_type;
+
+ async_body_base(gateway_type *gateway): my_gateway(gateway) { }
+ void set_gateway(gateway_type *gateway) {
+ my_gateway = gateway;
+ }
+
+protected:
+ gateway_type *my_gateway;
+};
+
+template<typename Input, typename Ports, typename Gateway, typename Body>
+class async_body: public async_body_base<Gateway> {
+public:
+ typedef async_body_base<Gateway> base_type;
+ typedef Gateway gateway_type;
+
+ async_body(const Body &body, gateway_type *gateway)
+ : base_type(gateway), my_body(body) { }
+
+ void operator()( const Input &v, Ports & ) {
+ my_body(v, *this->my_gateway);
+ }
+
+ Body get_body() { return my_body; }
+
+private:
+ Body my_body;
+};
+
+//! Implements async node
+template < typename Input, typename Output,
+ typename Policy = queueing_lightweight >
+class async_node
+ : public multifunction_node< Input, std::tuple< Output >, Policy >, public sender< Output >
+{
+ typedef multifunction_node< Input, std::tuple< Output >, Policy > base_type;
+ typedef multifunction_input<
+ Input, typename base_type::output_ports_type, Policy, cache_aligned_allocator<Input>> mfn_input_type;
+
+public:
+ typedef Input input_type;
+ typedef Output output_type;
+ typedef receiver<input_type> receiver_type;
+ typedef receiver<output_type> successor_type;
+ typedef sender<input_type> predecessor_type;
+ typedef receiver_gateway<output_type> gateway_type;
+ typedef async_body_base<gateway_type> async_body_base_type;
+ typedef typename base_type::output_ports_type output_ports_type;
+
+private:
+ class receiver_gateway_impl: public receiver_gateway<Output> {
+ public:
+ receiver_gateway_impl(async_node* node): my_node(node) {}
+ void reserve_wait() override {
+ fgt_async_reserve(static_cast<typename async_node::receiver_type *>(my_node), &my_node->my_graph);
+ my_node->my_graph.reserve_wait();
+ }
+
+ void release_wait() override {
+ async_node* n = my_node;
+ graph* g = &n->my_graph;
+ g->release_wait();
+ fgt_async_commit(static_cast<typename async_node::receiver_type *>(n), g);
+ }
+
+ //! Implements gateway_type::try_put for an external activity to submit a message to FG
+ bool try_put(const Output &i) override {
+ return my_node->try_put_impl(i);
+ }
+
+ private:
+ async_node* my_node;
+ } my_gateway;
+
+    // A substitute for 'this' in the member initializer list, used to avoid compiler warnings
+ async_node* self() { return this; }
+
+ //! Implements gateway_type::try_put for an external activity to submit a message to FG
+ bool try_put_impl(const Output &i) {
+ multifunction_output<Output> &port_0 = output_port<0>(*this);
+ broadcast_cache<output_type>& port_successors = port_0.successors();
+ fgt_async_try_put_begin(this, &port_0);
+ // TODO revamp: change to std::list<graph_task*>
+ graph_task_list tasks;
+ bool is_at_least_one_put_successful = port_successors.gather_successful_try_puts(i, tasks);
+ __TBB_ASSERT( is_at_least_one_put_successful || tasks.empty(),
+ "Return status is inconsistent with the method operation." );
+
+ while( !tasks.empty() ) {
+ enqueue_in_graph_arena(this->my_graph, tasks.pop_front());
+ }
+ fgt_async_try_put_end(this, &port_0);
+ return is_at_least_one_put_successful;
+ }
+
+public:
+ template<typename Body>
+ __TBB_NOINLINE_SYM async_node(
+ graph &g, size_t concurrency,
+ Body body, Policy = Policy(), node_priority_t a_priority = no_priority
+ ) : base_type(
+ g, concurrency,
+ async_body<Input, typename base_type::output_ports_type, gateway_type, Body>
+ (body, &my_gateway), a_priority ), my_gateway(self()) {
+ fgt_multioutput_node_with_body<1>(
+ CODEPTR(), FLOW_ASYNC_NODE,
+ &this->my_graph, static_cast<receiver<input_type> *>(this),
+ this->output_ports(), this->my_body
+ );
+ }
+
+ template <typename Body, typename... Args>
+ __TBB_NOINLINE_SYM async_node(graph& g, size_t concurrency, Body body, node_priority_t a_priority)
+ : async_node(g, concurrency, body, Policy(), a_priority) {}
+
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+ template <typename Body, typename... Args>
+ __TBB_NOINLINE_SYM async_node(
+ const node_set<Args...>& nodes, size_t concurrency, Body body,
+ Policy = Policy(), node_priority_t a_priority = no_priority )
+ : async_node(nodes.graph_reference(), concurrency, body, a_priority) {
+ make_edges_in_order(nodes, *this);
+ }
+
+ template <typename Body, typename... Args>
+ __TBB_NOINLINE_SYM async_node(const node_set<Args...>& nodes, size_t concurrency, Body body, node_priority_t a_priority)
+ : async_node(nodes, concurrency, body, Policy(), a_priority) {}
+#endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+
+ __TBB_NOINLINE_SYM async_node( const async_node &other ) : base_type(other), sender<Output>(), my_gateway(self()) {
+ static_cast<async_body_base_type*>(this->my_body->get_body_ptr())->set_gateway(&my_gateway);
+ static_cast<async_body_base_type*>(this->my_init_body->get_body_ptr())->set_gateway(&my_gateway);
+
+ fgt_multioutput_node_with_body<1>( CODEPTR(), FLOW_ASYNC_NODE,
+ &this->my_graph, static_cast<receiver<input_type> *>(this),
+ this->output_ports(), this->my_body );
+ }
+
+ gateway_type& gateway() {
+ return my_gateway;
+ }
+
+ // Define sender< Output >
+
+ //! Add a new successor to this node
+ bool register_successor(successor_type&) override {
+ __TBB_ASSERT(false, "Successors must be registered only via ports");
+ return false;
+ }
+
+ //! Removes a successor from this node
+ bool remove_successor(successor_type&) override {
+ __TBB_ASSERT(false, "Successors must be removed only via ports");
+ return false;
+ }
+
+ template<typename Body>
+ Body copy_function_object() {
+ typedef multifunction_body<input_type, typename base_type::output_ports_type> mfn_body_type;
+ typedef async_body<Input, typename base_type::output_ports_type, gateway_type, Body> async_body_type;
+ mfn_body_type &body_ref = *this->my_body;
+ async_body_type ab = *static_cast<async_body_type*>(dynamic_cast< multifunction_body_leaf<input_type, typename base_type::output_ports_type, async_body_type> & >(body_ref).get_body_ptr());
+ return ab.get_body();
+ }
+
+protected:
+
+ void reset_node( reset_flags f) override {
+ base_type::reset_node(f);
+ }
+};
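+
+// Illustrative sketch of the intended protocol: the body hands work to an external
+// activity through the gateway, and that activity later injects the result back with
+// try_put. std::thread merely stands in for the external activity here (assumes <thread>
+// is included); names below resolve in this namespace, user code spells them tbb::flow::...
+inline void async_node_usage_sketch() {
+    typedef async_node<int, int> async_t;
+    graph g;
+    async_t node(g, unlimited,
+        [](const int& input, async_t::gateway_type& gw) {
+            gw.reserve_wait();                  // keep the graph from completing early
+            std::thread([input, &gw] {
+                gw.try_put(input * 2);          // result re-enters the graph on port 0
+                gw.release_wait();              // balances reserve_wait()
+            }).detach();
+        });
+    node.try_put(21);
+    g.wait_for_all();                           // returns only after release_wait()
+}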
+
+#include "detail/_flow_graph_node_set_impl.h"
+
+template< typename T >
+class overwrite_node : public graph_node, public receiver<T>, public sender<T> {
+public:
+ typedef T input_type;
+ typedef T output_type;
+ typedef typename receiver<input_type>::predecessor_type predecessor_type;
+ typedef typename sender<output_type>::successor_type successor_type;
+
+ __TBB_NOINLINE_SYM explicit overwrite_node(graph &g)
+ : graph_node(g), my_successors(this), my_buffer_is_valid(false)
+ {
+ fgt_node( CODEPTR(), FLOW_OVERWRITE_NODE, &this->my_graph,
+ static_cast<receiver<input_type> *>(this), static_cast<sender<output_type> *>(this) );
+ }
+
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+ template <typename... Args>
+ overwrite_node(const node_set<Args...>& nodes) : overwrite_node(nodes.graph_reference()) {
+ make_edges_in_order(nodes, *this);
+ }
+#endif
+
+ //! Copy constructor; doesn't take anything from src; default won't work
+ __TBB_NOINLINE_SYM overwrite_node( const overwrite_node& src ) : overwrite_node(src.my_graph) {}
+
+ ~overwrite_node() {}
+
+ bool register_successor( successor_type &s ) override {
+ spin_mutex::scoped_lock l( my_mutex );
+ if (my_buffer_is_valid && is_graph_active( my_graph )) {
+ // We have a valid value that must be forwarded immediately.
+ bool ret = s.try_put( my_buffer );
+ if ( ret ) {
+ // We add the successor that accepted our put
+ my_successors.register_successor( s );
+ } else {
+                // When the successor uses reservation, a race can appear between the moment of the
+                // reservation and this register_successor call: a failed try_put does not mean the
+                // successor will be unable to accept a message right afterwards. Retrying inline
+                // could loop forever, because the reserving node tries to switch the edge to pull
+                // mode while overwrite_node switches it back to push mode. The loop is broken by
+                // deferring the work to a spawned task (register_predecessor_task below).
+ small_object_allocator allocator{};
+ typedef register_predecessor_task task_type;
+ graph_task* t = allocator.new_object<task_type>(graph_reference(), allocator, *this, s);
+ graph_reference().reserve_wait();
+ spawn_in_graph_arena( my_graph, *t );
+ }
+ } else {
+ // No valid value yet, just add as successor
+ my_successors.register_successor( s );
+ }
+ return true;
+ }
+
+ bool remove_successor( successor_type &s ) override {
+ spin_mutex::scoped_lock l( my_mutex );
+ my_successors.remove_successor(s);
+ return true;
+ }
+
+ bool try_get( input_type &v ) override {
+ spin_mutex::scoped_lock l( my_mutex );
+ if ( my_buffer_is_valid ) {
+ v = my_buffer;
+ return true;
+ }
+ return false;
+ }
+
+ //! Reserves an item
+ bool try_reserve( T &v ) override {
+ return try_get(v);
+ }
+
+ //! Releases the reserved item
+ bool try_release() override { return true; }
+
+ //! Consumes the reserved item
+ bool try_consume() override { return true; }
+
+ bool is_valid() {
+ spin_mutex::scoped_lock l( my_mutex );
+ return my_buffer_is_valid;
+ }
+
+ void clear() {
+ spin_mutex::scoped_lock l( my_mutex );
+ my_buffer_is_valid = false;
+ }
+
+protected:
+
+ template< typename R, typename B > friend class run_and_put_task;
+ template<typename X, typename Y> friend class broadcast_cache;
+ template<typename X, typename Y> friend class round_robin_cache;
+ graph_task* try_put_task( const input_type &v ) override {
+ spin_mutex::scoped_lock l( my_mutex );
+ return try_put_task_impl(v);
+ }
+
+ graph_task * try_put_task_impl(const input_type &v) {
+ my_buffer = v;
+ my_buffer_is_valid = true;
+ graph_task* rtask = my_successors.try_put_task(v);
+ if (!rtask) rtask = SUCCESSFULLY_ENQUEUED;
+ return rtask;
+ }
+
+ graph& graph_reference() const override {
+ return my_graph;
+ }
+
+ //! Breaks an infinite loop between the node reservation and register_successor call
+ struct register_predecessor_task : public graph_task {
+ register_predecessor_task(
+ graph& g, small_object_allocator& allocator, predecessor_type& owner, successor_type& succ)
+ : graph_task(g, allocator), o(owner), s(succ) {};
+
+ task* execute(execution_data& ed) override {
+ // TODO revamp: investigate why qualification is needed for register_successor() call
+ using tbb::detail::d1::register_predecessor;
+ using tbb::detail::d1::register_successor;
+ if ( !register_predecessor(s, o) ) {
+ register_successor(o, s);
+ }
+ finalize(ed);
+ return nullptr;
+ }
+
+ predecessor_type& o;
+ successor_type& s;
+ };
+
+ spin_mutex my_mutex;
+ broadcast_cache< input_type, null_rw_mutex > my_successors;
+ input_type my_buffer;
+ bool my_buffer_is_valid;
+
+ void reset_node( reset_flags f) override {
+ my_buffer_is_valid = false;
+ if (f&rf_clear_edges) {
+ my_successors.clear();
+ }
+ }
+}; // overwrite_node
+
+template< typename T >
+class write_once_node : public overwrite_node<T> {
+public:
+ typedef T input_type;
+ typedef T output_type;
+ typedef overwrite_node<T> base_type;
+ typedef typename receiver<input_type>::predecessor_type predecessor_type;
+ typedef typename sender<output_type>::successor_type successor_type;
+
+ //! Constructor
+ __TBB_NOINLINE_SYM explicit write_once_node(graph& g) : base_type(g) {
+ fgt_node( CODEPTR(), FLOW_WRITE_ONCE_NODE, &(this->my_graph),
+ static_cast<receiver<input_type> *>(this),
+ static_cast<sender<output_type> *>(this) );
+ }
+
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+ template <typename... Args>
+ write_once_node(const node_set<Args...>& nodes) : write_once_node(nodes.graph_reference()) {
+ make_edges_in_order(nodes, *this);
+ }
+#endif
+
+ //! Copy constructor: call base class copy constructor
+ __TBB_NOINLINE_SYM write_once_node( const write_once_node& src ) : base_type(src) {
+ fgt_node( CODEPTR(), FLOW_WRITE_ONCE_NODE, &(this->my_graph),
+ static_cast<receiver<input_type> *>(this),
+ static_cast<sender<output_type> *>(this) );
+ }
+
+protected:
+ template< typename R, typename B > friend class run_and_put_task;
+ template<typename X, typename Y> friend class broadcast_cache;
+ template<typename X, typename Y> friend class round_robin_cache;
+ graph_task *try_put_task( const T &v ) override {
+ spin_mutex::scoped_lock l( this->my_mutex );
+ return this->my_buffer_is_valid ? NULL : this->try_put_task_impl(v);
+ }
+}; // write_once_node
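+
+// Illustrative sketch contrasting the two single-item buffers above: overwrite_node keeps
+// the most recent value, while write_once_node keeps the first accepted one until clear().
+inline void single_item_buffers_sketch() {
+    graph g;
+    overwrite_node<int> latest(g);
+    write_once_node<int> first(g);
+
+    latest.try_put(1);
+    latest.try_put(2);             // overwrites; buffer now holds 2
+    first.try_put(1);
+    first.try_put(2);              // rejected; buffer still holds 1
+
+    int v = 0;
+    bool ok = latest.try_get(v);   // ok == true, v == 2
+    ok = first.try_get(v);         // ok == true, v == 1
+    first.clear();                 // the next try_put will be accepted again
+    (void)ok;
+    g.wait_for_all();
+}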
+
+inline void set_name(const graph& g, const char *name) {
+ fgt_graph_desc(&g, name);
+}
+
+template <typename Output>
+inline void set_name(const input_node<Output>& node, const char *name) {
+ fgt_node_desc(&node, name);
+}
+
+template <typename Input, typename Output, typename Policy>
+inline void set_name(const function_node<Input, Output, Policy>& node, const char *name) {
+ fgt_node_desc(&node, name);
+}
+
+template <typename Output, typename Policy>
+inline void set_name(const continue_node<Output,Policy>& node, const char *name) {
+ fgt_node_desc(&node, name);
+}
+
+template <typename T>
+inline void set_name(const broadcast_node<T>& node, const char *name) {
+ fgt_node_desc(&node, name);
+}
+
+template <typename T>
+inline void set_name(const buffer_node<T>& node, const char *name) {
+ fgt_node_desc(&node, name);
+}
+
+template <typename T>
+inline void set_name(const queue_node<T>& node, const char *name) {
+ fgt_node_desc(&node, name);
+}
+
+template <typename T>
+inline void set_name(const sequencer_node<T>& node, const char *name) {
+ fgt_node_desc(&node, name);
+}
+
+template <typename T, typename Compare>
+inline void set_name(const priority_queue_node<T, Compare>& node, const char *name) {
+ fgt_node_desc(&node, name);
+}
+
+template <typename T, typename DecrementType>
+inline void set_name(const limiter_node<T, DecrementType>& node, const char *name) {
+ fgt_node_desc(&node, name);
+}
+
+template <typename OutputTuple, typename JP>
+inline void set_name(const join_node<OutputTuple, JP>& node, const char *name) {
+ fgt_node_desc(&node, name);
+}
+
+template <typename... Types>
+inline void set_name(const indexer_node<Types...>& node, const char *name) {
+ fgt_node_desc(&node, name);
+}
+
+template <typename T>
+inline void set_name(const overwrite_node<T>& node, const char *name) {
+ fgt_node_desc(&node, name);
+}
+
+template <typename T>
+inline void set_name(const write_once_node<T>& node, const char *name) {
+ fgt_node_desc(&node, name);
+}
+
+template<typename Input, typename Output, typename Policy>
+inline void set_name(const multifunction_node<Input, Output, Policy>& node, const char *name) {
+ fgt_multioutput_node_desc(&node, name);
+}
+
+template<typename TupleType>
+inline void set_name(const split_node<TupleType>& node, const char *name) {
+ fgt_multioutput_node_desc(&node, name);
+}
+
+template< typename InputTuple, typename OutputTuple >
+inline void set_name(const composite_node<InputTuple, OutputTuple>& node, const char *name) {
+ fgt_multiinput_multioutput_node_desc(&node, name);
+}
+
+template<typename Input, typename Output, typename Policy>
+inline void set_name(const async_node<Input, Output, Policy>& node, const char *name)
+{
+ fgt_multioutput_node_desc(&node, name);
+}
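+
+// Illustrative sketch: attaching human-readable names for tracing/profiling tools. User
+// code calls these as tbb::profiling::set_name; the calls generally have no observable
+// effect unless profiling-tool support is enabled in the build.
+inline void set_name_usage_sketch() {
+    graph g;
+    queue_node<int> q(g);
+    set_name(g, "my pipeline");
+    set_name(q, "input queue");
+}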
+} // d1
+} // detail
+} // tbb
+
+
+// Include deduction guides for node classes
+#include "detail/_flow_graph_nodes_deduction.h"
+
+namespace tbb {
+namespace flow {
+inline namespace v1 {
+ using detail::d1::receiver;
+ using detail::d1::sender;
+
+ using detail::d1::serial;
+ using detail::d1::unlimited;
+
+ using detail::d1::reset_flags;
+ using detail::d1::rf_reset_protocol;
+ using detail::d1::rf_reset_bodies;
+ using detail::d1::rf_clear_edges;
+
+ using detail::d1::graph;
+ using detail::d1::graph_node;
+ using detail::d1::continue_msg;
+
+ using detail::d1::input_node;
+ using detail::d1::function_node;
+ using detail::d1::multifunction_node;
+ using detail::d1::split_node;
+ using detail::d1::output_port;
+ using detail::d1::indexer_node;
+ using detail::d1::tagged_msg;
+ using detail::d1::cast_to;
+ using detail::d1::is_a;
+ using detail::d1::continue_node;
+ using detail::d1::overwrite_node;
+ using detail::d1::write_once_node;
+ using detail::d1::broadcast_node;
+ using detail::d1::buffer_node;
+ using detail::d1::queue_node;
+ using detail::d1::sequencer_node;
+ using detail::d1::priority_queue_node;
+ using detail::d1::limiter_node;
+ using namespace detail::d1::graph_policy_namespace;
+ using detail::d1::join_node;
+ using detail::d1::input_port;
+ using detail::d1::copy_body;
+ using detail::d1::make_edge;
+ using detail::d1::remove_edge;
+ using detail::d1::tag_value;
+ using detail::d1::composite_node;
+ using detail::d1::async_node;
+ using detail::d1::node_priority_t;
+ using detail::d1::no_priority;
+
+#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
+ using detail::d1::follows;
+ using detail::d1::precedes;
+ using detail::d1::make_node_set;
+ using detail::d1::make_edges;
+#endif
+
+} // v1
+} // flow
+
+ using detail::d1::flow_control;
+
+namespace profiling {
+ using detail::d1::set_name;
+} // profiling
+
+} // tbb
+
+
+#if TBB_USE_PROFILING_TOOLS && ( __linux__ || __APPLE__ )
+ // We don't do pragma pop here, since it still gives warning on the USER side
+ #undef __TBB_NOINLINE_SYM
+#endif
+
+#endif // __TBB_flow_graph_H
diff --git a/contrib/libs/tbb/include/oneapi/tbb/flow_graph_abstractions.h b/contrib/libs/tbb/include/oneapi/tbb/flow_graph_abstractions.h
new file mode 100644
index 0000000000..121f167c4d
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/flow_graph_abstractions.h
@@ -0,0 +1,51 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_flow_graph_abstractions_H
+#define __TBB_flow_graph_abstractions_H
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+//! Pure virtual classes that define interfaces for asynchronous communication with a flow graph
+class graph_proxy {
+public:
+ //! Inform a graph that messages may come from outside, to prevent premature graph completion
+ virtual void reserve_wait() = 0;
+
+ //! Inform a graph that a previous call to reserve_wait is no longer in effect
+ virtual void release_wait() = 0;
+
+ virtual ~graph_proxy() {}
+};
+
+template <typename Input>
+class receiver_gateway : public graph_proxy {
+public:
+    //! Type of data input into the flow graph.
+ typedef Input input_type;
+
+ //! Submit signal from an asynchronous activity to FG.
+ virtual bool try_put(const input_type&) = 0;
+};
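+
+// Illustrative sketch (the function name is made up for the example): an external
+// activity holding a gateway submits its result into the flow graph and then balances
+// the reserve_wait() that the owning node's body is expected to have issued earlier.
+template <typename Result>
+void submit_async_result(receiver_gateway<Result>& gateway, const Result& result) {
+    gateway.try_put(result);     // hand the result back to the flow graph
+    gateway.release_wait();      // balance the earlier reserve_wait()
+}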
+
+} // d1
+
+
+} // detail
+} // tbb
+#endif
diff --git a/contrib/libs/tbb/include/oneapi/tbb/global_control.h b/contrib/libs/tbb/include/oneapi/tbb/global_control.h
new file mode 100644
index 0000000000..80177b6b82
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/global_control.h
@@ -0,0 +1,188 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_global_control_H
+#define __TBB_global_control_H
+
+#include "detail/_config.h"
+#include "detail/_namespace_injection.h"
+#include "detail/_assert.h"
+#include "detail/_template_helpers.h"
+#include "detail/_exception.h"
+
+#if __TBB_SUPPORTS_WORKERS_WAITING_IN_TERMINATE
+#include <new> // std::nothrow_t
+#endif
+#include <cstddef>
+
+namespace tbb {
+namespace detail {
+
+namespace d1 {
+class global_control;
+#if __TBB_SUPPORTS_WORKERS_WAITING_IN_TERMINATE
+class task_scheduler_handle;
+#endif
+}
+
+namespace r1 {
+void __TBB_EXPORTED_FUNC create(d1::global_control&);
+void __TBB_EXPORTED_FUNC destroy(d1::global_control&);
+std::size_t __TBB_EXPORTED_FUNC global_control_active_value(int);
+struct global_control_impl;
+struct control_storage_comparator;
+#if __TBB_SUPPORTS_WORKERS_WAITING_IN_TERMINATE
+void release_impl(d1::task_scheduler_handle& handle);
+bool finalize_impl(d1::task_scheduler_handle& handle);
+void __TBB_EXPORTED_FUNC get(d1::task_scheduler_handle&);
+bool __TBB_EXPORTED_FUNC finalize(d1::task_scheduler_handle&, std::intptr_t mode);
+#endif
+}
+
+namespace d1 {
+
+class global_control {
+public:
+ enum parameter {
+ max_allowed_parallelism,
+ thread_stack_size,
+ terminate_on_exception,
+#if __TBB_SUPPORTS_WORKERS_WAITING_IN_TERMINATE
+ scheduler_handle, // not a public parameter
+#else
+ reserved1, // not a public parameter
+#endif
+ parameter_max // insert new parameters above this point
+ };
+
+ global_control(parameter p, std::size_t value) :
+ my_value(value), my_reserved(), my_param(p) {
+ suppress_unused_warning(my_reserved);
+ __TBB_ASSERT(my_param < parameter_max, "Invalid parameter");
+#if __TBB_WIN8UI_SUPPORT && (_WIN32_WINNT < 0x0A00)
+ // For Windows 8 Store* apps it's impossible to set stack size
+ if (p==thread_stack_size)
+ return;
+#elif __TBB_x86_64 && (_WIN32 || _WIN64)
+ if (p==thread_stack_size)
+ __TBB_ASSERT_RELEASE((unsigned)value == value, "Stack size is limited to unsigned int range");
+#endif
+ if (my_param==max_allowed_parallelism)
+ __TBB_ASSERT_RELEASE(my_value>0, "max_allowed_parallelism cannot be 0.");
+ r1::create(*this);
+ }
+
+ ~global_control() {
+ __TBB_ASSERT(my_param < parameter_max, "Invalid parameter");
+#if __TBB_WIN8UI_SUPPORT && (_WIN32_WINNT < 0x0A00)
+ // For Windows 8 Store* apps it's impossible to set stack size
+ if (my_param==thread_stack_size)
+ return;
+#endif
+ r1::destroy(*this);
+ }
+
+ static std::size_t active_value(parameter p) {
+ __TBB_ASSERT(p < parameter_max, "Invalid parameter");
+ return r1::global_control_active_value((int)p);
+ }
+
+private:
+ std::size_t my_value;
+ std::intptr_t my_reserved; // TODO: substitution of global_control* not to break backward compatibility
+ parameter my_param;
+
+ friend struct r1::global_control_impl;
+ friend struct r1::control_storage_comparator;
+};
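+
+// Illustrative sketch: global_control is an RAII limiter; the previous setting is
+// restored automatically when the object is destroyed.
+inline void limit_parallelism_sketch() {
+    // allow at most 4 worker threads while 'limit' is alive
+    global_control limit(global_control::max_allowed_parallelism, 4);
+    // ... run parallel algorithms here ...
+}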
+
+#if __TBB_SUPPORTS_WORKERS_WAITING_IN_TERMINATE
+//! Finalization options.
+//! Outside of the class to avoid extensive friendship.
+static constexpr std::intptr_t release_nothrowing = 0;
+static constexpr std::intptr_t finalize_nothrowing = 1;
+static constexpr std::intptr_t finalize_throwing = 2;
+
+//! User side wrapper for a task scheduler lifetime control object
+class task_scheduler_handle {
+public:
+ task_scheduler_handle() = default;
+ ~task_scheduler_handle() {
+ release(*this);
+ }
+
+ //! No copy
+ task_scheduler_handle(const task_scheduler_handle& other) = delete;
+ task_scheduler_handle& operator=(const task_scheduler_handle& other) = delete;
+
+ //! Move only
+ task_scheduler_handle(task_scheduler_handle&& other) noexcept : m_ctl{nullptr} {
+ std::swap(m_ctl, other.m_ctl);
+ }
+ task_scheduler_handle& operator=(task_scheduler_handle&& other) noexcept {
+ std::swap(m_ctl, other.m_ctl);
+ return *this;
+ };
+
+    //! Get an active instance of task_scheduler_handle
+ static task_scheduler_handle get() {
+ task_scheduler_handle handle;
+ r1::get(handle);
+ return handle;
+ }
+
+ //! Release the reference and deactivate handle
+ static void release(task_scheduler_handle& handle) {
+ if (handle.m_ctl != nullptr) {
+ r1::finalize(handle, release_nothrowing);
+ }
+ }
+
+private:
+ friend void r1::release_impl(task_scheduler_handle& handle);
+ friend bool r1::finalize_impl(task_scheduler_handle& handle);
+ friend void __TBB_EXPORTED_FUNC r1::get(task_scheduler_handle&);
+
+ global_control* m_ctl{nullptr};
+};
+
+#if TBB_USE_EXCEPTIONS
+//! Waits for worker threads termination. Throws exception on error.
+inline void finalize(task_scheduler_handle& handle) {
+ r1::finalize(handle, finalize_throwing);
+}
+#endif
+//! Waits for worker threads termination. Returns false on error.
+inline bool finalize(task_scheduler_handle& handle, const std::nothrow_t&) noexcept {
+ return r1::finalize(handle, finalize_nothrowing);
+}
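+
+// Illustrative sketch: blocking until TBB worker threads have terminated, e.g. right
+// before the process unloads the library. The nothrow overload reports failure via its
+// return value instead of throwing.
+inline bool graceful_shutdown_sketch() {
+    task_scheduler_handle handle = task_scheduler_handle::get();
+    // ... run parallel work ...
+    return finalize(handle, std::nothrow);   // false if workers could not be joined
+}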
+#endif // __TBB_SUPPORTS_WORKERS_WAITING_IN_TERMINATE
+
+} // namespace d1
+} // namespace detail
+
+inline namespace v1 {
+using detail::d1::global_control;
+#if __TBB_SUPPORTS_WORKERS_WAITING_IN_TERMINATE
+using detail::d1::finalize;
+using detail::d1::task_scheduler_handle;
+using detail::r1::unsafe_wait;
+#endif
+} // namespace v1
+
+} // namespace tbb
+
+#endif // __TBB_global_control_H
diff --git a/contrib/libs/tbb/include/oneapi/tbb/info.h b/contrib/libs/tbb/include/oneapi/tbb/info.h
new file mode 100644
index 0000000000..21475a4d00
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/info.h
@@ -0,0 +1,137 @@
+/*
+ Copyright (c) 2019-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_info_H
+#define __TBB_info_H
+
+#include "detail/_config.h"
+#include "detail/_namespace_injection.h"
+
+#if __TBB_ARENA_BINDING
+#include <vector>
+
+namespace tbb {
+namespace detail {
+
+namespace d1{
+
+using numa_node_id = int;
+using core_type_id = int;
+
+// TODO: consider a versioning approach to resolve potential backward compatibility issues.
+struct constraints {
+#if !__TBB_CPP20_PRESENT
+ constraints(numa_node_id id = -1, int maximal_concurrency = -1)
+ : numa_id(id)
+ , max_concurrency(maximal_concurrency)
+#if __TBB_PREVIEW_TASK_ARENA_CONSTRAINTS_EXTENSION_PRESENT
+ , core_type(-1)
+ , max_threads_per_core(-1)
+#endif
+ {}
+#endif /*!__TBB_CPP20_PRESENT*/
+
+ constraints& set_numa_id(numa_node_id id) {
+ numa_id = id;
+ return *this;
+ }
+ constraints& set_max_concurrency(int maximal_concurrency) {
+ max_concurrency = maximal_concurrency;
+ return *this;
+ }
+#if __TBB_PREVIEW_TASK_ARENA_CONSTRAINTS_EXTENSION_PRESENT
+ constraints& set_core_type(core_type_id id) {
+ core_type = id;
+ return *this;
+ }
+ constraints& set_max_threads_per_core(int threads_number) {
+ max_threads_per_core = threads_number;
+ return *this;
+ }
+#endif
+
+ numa_node_id numa_id = -1;
+ int max_concurrency = -1;
+#if __TBB_PREVIEW_TASK_ARENA_CONSTRAINTS_EXTENSION_PRESENT
+ core_type_id core_type = -1;
+ int max_threads_per_core = -1;
+#endif
+};
+
+} // namespace d1
+
+namespace r1 {
+unsigned __TBB_EXPORTED_FUNC numa_node_count();
+void __TBB_EXPORTED_FUNC fill_numa_indices(int* index_array);
+int __TBB_EXPORTED_FUNC numa_default_concurrency(int numa_id);
+
+// Reserved parameters preserve binary backward compatibility in case of future changes.
+// They must be set to 0 for now.
+unsigned __TBB_EXPORTED_FUNC core_type_count(intptr_t reserved = 0);
+void __TBB_EXPORTED_FUNC fill_core_type_indices(int* index_array, intptr_t reserved = 0);
+
+int __TBB_EXPORTED_FUNC constraints_default_concurrency(const d1::constraints& c, intptr_t reserved = 0);
+int __TBB_EXPORTED_FUNC constraints_threads_per_core(const d1::constraints& c, intptr_t reserved = 0);
+} // namespace r1
+
+namespace d1 {
+
+inline std::vector<numa_node_id> numa_nodes() {
+ std::vector<numa_node_id> node_indices(r1::numa_node_count());
+ r1::fill_numa_indices(node_indices.data());
+ return node_indices;
+}
+
+inline int default_concurrency(numa_node_id id = -1) {
+ return r1::numa_default_concurrency(id);
+}
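+
+// Illustrative sketch: enumerate NUMA nodes and query their default concurrency. These
+// helpers are exposed publicly as tbb::info::numa_nodes and tbb::info::default_concurrency.
+inline void numa_topology_sketch() {
+    std::vector<numa_node_id> nodes = numa_nodes();
+    for (numa_node_id id : nodes) {
+        int threads = default_concurrency(id);
+        (void)threads;   // e.g. size one task_arena per NUMA node with this value
+    }
+}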
+
+#if __TBB_PREVIEW_TASK_ARENA_CONSTRAINTS_EXTENSION_PRESENT
+inline std::vector<core_type_id> core_types() {
+ std::vector<int> core_type_indexes(r1::core_type_count());
+ r1::fill_core_type_indices(core_type_indexes.data());
+ return core_type_indexes;
+}
+
+inline int default_concurrency(constraints c) {
+ return r1::constraints_default_concurrency(c);
+}
+#endif /*__TBB_PREVIEW_TASK_ARENA_CONSTRAINTS_EXTENSION_PRESENT*/
+
+} // namespace d1
+} // namespace detail
+
+inline namespace v1 {
+using detail::d1::numa_node_id;
+#if __TBB_PREVIEW_TASK_ARENA_CONSTRAINTS_EXTENSION_PRESENT
+using detail::d1::core_type_id;
+#endif
+
+namespace info {
+using detail::d1::numa_nodes;
+#if __TBB_PREVIEW_TASK_ARENA_CONSTRAINTS_EXTENSION_PRESENT
+using detail::d1::core_types;
+#endif
+
+using detail::d1::default_concurrency;
+} // namespace info
+} // namespace v1
+
+} // namespace tbb
+
+#endif /*__TBB_ARENA_BINDING*/
+
+#endif /*__TBB_info_H*/
diff --git a/contrib/libs/tbb/include/oneapi/tbb/memory_pool.h b/contrib/libs/tbb/include/oneapi/tbb/memory_pool.h
new file mode 100644
index 0000000000..6e913c6713
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/memory_pool.h
@@ -0,0 +1,272 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_memory_pool_H
+#define __TBB_memory_pool_H
+
+#if !TBB_PREVIEW_MEMORY_POOL
+#error Set TBB_PREVIEW_MEMORY_POOL to include memory_pool.h
+#endif
+/** @file */
+
+#include "scalable_allocator.h"
+
+#include <new> // std::bad_alloc
+#include <stdexcept> // std::runtime_error, std::invalid_argument
+#include <utility> // std::forward
+
+
+#if __TBB_EXTRA_DEBUG
+#define __TBBMALLOC_ASSERT ASSERT
+#else
+#define __TBBMALLOC_ASSERT(a,b) ((void)0)
+#endif
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+//! Base of thread-safe pool allocator for variable-size requests
+class pool_base : no_copy {
+    // The pool interface is kept separate from the standard allocator classes because it
+    // has to maintain internal state and is neither copyable nor assignable. Move and swap are possible.
+public:
+ //! Reset pool to reuse its memory (free all objects at once)
+ void recycle() { rml::pool_reset(my_pool); }
+
+ //! The "malloc" analogue to allocate block of memory of size bytes
+ void *malloc(size_t size) { return rml::pool_malloc(my_pool, size); }
+
+ //! The "free" analogue to discard a previously allocated piece of memory.
+ void free(void* ptr) { rml::pool_free(my_pool, ptr); }
+
+ //! The "realloc" analogue complementing pool_malloc.
+ // Enables some low-level optimization possibilities
+ void *realloc(void* ptr, size_t size) {
+ return rml::pool_realloc(my_pool, ptr, size);
+ }
+
+protected:
+ //! destroy pool - must be called in a child class
+ void destroy() { rml::pool_destroy(my_pool); }
+
+ rml::MemoryPool *my_pool;
+};
+
+#if _MSC_VER && !defined(__INTEL_COMPILER)
+ // Workaround for erroneous "unreferenced parameter" warning in method destroy.
+ #pragma warning (push)
+ #pragma warning (disable: 4100)
+#endif
+
+//! Meets "allocator" requirements of ISO C++ Standard, Section 20.1.5
+/** @ingroup memory_allocation */
+template<typename T, typename P = pool_base>
+class memory_pool_allocator {
+protected:
+ typedef P pool_type;
+ pool_type *my_pool;
+ template<typename U, typename R>
+ friend class memory_pool_allocator;
+ template<typename V, typename U, typename R>
+ friend bool operator==( const memory_pool_allocator<V,R>& a, const memory_pool_allocator<U,R>& b);
+ template<typename V, typename U, typename R>
+ friend bool operator!=( const memory_pool_allocator<V,R>& a, const memory_pool_allocator<U,R>& b);
+public:
+ typedef T value_type;
+ typedef value_type* pointer;
+ typedef const value_type* const_pointer;
+ typedef value_type& reference;
+ typedef const value_type& const_reference;
+ typedef size_t size_type;
+ typedef ptrdiff_t difference_type;
+ template<typename U> struct rebind {
+ typedef memory_pool_allocator<U, P> other;
+ };
+
+ explicit memory_pool_allocator(pool_type &pool) throw() : my_pool(&pool) {}
+ memory_pool_allocator(const memory_pool_allocator& src) throw() : my_pool(src.my_pool) {}
+ template<typename U>
+ memory_pool_allocator(const memory_pool_allocator<U,P>& src) throw() : my_pool(src.my_pool) {}
+
+ pointer address(reference x) const { return &x; }
+ const_pointer address(const_reference x) const { return &x; }
+
+ //! Allocate space for n objects.
+ pointer allocate( size_type n, const void* /*hint*/ = 0) {
+ pointer p = static_cast<pointer>( my_pool->malloc( n*sizeof(value_type) ) );
+ if (!p)
+ throw_exception(std::bad_alloc());
+ return p;
+ }
+ //! Free previously allocated block of memory.
+ void deallocate( pointer p, size_type ) {
+ my_pool->free(p);
+ }
+ //! Largest value for which method allocate might succeed.
+ size_type max_size() const throw() {
+ size_type max = static_cast<size_type>(-1) / sizeof (value_type);
+ return (max > 0 ? max : 1);
+ }
+ //! Copy-construct value at location pointed to by p.
+
+ template<typename U, typename... Args>
+ void construct(U *p, Args&&... args)
+ { ::new((void *)p) U(std::forward<Args>(args)...); }
+
+ //! Destroy value at location pointed to by p.
+ void destroy( pointer p ) { p->~value_type(); }
+
+};
+
+#if _MSC_VER && !defined(__INTEL_COMPILER)
+ #pragma warning (pop)
+#endif // warning 4100 is back
+
+//! Analogous to std::allocator<void>, as defined in ISO C++ Standard, Section 20.4.1
+/** @ingroup memory_allocation */
+template<typename P>
+class memory_pool_allocator<void, P> {
+public:
+ typedef P pool_type;
+ typedef void* pointer;
+ typedef const void* const_pointer;
+ typedef void value_type;
+ template<typename U> struct rebind {
+ typedef memory_pool_allocator<U, P> other;
+ };
+
+ explicit memory_pool_allocator( pool_type &pool) throw() : my_pool(&pool) {}
+ memory_pool_allocator( const memory_pool_allocator& src) throw() : my_pool(src.my_pool) {}
+ template<typename U>
+ memory_pool_allocator(const memory_pool_allocator<U,P>& src) throw() : my_pool(src.my_pool) {}
+
+protected:
+ pool_type *my_pool;
+ template<typename U, typename R>
+ friend class memory_pool_allocator;
+ template<typename V, typename U, typename R>
+ friend bool operator==( const memory_pool_allocator<V,R>& a, const memory_pool_allocator<U,R>& b);
+ template<typename V, typename U, typename R>
+ friend bool operator!=( const memory_pool_allocator<V,R>& a, const memory_pool_allocator<U,R>& b);
+};
+
+template<typename T, typename U, typename P>
+inline bool operator==( const memory_pool_allocator<T,P>& a, const memory_pool_allocator<U,P>& b) {return a.my_pool==b.my_pool;}
+
+template<typename T, typename U, typename P>
+inline bool operator!=( const memory_pool_allocator<T,P>& a, const memory_pool_allocator<U,P>& b) {return a.my_pool!=b.my_pool;}
+
+//! Thread-safe growable pool allocator for variable-size requests
+template <typename Alloc>
+class memory_pool : public pool_base {
+ Alloc my_alloc; // TODO: base-class optimization
+ static void *allocate_request(intptr_t pool_id, size_t & bytes);
+ static int deallocate_request(intptr_t pool_id, void*, size_t raw_bytes);
+
+public:
+ //! construct pool with underlying allocator
+ explicit memory_pool(const Alloc &src = Alloc());
+
+ //! destroy pool
+    ~memory_pool() { destroy(); } // call the callbacks first and destroy my_alloc later
+};
+
+class fixed_pool : public pool_base {
+ void *my_buffer;
+ size_t my_size;
+ inline static void *allocate_request(intptr_t pool_id, size_t & bytes);
+
+public:
+    //! construct pool from an externally provided fixed-size buffer
+ inline fixed_pool(void *buf, size_t size);
+ //! destroy pool
+ ~fixed_pool() { destroy(); }
+};
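+
+/** Illustrative usage sketch: a growable memory_pool backed by a standard allocator can feed
+    STL containers through memory_pool_allocator, while fixed_pool serves requests from a
+    caller-provided buffer. The names my_pool, my_list, f_pool and buf, and the sizes, are
+    hypothetical; memory_pool is a preview feature and may require TBB_PREVIEW_MEMORY_POOL
+    to be defined before including this header.
+    \code
+    tbb::memory_pool<std::allocator<char>> my_pool;
+    typedef tbb::memory_pool_allocator<int> pool_allocator_t;
+    std::list<int, pool_allocator_t> my_list(pool_allocator_t(my_pool)); // nodes allocated from my_pool
+
+    char buf[1024*1024];                       // fixed region of memory
+    tbb::fixed_pool f_pool(buf, sizeof(buf));  // pool that never grows beyond buf
+    void* p = f_pool.malloc(64);               // raw allocation served from buf
+    f_pool.free(p);
+    \endcode
+*/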
+
+//////////////// Implementation ///////////////
+
+template <typename Alloc>
+memory_pool<Alloc>::memory_pool(const Alloc &src) : my_alloc(src) {
+ rml::MemPoolPolicy args(allocate_request, deallocate_request,
+ sizeof(typename Alloc::value_type));
+ rml::MemPoolError res = rml::pool_create_v1(intptr_t(this), &args, &my_pool);
+ if (res!=rml::POOL_OK)
+ throw_exception(std::runtime_error("Can't create pool"));
+}
+template <typename Alloc>
+void *memory_pool<Alloc>::allocate_request(intptr_t pool_id, size_t & bytes) {
+ memory_pool<Alloc> &self = *reinterpret_cast<memory_pool<Alloc>*>(pool_id);
+ const size_t unit_size = sizeof(typename Alloc::value_type);
+ __TBBMALLOC_ASSERT( 0 == bytes%unit_size, NULL);
+ void *ptr;
+#if TBB_USE_EXCEPTIONS
+ try {
+#endif
+ ptr = self.my_alloc.allocate( bytes/unit_size );
+#if TBB_USE_EXCEPTIONS
+ } catch(...) {
+ return 0;
+ }
+#endif
+ return ptr;
+}
+#if __TBB_MSVC_UNREACHABLE_CODE_IGNORED
+ // Workaround for erroneous "unreachable code" warning in the template below.
+ // Specific for VC++ 17-18 compiler
+ #pragma warning (push)
+ #pragma warning (disable: 4702)
+#endif
+template <typename Alloc>
+int memory_pool<Alloc>::deallocate_request(intptr_t pool_id, void* raw_ptr, size_t raw_bytes) {
+ memory_pool<Alloc> &self = *reinterpret_cast<memory_pool<Alloc>*>(pool_id);
+ const size_t unit_size = sizeof(typename Alloc::value_type);
+ __TBBMALLOC_ASSERT( 0 == raw_bytes%unit_size, NULL);
+ self.my_alloc.deallocate( static_cast<typename Alloc::value_type*>(raw_ptr), raw_bytes/unit_size );
+ return 0;
+}
+#if __TBB_MSVC_UNREACHABLE_CODE_IGNORED
+ #pragma warning (pop)
+#endif
+inline fixed_pool::fixed_pool(void *buf, size_t size) : my_buffer(buf), my_size(size) {
+ if (!buf || !size)
+ // TODO: improve support for mode with exceptions disabled
+ throw_exception(std::invalid_argument("Zero in parameter is invalid"));
+ rml::MemPoolPolicy args(allocate_request, 0, size, /*fixedPool=*/true);
+ rml::MemPoolError res = rml::pool_create_v1(intptr_t(this), &args, &my_pool);
+ if (res!=rml::POOL_OK)
+ throw_exception(std::runtime_error("Can't create pool"));
+}
+inline void *fixed_pool::allocate_request(intptr_t pool_id, size_t & bytes) {
+ fixed_pool &self = *reinterpret_cast<fixed_pool*>(pool_id);
+ __TBBMALLOC_ASSERT(0 != self.my_size, "The buffer must not be used twice.");
+ bytes = self.my_size;
+ self.my_size = 0; // remember that buffer has been used
+ return self.my_buffer;
+}
+
+} // namespace d1
+} // namespace detail
+
+inline namespace v1 {
+using detail::d1::memory_pool_allocator;
+using detail::d1::memory_pool;
+using detail::d1::fixed_pool;
+} // inline namespace v1
+} // namespace tbb
+
+#undef __TBBMALLOC_ASSERT
+#endif// __TBB_memory_pool_H
diff --git a/contrib/libs/tbb/include/oneapi/tbb/null_mutex.h b/contrib/libs/tbb/include/oneapi/tbb/null_mutex.h
new file mode 100644
index 0000000000..8fab863db3
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/null_mutex.h
@@ -0,0 +1,79 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_null_mutex_H
+#define __TBB_null_mutex_H
+
+#include "detail/_config.h"
+#include "detail/_namespace_injection.h"
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+//! A mutex which does nothing
+/** A null_mutex does no operation and simulates success.
+ @ingroup synchronization */
+class null_mutex {
+public:
+ //! Constructors
+ constexpr null_mutex() noexcept = default;
+
+ //! Destructor
+ ~null_mutex() = default;
+
+ //! No Copy
+ null_mutex(const null_mutex&) = delete;
+ null_mutex& operator=(const null_mutex&) = delete;
+
+ //! Represents acquisition of a mutex.
+ class scoped_lock {
+ public:
+ //! Constructors
+ constexpr scoped_lock() noexcept = default;
+ scoped_lock(null_mutex&) {}
+
+ //! Destructor
+ ~scoped_lock() = default;
+
+ //! No Copy
+ scoped_lock(const scoped_lock&) = delete;
+ scoped_lock& operator=(const scoped_lock&) = delete;
+
+ void acquire(null_mutex&) {}
+ bool try_acquire(null_mutex&) { return true; }
+ void release() {}
+ };
+
+ //! Mutex traits
+ static constexpr bool is_rw_mutex = false;
+ static constexpr bool is_recursive_mutex = true;
+ static constexpr bool is_fair_mutex = true;
+
+ void lock() {}
+ bool try_lock() { return true; }
+ void unlock() {}
+}; // class null_mutex
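+
+/** Illustrative usage sketch: null_mutex is a zero-cost stand-in for a real mutex in generic
+    code, e.g. when a template is instantiated for single-threaded use. The locked_counter
+    template below is hypothetical; tbb::spin_mutex is shown only for contrast.
+    \code
+    template <typename Mutex>
+    class locked_counter {
+        Mutex my_mutex;
+        int my_value = 0;
+    public:
+        int increment() {
+            typename Mutex::scoped_lock lock(my_mutex); // no-op when Mutex is null_mutex
+            return ++my_value;
+        }
+    };
+
+    locked_counter<tbb::null_mutex> serial_counter;     // locking compiles away
+    locked_counter<tbb::spin_mutex> concurrent_counter; // real locking
+    \endcode
+*/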
+
+} // namespace d1
+} // namespace detail
+
+inline namespace v1 {
+using detail::d1::null_mutex;
+} // namespace v1
+} // namespace tbb
+
+#endif /* __TBB_null_mutex_H */
diff --git a/contrib/libs/tbb/include/oneapi/tbb/null_rw_mutex.h b/contrib/libs/tbb/include/oneapi/tbb/null_rw_mutex.h
new file mode 100644
index 0000000000..8046bc405d
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/null_rw_mutex.h
@@ -0,0 +1,84 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_null_rw_mutex_H
+#define __TBB_null_rw_mutex_H
+
+#include "detail/_config.h"
+#include "detail/_namespace_injection.h"
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+//! A rw mutex which does nothing
+/** A null_rw_mutex is a rw mutex that does nothing and simulates successful operation.
+ @ingroup synchronization */
+class null_rw_mutex {
+public:
+ //! Constructors
+ constexpr null_rw_mutex() noexcept = default;
+
+ //! Destructor
+ ~null_rw_mutex() = default;
+
+ //! No Copy
+ null_rw_mutex(const null_rw_mutex&) = delete;
+ null_rw_mutex& operator=(const null_rw_mutex&) = delete;
+
+ //! Represents acquisition of a mutex.
+ class scoped_lock {
+ public:
+ //! Constructors
+ constexpr scoped_lock() noexcept = default;
+ scoped_lock(null_rw_mutex&, bool = true) {}
+
+ //! Destructor
+ ~scoped_lock() = default;
+
+ //! No Copy
+ scoped_lock(const scoped_lock&) = delete;
+ scoped_lock& operator=(const scoped_lock&) = delete;
+
+ void acquire(null_rw_mutex&, bool = true) {}
+ bool try_acquire(null_rw_mutex&, bool = true) { return true; }
+ void release() {}
+ bool upgrade_to_writer() { return true; }
+ bool downgrade_to_reader() { return true; }
+ };
+
+ //! Mutex traits
+ static constexpr bool is_rw_mutex = true;
+ static constexpr bool is_recursive_mutex = true;
+ static constexpr bool is_fair_mutex = true;
+
+ void lock() {}
+ bool try_lock() { return true; }
+ void unlock() {}
+ void lock_shared() {}
+ bool try_lock_shared() { return true; }
+ void unlock_shared() {}
+}; // class null_rw_mutex
+
+} // namespace d1
+} // namespace detail
+
+inline namespace v1 {
+using detail::d1::null_rw_mutex;
+} // namespace v1
+} // namespace tbb
+
+#endif /* __TBB_null_rw_mutex_H */
diff --git a/contrib/libs/tbb/include/oneapi/tbb/parallel_for.h b/contrib/libs/tbb/include/oneapi/tbb/parallel_for.h
new file mode 100644
index 0000000000..ed137d4d09
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/parallel_for.h
@@ -0,0 +1,416 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_parallel_for_H
+#define __TBB_parallel_for_H
+
+#include "detail/_config.h"
+#include "detail/_namespace_injection.h"
+#include "detail/_exception.h"
+#include "detail/_task.h"
+#include "detail/_small_object_pool.h"
+#include "profiling.h"
+
+#include "partitioner.h"
+#include "blocked_range.h"
+#include "task_group.h"
+
+#include <cstddef>
+#include <new>
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+//! Task type used in parallel_for
+/** @ingroup algorithms */
+template<typename Range, typename Body, typename Partitioner>
+struct start_for : public task {
+ Range my_range;
+ const Body my_body;
+ node* my_parent;
+
+ typename Partitioner::task_partition_type my_partition;
+ small_object_allocator my_allocator;
+
+ task* execute(execution_data&) override;
+ task* cancel(execution_data&) override;
+ void finalize(const execution_data&);
+
+ //! Constructor for root task.
+ start_for( const Range& range, const Body& body, Partitioner& partitioner, small_object_allocator& alloc ) :
+ my_range(range),
+ my_body(body),
+ my_partition(partitioner),
+ my_allocator(alloc) {}
+ //! Splitting constructor used to generate children.
+ /** parent_ becomes left child. Newly constructed object is right child. */
+ start_for( start_for& parent_, typename Partitioner::split_type& split_obj, small_object_allocator& alloc ) :
+ my_range(parent_.my_range, get_range_split_object<Range>(split_obj)),
+ my_body(parent_.my_body),
+ my_partition(parent_.my_partition, split_obj),
+ my_allocator(alloc) {}
+ //! Construct right child from the given range as response to the demand.
+ /** parent_ remains left child. Newly constructed object is right child. */
+ start_for( start_for& parent_, const Range& r, depth_t d, small_object_allocator& alloc ) :
+ my_range(r),
+ my_body(parent_.my_body),
+ my_partition(parent_.my_partition, split()),
+ my_allocator(alloc)
+ {
+ my_partition.align_depth( d );
+ }
+ static void run(const Range& range, const Body& body, Partitioner& partitioner) {
+ task_group_context context(PARALLEL_FOR);
+ run(range, body, partitioner, context);
+ }
+
+ static void run(const Range& range, const Body& body, Partitioner& partitioner, task_group_context& context) {
+ if ( !range.empty() ) {
+ small_object_allocator alloc{};
+ start_for& for_task = *alloc.new_object<start_for>(range, body, partitioner, alloc);
+
+ // defer creation of the wait node until task allocation succeeds
+ wait_node wn;
+ for_task.my_parent = &wn;
+ execute_and_wait(for_task, context, wn.m_wait, context);
+ }
+ }
+ //! Run body for range, serves as callback for partitioner
+ void run_body( Range &r ) {
+ my_body( r );
+ }
+
+ //! spawn right task, serves as callback for partitioner
+ void offer_work(typename Partitioner::split_type& split_obj, execution_data& ed) {
+ offer_work_impl(ed, *this, split_obj);
+ }
+
+ //! spawn right task, serves as callback for partitioner
+ void offer_work(const Range& r, depth_t d, execution_data& ed) {
+ offer_work_impl(ed, *this, r, d);
+ }
+
+private:
+ template <typename... Args>
+ void offer_work_impl(execution_data& ed, Args&&... constructor_args) {
+ // New right child
+ small_object_allocator alloc{};
+ start_for& right_child = *alloc.new_object<start_for>(ed, std::forward<Args>(constructor_args)..., alloc);
+
+ // New root node as a continuation and ref count. Left and right child attach to the new parent.
+ right_child.my_parent = my_parent = alloc.new_object<tree_node>(ed, my_parent, 2, alloc);
+ // Spawn the right sibling
+ right_child.spawn_self(ed);
+ }
+
+ void spawn_self(execution_data& ed) {
+ my_partition.spawn_task(*this, *context(ed));
+ }
+};
+
+//! fold the tree and deallocate the task
+template<typename Range, typename Body, typename Partitioner>
+void start_for<Range, Body, Partitioner>::finalize(const execution_data& ed) {
+    // Save the parent and a copy of the allocator before the task object is destroyed
+ node* parent = my_parent;
+ auto allocator = my_allocator;
+ // Task execution finished - destroy it
+ this->~start_for();
+    // Unwind the tree, decrementing the parent's reference count
+    fold_tree<tree_node>(parent, ed);
+    allocator.deallocate(this, ed);
+}
+
+//! execute task for parallel_for
+template<typename Range, typename Body, typename Partitioner>
+task* start_for<Range, Body, Partitioner>::execute(execution_data& ed) {
+ if (!is_same_affinity(ed)) {
+ my_partition.note_affinity(execution_slot(ed));
+ }
+ my_partition.check_being_stolen(*this, ed);
+ my_partition.execute(*this, my_range, ed);
+ finalize(ed);
+ return nullptr;
+}
+
+//! cancel task for parallel_for
+template<typename Range, typename Body, typename Partitioner>
+task* start_for<Range, Body, Partitioner>::cancel(execution_data& ed) {
+ finalize(ed);
+ return nullptr;
+}
+
+//! Calls the function with values from range [begin, end) with a step provided
+template<typename Function, typename Index>
+class parallel_for_body : detail::no_assign {
+ const Function &my_func;
+ const Index my_begin;
+ const Index my_step;
+public:
+ parallel_for_body( const Function& _func, Index& _begin, Index& _step )
+ : my_func(_func), my_begin(_begin), my_step(_step) {}
+
+ void operator()( const blocked_range<Index>& r ) const {
+ // A set of local variables to help the compiler with vectorization of the following loop.
+ Index b = r.begin();
+ Index e = r.end();
+ Index ms = my_step;
+ Index k = my_begin + b*ms;
+
+#if __INTEL_COMPILER
+#pragma ivdep
+#if __TBB_ASSERT_ON_VECTORIZATION_FAILURE
+#pragma vector always assert
+#endif
+#endif
+ for ( Index i = b; i < e; ++i, k += ms ) {
+ my_func( k );
+ }
+ }
+};
+
+// Requirements on Range concept are documented in blocked_range.h
+
+/** \page parallel_for_body_req Requirements on parallel_for body
+ Class \c Body implementing the concept of parallel_for body must define:
+ - \code Body::Body( const Body& ); \endcode Copy constructor
+ - \code Body::~Body(); \endcode Destructor
+ - \code void Body::operator()( Range& r ) const; \endcode Function call operator applying the body to range \c r.
+**/
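+
+/** Illustrative usage sketch: the first call uses the range-based form with a lambda as the
+    Body, the second uses the index-based form with an explicit step. The vector a, the size n,
+    and the function compute are hypothetical.
+    \code
+    std::vector<float> a(n);
+
+    // Range-based form: the partitioner splits [0, a.size()) into chunks run in parallel.
+    tbb::parallel_for(tbb::blocked_range<std::size_t>(0, a.size()),
+                      [&](const tbb::blocked_range<std::size_t>& r) {
+                          for (std::size_t i = r.begin(); i != r.end(); ++i)
+                              a[i] = compute(i);
+                      });
+
+    // Index-based form with an explicit step: visits i = 0, 2, 4, ...
+    tbb::parallel_for(std::size_t(0), a.size(), std::size_t(2),
+                      [&](std::size_t i) { a[i] = compute(i); });
+    \endcode
+*/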
+
+/** \name parallel_for
+ See also requirements on \ref range_req "Range" and \ref parallel_for_body_req "parallel_for Body". **/
+//@{
+
+//! Parallel iteration over range with default partitioner.
+/** @ingroup algorithms **/
+template<typename Range, typename Body>
+void parallel_for( const Range& range, const Body& body ) {
+ start_for<Range,Body,const __TBB_DEFAULT_PARTITIONER>::run(range,body,__TBB_DEFAULT_PARTITIONER());
+}
+
+//! Parallel iteration over range with simple partitioner.
+/** @ingroup algorithms **/
+template<typename Range, typename Body>
+void parallel_for( const Range& range, const Body& body, const simple_partitioner& partitioner ) {
+ start_for<Range,Body,const simple_partitioner>::run(range,body,partitioner);
+}
+
+//! Parallel iteration over range with auto_partitioner.
+/** @ingroup algorithms **/
+template<typename Range, typename Body>
+void parallel_for( const Range& range, const Body& body, const auto_partitioner& partitioner ) {
+ start_for<Range,Body,const auto_partitioner>::run(range,body,partitioner);
+}
+
+//! Parallel iteration over range with static_partitioner.
+/** @ingroup algorithms **/
+template<typename Range, typename Body>
+void parallel_for( const Range& range, const Body& body, const static_partitioner& partitioner ) {
+ start_for<Range,Body,const static_partitioner>::run(range,body,partitioner);
+}
+
+//! Parallel iteration over range with affinity_partitioner.
+/** @ingroup algorithms **/
+template<typename Range, typename Body>
+void parallel_for( const Range& range, const Body& body, affinity_partitioner& partitioner ) {
+ start_for<Range,Body,affinity_partitioner>::run(range,body,partitioner);
+}
+
+//! Parallel iteration over range with default partitioner and user-supplied context.
+/** @ingroup algorithms **/
+template<typename Range, typename Body>
+void parallel_for( const Range& range, const Body& body, task_group_context& context ) {
+ start_for<Range,Body,const __TBB_DEFAULT_PARTITIONER>::run(range, body, __TBB_DEFAULT_PARTITIONER(), context);
+}
+
+//! Parallel iteration over range with simple partitioner and user-supplied context.
+/** @ingroup algorithms **/
+template<typename Range, typename Body>
+void parallel_for( const Range& range, const Body& body, const simple_partitioner& partitioner, task_group_context& context ) {
+ start_for<Range,Body,const simple_partitioner>::run(range, body, partitioner, context);
+}
+
+//! Parallel iteration over range with auto_partitioner and user-supplied context.
+/** @ingroup algorithms **/
+template<typename Range, typename Body>
+void parallel_for( const Range& range, const Body& body, const auto_partitioner& partitioner, task_group_context& context ) {
+ start_for<Range,Body,const auto_partitioner>::run(range, body, partitioner, context);
+}
+
+//! Parallel iteration over range with static_partitioner and user-supplied context.
+/** @ingroup algorithms **/
+template<typename Range, typename Body>
+void parallel_for( const Range& range, const Body& body, const static_partitioner& partitioner, task_group_context& context ) {
+ start_for<Range,Body,const static_partitioner>::run(range, body, partitioner, context);
+}
+
+//! Parallel iteration over range with affinity_partitioner and user-supplied context.
+/** @ingroup algorithms **/
+template<typename Range, typename Body>
+void parallel_for( const Range& range, const Body& body, affinity_partitioner& partitioner, task_group_context& context ) {
+ start_for<Range,Body,affinity_partitioner>::run(range,body,partitioner, context);
+}
+
+//! Implementation of parallel iteration over stepped range of integers with explicit step and partitioner
+template <typename Index, typename Function, typename Partitioner>
+void parallel_for_impl(Index first, Index last, Index step, const Function& f, Partitioner& partitioner) {
+ if (step <= 0 )
+ throw_exception(exception_id::nonpositive_step); // throws std::invalid_argument
+ else if (last > first) {
+ // Above "else" avoids "potential divide by zero" warning on some platforms
+ Index end = (last - first - Index(1)) / step + Index(1);
+ blocked_range<Index> range(static_cast<Index>(0), end);
+ parallel_for_body<Function, Index> body(f, first, step);
+ parallel_for(range, body, partitioner);
+ }
+}
+
+//! Parallel iteration over a range of integers with a step provided and default partitioner
+template <typename Index, typename Function>
+void parallel_for(Index first, Index last, Index step, const Function& f) {
+ parallel_for_impl<Index,Function,const auto_partitioner>(first, last, step, f, auto_partitioner());
+}
+//! Parallel iteration over a range of integers with a step provided and simple partitioner
+template <typename Index, typename Function>
+void parallel_for(Index first, Index last, Index step, const Function& f, const simple_partitioner& partitioner) {
+ parallel_for_impl<Index,Function,const simple_partitioner>(first, last, step, f, partitioner);
+}
+//! Parallel iteration over a range of integers with a step provided and auto partitioner
+template <typename Index, typename Function>
+void parallel_for(Index first, Index last, Index step, const Function& f, const auto_partitioner& partitioner) {
+ parallel_for_impl<Index,Function,const auto_partitioner>(first, last, step, f, partitioner);
+}
+//! Parallel iteration over a range of integers with a step provided and static partitioner
+template <typename Index, typename Function>
+void parallel_for(Index first, Index last, Index step, const Function& f, const static_partitioner& partitioner) {
+ parallel_for_impl<Index,Function,const static_partitioner>(first, last, step, f, partitioner);
+}
+//! Parallel iteration over a range of integers with a step provided and affinity partitioner
+template <typename Index, typename Function>
+void parallel_for(Index first, Index last, Index step, const Function& f, affinity_partitioner& partitioner) {
+ parallel_for_impl(first, last, step, f, partitioner);
+}
+
+//! Parallel iteration over a range of integers with a default step value and default partitioner
+template <typename Index, typename Function>
+void parallel_for(Index first, Index last, const Function& f) {
+ parallel_for_impl<Index,Function,const auto_partitioner>(first, last, static_cast<Index>(1), f, auto_partitioner());
+}
+//! Parallel iteration over a range of integers with a default step value and simple partitioner
+template <typename Index, typename Function>
+void parallel_for(Index first, Index last, const Function& f, const simple_partitioner& partitioner) {
+ parallel_for_impl<Index,Function,const simple_partitioner>(first, last, static_cast<Index>(1), f, partitioner);
+}
+//! Parallel iteration over a range of integers with a default step value and auto partitioner
+template <typename Index, typename Function>
+void parallel_for(Index first, Index last, const Function& f, const auto_partitioner& partitioner) {
+ parallel_for_impl<Index,Function,const auto_partitioner>(first, last, static_cast<Index>(1), f, partitioner);
+}
+//! Parallel iteration over a range of integers with a default step value and static partitioner
+template <typename Index, typename Function>
+void parallel_for(Index first, Index last, const Function& f, const static_partitioner& partitioner) {
+ parallel_for_impl<Index,Function,const static_partitioner>(first, last, static_cast<Index>(1), f, partitioner);
+}
+//! Parallel iteration over a range of integers with a default step value and affinity partitioner
+template <typename Index, typename Function>
+void parallel_for(Index first, Index last, const Function& f, affinity_partitioner& partitioner) {
+ parallel_for_impl(first, last, static_cast<Index>(1), f, partitioner);
+}
+
+//! Implementation of parallel iteration over stepped range of integers with explicit step, task group context, and partitioner
+template <typename Index, typename Function, typename Partitioner>
+void parallel_for_impl(Index first, Index last, Index step, const Function& f, Partitioner& partitioner, task_group_context &context) {
+ if (step <= 0 )
+ throw_exception(exception_id::nonpositive_step); // throws std::invalid_argument
+ else if (last > first) {
+ // Above "else" avoids "potential divide by zero" warning on some platforms
+ Index end = (last - first - Index(1)) / step + Index(1);
+ blocked_range<Index> range(static_cast<Index>(0), end);
+ parallel_for_body<Function, Index> body(f, first, step);
+ parallel_for(range, body, partitioner, context);
+ }
+}
+
+//! Parallel iteration over a range of integers with explicit step, task group context, and default partitioner
+template <typename Index, typename Function>
+void parallel_for(Index first, Index last, Index step, const Function& f, task_group_context &context) {
+ parallel_for_impl<Index,Function,const auto_partitioner>(first, last, step, f, auto_partitioner(), context);
+}
+//! Parallel iteration over a range of integers with explicit step, task group context, and simple partitioner
+template <typename Index, typename Function>
+void parallel_for(Index first, Index last, Index step, const Function& f, const simple_partitioner& partitioner, task_group_context &context) {
+ parallel_for_impl<Index,Function,const simple_partitioner>(first, last, step, f, partitioner, context);
+}
+//! Parallel iteration over a range of integers with explicit step, task group context, and auto partitioner
+template <typename Index, typename Function>
+void parallel_for(Index first, Index last, Index step, const Function& f, const auto_partitioner& partitioner, task_group_context &context) {
+ parallel_for_impl<Index,Function,const auto_partitioner>(first, last, step, f, partitioner, context);
+}
+//! Parallel iteration over a range of integers with explicit step, task group context, and static partitioner
+template <typename Index, typename Function>
+void parallel_for(Index first, Index last, Index step, const Function& f, const static_partitioner& partitioner, task_group_context &context) {
+ parallel_for_impl<Index,Function,const static_partitioner>(first, last, step, f, partitioner, context);
+}
+//! Parallel iteration over a range of integers with explicit step, task group context, and affinity partitioner
+template <typename Index, typename Function>
+void parallel_for(Index first, Index last, Index step, const Function& f, affinity_partitioner& partitioner, task_group_context &context) {
+ parallel_for_impl(first, last, step, f, partitioner, context);
+}
+
+//! Parallel iteration over a range of integers with a default step value, explicit task group context, and default partitioner
+template <typename Index, typename Function>
+void parallel_for(Index first, Index last, const Function& f, task_group_context &context) {
+ parallel_for_impl<Index,Function,const auto_partitioner>(first, last, static_cast<Index>(1), f, auto_partitioner(), context);
+}
+//! Parallel iteration over a range of integers with a default step value, explicit task group context, and simple partitioner
+template <typename Index, typename Function>
+void parallel_for(Index first, Index last, const Function& f, const simple_partitioner& partitioner, task_group_context &context) {
+ parallel_for_impl<Index,Function,const simple_partitioner>(first, last, static_cast<Index>(1), f, partitioner, context);
+}
+//! Parallel iteration over a range of integers with a default step value, explicit task group context, and auto partitioner
+template <typename Index, typename Function>
+void parallel_for(Index first, Index last, const Function& f, const auto_partitioner& partitioner, task_group_context &context) {
+ parallel_for_impl<Index,Function,const auto_partitioner>(first, last, static_cast<Index>(1), f, partitioner, context);
+}
+//! Parallel iteration over a range of integers with a default step value, explicit task group context, and static partitioner
+template <typename Index, typename Function>
+void parallel_for(Index first, Index last, const Function& f, const static_partitioner& partitioner, task_group_context &context) {
+ parallel_for_impl<Index,Function,const static_partitioner>(first, last, static_cast<Index>(1), f, partitioner, context);
+}
+//! Parallel iteration over a range of integers with a default step value, explicit task group context, and affinity_partitioner
+template <typename Index, typename Function>
+void parallel_for(Index first, Index last, const Function& f, affinity_partitioner& partitioner, task_group_context &context) {
+ parallel_for_impl(first, last, static_cast<Index>(1), f, partitioner, context);
+}
+// @}
+
+} // namespace d1
+} // namespace detail
+
+inline namespace v1 {
+using detail::d1::parallel_for;
+// Split types
+using detail::split;
+using detail::proportional_split;
+} // namespace v1
+
+} // namespace tbb
+
+#endif /* __TBB_parallel_for_H */
diff --git a/contrib/libs/tbb/include/oneapi/tbb/parallel_for_each.h b/contrib/libs/tbb/include/oneapi/tbb/parallel_for_each.h
new file mode 100644
index 0000000000..563e00f5fc
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/parallel_for_each.h
@@ -0,0 +1,644 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_parallel_for_each_H
+#define __TBB_parallel_for_each_H
+
+#include "detail/_config.h"
+#include "detail/_namespace_injection.h"
+#include "detail/_exception.h"
+#include "detail/_task.h"
+#include "detail/_aligned_space.h"
+#include "detail/_small_object_pool.h"
+
+#include "parallel_for.h"
+#include "task_group.h" // task_group_context
+
+#include <iterator>
+#include <type_traits>
+
+namespace tbb {
+namespace detail {
+namespace d2 {
+template<typename Body, typename Item> class feeder_impl;
+} // namespace d2
+
+namespace d1 {
+//! Class that the user-supplied algorithm body uses to add new work items
+template<typename Item>
+class feeder {
+ feeder() {}
+ feeder(const feeder&) = delete;
+ void operator=( const feeder&) = delete;
+
+ virtual ~feeder () {}
+ virtual void internal_add_copy(const Item& item) = 0;
+ virtual void internal_add_move(Item&& item) = 0;
+
+ template<typename Body_, typename Item_> friend class detail::d2::feeder_impl;
+public:
+ //! Add a work item to a running parallel_for_each.
+ void add(const Item& item) {internal_add_copy(item);}
+ void add(Item&& item) {internal_add_move(std::move(item));}
+};
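+
+/** Illustrative usage sketch: a Body that accepts a feeder may add new work items while
+    parallel_for_each is running; here processing a node feeds its children back into the
+    same invocation. The node type, the roots container, and process() are hypothetical.
+    \code
+    struct node { int value; std::vector<node*> children; };
+    std::vector<node*> roots; // populated elsewhere
+
+    tbb::parallel_for_each(roots.begin(), roots.end(),
+        [](node* n, tbb::feeder<node*>& feeder) {
+            process(n->value);          // per-node work
+            for (node* child : n->children)
+                feeder.add(child);      // schedule the children as new items
+        });
+    \endcode
+*/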
+
+} // namespace d1
+
+namespace d2 {
+using namespace tbb::detail::d1;
+/** Selects one of the two possible forms of function call member operator.
+ @ingroup algorithms **/
+template<class Body>
+struct parallel_for_each_operator_selector {
+public:
+ template<typename ItemArg, typename FeederArg>
+ static auto call(const Body& body, ItemArg&& item, FeederArg*)
+ -> decltype(body(std::forward<ItemArg>(item)), void()) {
+ #if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+ // Suppression of Microsoft non-standard extension warnings
+ #pragma warning (push)
+ #pragma warning (disable: 4239)
+ #endif
+
+ body(std::forward<ItemArg>(item));
+
+ #if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+    #pragma warning (pop)
+ #endif
+ }
+
+ template<typename ItemArg, typename FeederArg>
+ static auto call(const Body& body, ItemArg&& item, FeederArg* feeder)
+ -> decltype(body(std::forward<ItemArg>(item), *feeder), void()) {
+ #if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+ // Suppression of Microsoft non-standard extension warnings
+ #pragma warning (push)
+ #pragma warning (disable: 4239)
+ #endif
+ __TBB_ASSERT(feeder, "Feeder was not created but should be");
+
+ body(std::forward<ItemArg>(item), *feeder);
+
+ #if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+    #pragma warning (pop)
+ #endif
+ }
+};
+
+template<typename Body, typename Item>
+struct feeder_item_task: public task {
+ using feeder_type = feeder_impl<Body, Item>;
+
+ template <typename ItemType>
+ feeder_item_task(ItemType&& input_item, feeder_type& feeder, small_object_allocator& alloc) :
+ item(std::forward<ItemType>(input_item)),
+ my_feeder(feeder),
+ my_allocator(alloc)
+ {}
+
+ void finalize(const execution_data& ed) {
+ my_feeder.my_wait_context.release();
+ my_allocator.delete_object(this, ed);
+ }
+
+    //! Hack to resolve the ambiguity between calls to the body with and without moving the stored copy.
+    //! Executing the body with a moved copy should have higher priority.
+ using first_priority = int;
+ using second_priority = double;
+
+ template <typename BodyType, typename ItemType, typename FeederType>
+ static auto call(const BodyType& call_body, ItemType& call_item, FeederType& call_feeder, first_priority)
+ -> decltype(parallel_for_each_operator_selector<Body>::call(call_body, std::move(call_item), &call_feeder), void())
+ {
+ parallel_for_each_operator_selector<Body>::call(call_body, std::move(call_item), &call_feeder);
+ }
+
+ template <typename BodyType, typename ItemType, typename FeederType>
+ static void call(const BodyType& call_body, ItemType& call_item, FeederType& call_feeder, second_priority) {
+ parallel_for_each_operator_selector<Body>::call(call_body, call_item, &call_feeder);
+ }
+
+ task* execute(execution_data& ed) override {
+ call(my_feeder.my_body, item, my_feeder, first_priority{});
+ finalize(ed);
+ return nullptr;
+ }
+
+ task* cancel(execution_data& ed) override {
+ finalize(ed);
+ return nullptr;
+ }
+
+ Item item;
+ feeder_type& my_feeder;
+ small_object_allocator my_allocator;
+}; // class feeder_item_task
+
+/** Implements the procedure for adding new work items.
+ @ingroup algorithms **/
+template<typename Body, typename Item>
+class feeder_impl : public feeder<Item> {
+ // Avoiding use of copy constructor in a virtual method if the type does not support it
+ void internal_add_copy_impl(std::true_type, const Item& item) {
+ using feeder_task = feeder_item_task<Body, Item>;
+ small_object_allocator alloc;
+ auto task = alloc.new_object<feeder_task>(item, *this, alloc);
+
+ my_wait_context.reserve();
+ spawn(*task, my_execution_context);
+ }
+
+ void internal_add_copy_impl(std::false_type, const Item&) {
+ __TBB_ASSERT(false, "Overloading for r-value reference doesn't work or it's not movable and not copyable object");
+ }
+
+ void internal_add_copy(const Item& item) override {
+ internal_add_copy_impl(typename std::is_copy_constructible<Item>::type(), item);
+ }
+
+ void internal_add_move(Item&& item) override {
+ using feeder_task = feeder_item_task<Body, Item>;
+ small_object_allocator alloc{};
+ auto task = alloc.new_object<feeder_task>(std::move(item), *this, alloc);
+
+ my_wait_context.reserve();
+ spawn(*task, my_execution_context);
+ }
+public:
+ feeder_impl(const Body& body, wait_context& w_context, task_group_context &context)
+ : my_body(body),
+ my_wait_context(w_context)
+ , my_execution_context(context)
+ {}
+
+ const Body& my_body;
+ wait_context& my_wait_context;
+ task_group_context& my_execution_context;
+}; // class feeder_impl
+
+/** Executes the computation on one element of the range
+ @ingroup algorithms **/
+template<typename Iterator, typename Body, typename Item>
+struct for_each_iteration_task: public task {
+ using feeder_type = feeder_impl<Body, Item>;
+
+ for_each_iteration_task(Iterator input_item_ptr, const Body& body, feeder_impl<Body, Item>* feeder_ptr, wait_context& wait_context) :
+ item_ptr(input_item_ptr), my_body(body), my_feeder_ptr(feeder_ptr), parent_wait_context(wait_context)
+ {}
+
+ void finalize() {
+ parent_wait_context.release();
+ }
+
+ task* execute(execution_data&) override {
+ parallel_for_each_operator_selector<Body>::call(my_body, *item_ptr, my_feeder_ptr);
+ finalize();
+ return nullptr;
+ }
+
+ task* cancel(execution_data&) override {
+ finalize();
+ return nullptr;
+ }
+
+ Iterator item_ptr;
+ const Body& my_body;
+ feeder_impl<Body, Item>* my_feeder_ptr;
+ wait_context& parent_wait_context;
+}; // class for_each_iteration_task
+
+// Helper to get the type of the iterator to the internal sequence of copies
+// If the element can be passed to the body as an rvalue, this iterator should be a move_iterator
+template <typename Body, typename Item, typename = void>
+struct input_iteration_task_iterator_helper {
+ // For input iterators we pass const lvalue reference to the body
+ // It is prohibited to take non-constant lvalue references for input iterators
+ using type = const Item*;
+};
+
+template <typename Body, typename Item>
+struct input_iteration_task_iterator_helper<Body, Item,
+ tbb::detail::void_t<decltype(parallel_for_each_operator_selector<Body>::call(std::declval<const Body&>(),
+ std::declval<Item&&>(),
+ std::declval<feeder_impl<Body, Item>*>()))>>
+{
+ using type = std::move_iterator<Item*>;
+};
+
+/** Splits one block task into several (at most max_block_size) iteration tasks for input iterators
+ @ingroup algorithms **/
+template <typename Body, typename Item>
+struct input_block_handling_task : public task {
+ static constexpr size_t max_block_size = 4;
+
+ using feeder_type = feeder_impl<Body, Item>;
+ using iteration_task_iterator_type = typename input_iteration_task_iterator_helper<Body, Item>::type;
+ using iteration_task = for_each_iteration_task<iteration_task_iterator_type, Body, Item>;
+
+ input_block_handling_task(wait_context& root_wait_context, task_group_context& e_context,
+ const Body& body, feeder_impl<Body, Item>* feeder_ptr, small_object_allocator& alloc)
+ :my_size(0), my_wait_context(0), my_root_wait_context(root_wait_context),
+ my_execution_context(e_context), my_allocator(alloc)
+ {
+ auto item_it = block_iteration_space.begin();
+ for (auto* it = task_pool.begin(); it != task_pool.end(); ++it) {
+ new (it) iteration_task(iteration_task_iterator_type(item_it++), body, feeder_ptr, my_wait_context);
+ }
+ }
+
+ void finalize(const execution_data& ed) {
+ my_root_wait_context.release();
+ my_allocator.delete_object(this, ed);
+ }
+
+ task* execute(execution_data& ed) override {
+ __TBB_ASSERT( my_size > 0, "Negative size was passed to task");
+ for (std::size_t counter = 1; counter < my_size; ++counter) {
+ my_wait_context.reserve();
+ spawn(*(task_pool.begin() + counter), my_execution_context);
+ }
+ my_wait_context.reserve();
+ execute_and_wait(*task_pool.begin(), my_execution_context,
+ my_wait_context, my_execution_context);
+
+ // deallocate current task after children execution
+ finalize(ed);
+ return nullptr;
+ }
+
+ task* cancel(execution_data& ed) override {
+ finalize(ed);
+ return nullptr;
+ }
+
+ ~input_block_handling_task() {
+ for(std::size_t counter = 0; counter < max_block_size; ++counter) {
+ (task_pool.begin() + counter)->~iteration_task();
+ (block_iteration_space.begin() + counter)->~Item();
+ }
+ }
+
+ aligned_space<Item, max_block_size> block_iteration_space;
+ aligned_space<iteration_task, max_block_size> task_pool;
+ std::size_t my_size;
+ wait_context my_wait_context;
+ wait_context& my_root_wait_context;
+ task_group_context& my_execution_context;
+ small_object_allocator my_allocator;
+}; // class input_block_handling_task
+
+/** Splits one block task into several (at most max_block_size) iteration tasks for forward iterators
+ @ingroup algorithms **/
+template <typename Iterator, typename Body, typename Item>
+struct forward_block_handling_task : public task {
+ static constexpr size_t max_block_size = 4;
+
+ using iteration_task = for_each_iteration_task<Iterator, Body, Item>;
+
+ forward_block_handling_task(Iterator first, std::size_t size,
+ wait_context& w_context, task_group_context& e_context,
+ const Body& body, feeder_impl<Body, Item>* feeder_ptr,
+ small_object_allocator& alloc)
+ : my_size(size), my_wait_context(0), my_root_wait_context(w_context),
+ my_execution_context(e_context), my_allocator(alloc)
+ {
+ auto* task_it = task_pool.begin();
+ for (std::size_t i = 0; i < size; i++) {
+ new (task_it++) iteration_task(first, body, feeder_ptr, my_wait_context);
+ ++first;
+ }
+ }
+
+ void finalize(const execution_data& ed) {
+ my_root_wait_context.release();
+ my_allocator.delete_object(this, ed);
+ }
+
+ task* execute(execution_data& ed) override {
+ __TBB_ASSERT( my_size > 0, "Negative size was passed to task");
+ for(std::size_t counter = 1; counter < my_size; ++counter) {
+ my_wait_context.reserve();
+ spawn(*(task_pool.begin() + counter), my_execution_context);
+ }
+ my_wait_context.reserve();
+ execute_and_wait(*task_pool.begin(), my_execution_context,
+ my_wait_context, my_execution_context);
+
+ // deallocate current task after children execution
+ finalize(ed);
+ return nullptr;
+ }
+
+ task* cancel(execution_data& ed) override {
+ finalize(ed);
+ return nullptr;
+ }
+
+ ~forward_block_handling_task() {
+ for(std::size_t counter = 0; counter < my_size; ++counter) {
+ (task_pool.begin() + counter)->~iteration_task();
+ }
+ }
+
+ aligned_space<iteration_task, max_block_size> task_pool;
+ std::size_t my_size;
+ wait_context my_wait_context;
+ wait_context& my_root_wait_context;
+ task_group_context& my_execution_context;
+ small_object_allocator my_allocator;
+}; // class forward_block_handling_task
+
+/** Body for the parallel_for algorithm.
+ * Redirects operations over a random access iterator range to the parallel_for algorithm.
+ @ingroup algorithms **/
+template <typename Iterator, typename Body, typename Item>
+class parallel_for_body_wrapper {
+ Iterator my_first;
+ const Body& my_body;
+ feeder_impl<Body, Item>* my_feeder_ptr;
+public:
+ parallel_for_body_wrapper(Iterator first, const Body& body, feeder_impl<Body, Item>* feeder_ptr)
+ : my_first(first), my_body(body), my_feeder_ptr(feeder_ptr) {}
+
+ void operator()(tbb::blocked_range<std::size_t> range) const {
+#if __INTEL_COMPILER
+#pragma ivdep
+#endif
+ for (std::size_t count = range.begin(); count != range.end(); count++) {
+ parallel_for_each_operator_selector<Body>::call(my_body, *(my_first + count),
+ my_feeder_ptr);
+ }
+ }
+}; // class parallel_for_body_wrapper
+
+
+/** Helper for getting the iterator tag, including custom tags derived from the standard ones
+ @ingroup algorithms */
+template<typename It>
+using tag = typename std::iterator_traits<It>::iterator_category;
+
+template<typename It>
+using iterator_tag_dispatch = typename
+ std::conditional<
+ std::is_base_of<std::random_access_iterator_tag, tag<It>>::value,
+ std::random_access_iterator_tag,
+ typename std::conditional<
+ std::is_base_of<std::forward_iterator_tag, tag<It>>::value,
+ std::forward_iterator_tag,
+ std::input_iterator_tag
+ >::type
+ >::type;
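+
+/** Illustrative note: the dispatch collapses any (possibly user-defined) iterator category onto
+    one of the three tags handled by the root task specializations below, e.g.
+    \code
+    static_assert(std::is_same<iterator_tag_dispatch<std::vector<int>::iterator>,
+                               std::random_access_iterator_tag>::value, "");
+    static_assert(std::is_same<iterator_tag_dispatch<std::istream_iterator<int>>,
+                               std::input_iterator_tag>::value, "");
+    \endcode
+*/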
+
+template <typename Body, typename Iterator, typename Item>
+using feeder_is_required = tbb::detail::void_t<decltype(std::declval<const Body>()(std::declval<typename std::iterator_traits<Iterator>::reference>(),
+ std::declval<feeder<Item>&>()))>;
+
+// Creates feeder object only if the body can accept it
+template <typename Iterator, typename Body, typename Item, typename = void>
+struct feeder_holder {
+ feeder_holder( wait_context&, task_group_context&, const Body& ) {}
+
+ feeder_impl<Body, Item>* feeder_ptr() { return nullptr; }
+}; // class feeder_holder
+
+template <typename Iterator, typename Body, typename Item>
+class feeder_holder<Iterator, Body, Item, feeder_is_required<Body, Iterator, Item>> {
+public:
+ feeder_holder( wait_context& w_context, task_group_context& context, const Body& body )
+ : my_feeder(body, w_context, context) {}
+
+ feeder_impl<Body, Item>* feeder_ptr() { return &my_feeder; }
+private:
+ feeder_impl<Body, Item> my_feeder;
+}; // class feeder_holder
+
+template <typename Iterator, typename Body, typename Item>
+class for_each_root_task_base : public task {
+public:
+ for_each_root_task_base(Iterator first, Iterator last, const Body& body, wait_context& w_context, task_group_context& e_context)
+ : my_first(first), my_last(last), my_wait_context(w_context), my_execution_context(e_context),
+ my_body(body), my_feeder_holder(my_wait_context, my_execution_context, my_body)
+ {
+ my_wait_context.reserve();
+ }
+private:
+ task* cancel(execution_data&) override {
+ this->my_wait_context.release();
+ return nullptr;
+ }
+protected:
+ Iterator my_first;
+ Iterator my_last;
+ wait_context& my_wait_context;
+ task_group_context& my_execution_context;
+ const Body& my_body;
+ feeder_holder<Iterator, Body, Item> my_feeder_holder;
+}; // class for_each_root_task_base
+
+/** parallel_for_each algorithm root task - most generic version
+ * Splits the input range into blocks
+ @ingroup algorithms **/
+template <typename Iterator, typename Body, typename Item, typename IteratorTag = iterator_tag_dispatch<Iterator>>
+class for_each_root_task : public for_each_root_task_base<Iterator, Body, Item>
+{
+ using base_type = for_each_root_task_base<Iterator, Body, Item>;
+public:
+ using base_type::base_type;
+private:
+ task* execute(execution_data& ed) override {
+ using block_handling_type = input_block_handling_task<Body, Item>;
+
+ if (this->my_first == this->my_last) {
+ this->my_wait_context.release();
+ return nullptr;
+ }
+
+ this->my_wait_context.reserve();
+ small_object_allocator alloc{};
+ auto block_handling_task = alloc.new_object<block_handling_type>(ed, this->my_wait_context, this->my_execution_context,
+ this->my_body, this->my_feeder_holder.feeder_ptr(),
+ alloc);
+
+ auto* block_iterator = block_handling_task->block_iteration_space.begin();
+ for (; !(this->my_first == this->my_last) && block_handling_task->my_size < block_handling_type::max_block_size; ++this->my_first) {
+ // Move semantics are automatically used when supported by the iterator
+ new (block_iterator++) Item(*this->my_first);
+ ++block_handling_task->my_size;
+ }
+
+ // Do not access this after spawn to avoid races
+ spawn(*this, this->my_execution_context);
+ return block_handling_task;
+ }
+}; // class for_each_root_task - most generic implementation
+
+/** parallel_for_each algorithm root task - forward iterator based specialization
+ * Splits the input range into blocks
+ @ingroup algorithms **/
+template <typename Iterator, typename Body, typename Item>
+class for_each_root_task<Iterator, Body, Item, std::forward_iterator_tag>
+ : public for_each_root_task_base<Iterator, Body, Item>
+{
+ using base_type = for_each_root_task_base<Iterator, Body, Item>;
+public:
+ using base_type::base_type;
+private:
+ task* execute(execution_data& ed) override {
+ using block_handling_type = forward_block_handling_task<Iterator, Body, Item>;
+ if (this->my_first == this->my_last) {
+ this->my_wait_context.release();
+ return nullptr;
+ }
+
+ std::size_t block_size{0};
+ Iterator first_block_element = this->my_first;
+ for (; !(this->my_first == this->my_last) && block_size < block_handling_type::max_block_size; ++this->my_first) {
+ ++block_size;
+ }
+
+ this->my_wait_context.reserve();
+ small_object_allocator alloc{};
+ auto block_handling_task = alloc.new_object<block_handling_type>(ed, first_block_element, block_size,
+ this->my_wait_context, this->my_execution_context,
+ this->my_body, this->my_feeder_holder.feeder_ptr(), alloc);
+
+ // Do not access this after spawn to avoid races
+ spawn(*this, this->my_execution_context);
+ return block_handling_task;
+ }
+}; // class for_each_root_task - forward iterator based specialization
+
+/** parallel_for_each algorithm root task - random access iterator based specialization
+ * Splits the input range into blocks
+ @ingroup algorithms **/
+template <typename Iterator, typename Body, typename Item>
+class for_each_root_task<Iterator, Body, Item, std::random_access_iterator_tag>
+ : public for_each_root_task_base<Iterator, Body, Item>
+{
+ using base_type = for_each_root_task_base<Iterator, Body, Item>;
+public:
+ using base_type::base_type;
+private:
+ task* execute(execution_data&) override {
+ tbb::parallel_for(
+ tbb::blocked_range<std::size_t>(0, std::distance(this->my_first, this->my_last)),
+ parallel_for_body_wrapper<Iterator, Body, Item>(this->my_first, this->my_body, this->my_feeder_holder.feeder_ptr())
+ , this->my_execution_context
+ );
+
+ this->my_wait_context.release();
+ return nullptr;
+ }
+}; // class for_each_root_task - random access iterator based specialization
+
+/** Helper for getting the item type. If the item type can be deduced from the feeder, it is taken
+    from the feeder; otherwise it is taken from the range.
+ @ingroup algorithms */
+template<typename Body, typename Item, typename FeederArg>
+auto feeder_argument_parser(void (Body::*)(Item, feeder<FeederArg>&) const) -> FeederArg;
+
+template<typename Body, typename>
+decltype(feeder_argument_parser<Body>(&Body::operator())) get_item_type_impl(int); // for (T, feeder<T>)
+template<typename Body, typename Item> Item get_item_type_impl(...); // stub
+
+template <typename Body, typename Item>
+using get_item_type = decltype(get_item_type_impl<Body, Item>(0));
+
+/** Implements parallel iteration over a range.
+ @ingroup algorithms */
+template<typename Iterator, typename Body>
+void run_parallel_for_each( Iterator first, Iterator last, const Body& body, task_group_context& context)
+{
+ if (!(first == last)) {
+ using ItemType = get_item_type<Body, typename std::iterator_traits<Iterator>::value_type>;
+ wait_context w_context(0);
+
+ for_each_root_task<Iterator, Body, ItemType> root_task(first, last, body, w_context, context);
+
+ execute_and_wait(root_task, context, w_context, context);
+ }
+}
+
+/** \page parallel_for_each_body_req Requirements on parallel_for_each body
+ Class \c Body implementing the concept of parallel_for_each body must define:
+ - \code
+ B::operator()(
+ cv_item_type item,
+ feeder<item_type>& feeder
+ ) const
+
+ OR
+
+ B::operator()( cv_item_type& item ) const
+ \endcode Process item.
+ May be invoked concurrently for the same \c this but different \c item.
+
+ - \code item_type( const item_type& ) \endcode
+ Copy a work item.
+ - \code ~item_type() \endcode Destroy a work item
+**/
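+
+/** Illustrative usage sketch: the simplest Body takes only the item; both the iterator-based
+    and the range-based overloads accept it. The container values and process() are hypothetical.
+    \code
+    std::vector<int> values = {1, 2, 3, 4};
+    tbb::parallel_for_each(values, [](int x) { process(x); });
+    \endcode
+*/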
+
+/** \name parallel_for_each
+ See also requirements on \ref parallel_for_each_body_req "parallel_for_each Body". **/
+//@{
+//! Parallel iteration over a range, with optional addition of more work.
+/** @ingroup algorithms */
+template<typename Iterator, typename Body>
+void parallel_for_each(Iterator first, Iterator last, const Body& body) {
+ task_group_context context(PARALLEL_FOR_EACH);
+ run_parallel_for_each<Iterator, Body>(first, last, body, context);
+}
+
+template<typename Range, typename Body>
+void parallel_for_each(Range& rng, const Body& body) {
+ parallel_for_each(std::begin(rng), std::end(rng), body);
+}
+
+template<typename Range, typename Body>
+void parallel_for_each(const Range& rng, const Body& body) {
+ parallel_for_each(std::begin(rng), std::end(rng), body);
+}
+
+//! Parallel iteration over a range, with optional addition of more work and user-supplied context
+/** @ingroup algorithms */
+template<typename Iterator, typename Body>
+void parallel_for_each(Iterator first, Iterator last, const Body& body, task_group_context& context) {
+ run_parallel_for_each<Iterator, Body>(first, last, body, context);
+}
+
+template<typename Range, typename Body>
+void parallel_for_each(Range& rng, const Body& body, task_group_context& context) {
+ parallel_for_each(std::begin(rng), std::end(rng), body, context);
+}
+
+template<typename Range, typename Body>
+void parallel_for_each(const Range& rng, const Body& body, task_group_context& context) {
+ parallel_for_each(std::begin(rng), std::end(rng), body, context);
+}
+
+} // namespace d2
+} // namespace detail
+//! @endcond
+//@}
+
+inline namespace v1 {
+using detail::d2::parallel_for_each;
+using detail::d1::feeder;
+} // namespace v1
+
+} // namespace tbb
+
+#endif /* __TBB_parallel_for_each_H */
diff --git a/contrib/libs/tbb/include/oneapi/tbb/parallel_invoke.h b/contrib/libs/tbb/include/oneapi/tbb/parallel_invoke.h
new file mode 100644
index 0000000000..6eb0f2e530
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/parallel_invoke.h
@@ -0,0 +1,227 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_parallel_invoke_H
+#define __TBB_parallel_invoke_H
+
+#include "detail/_config.h"
+#include "detail/_namespace_injection.h"
+#include "detail/_exception.h"
+#include "detail/_task.h"
+#include "detail/_template_helpers.h"
+#include "detail/_small_object_pool.h"
+
+#include "task_group.h"
+
+#include <tuple>
+#include <atomic>
+#include <utility>
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+//! Simple task object, executing user method
+template<typename Function, typename WaitObject>
+struct function_invoker : public task {
+ function_invoker(const Function& function, WaitObject& wait_ctx) :
+ my_function(function),
+ parent_wait_ctx(wait_ctx)
+ {}
+
+ task* execute(execution_data& ed) override {
+ my_function();
+ parent_wait_ctx.release(ed);
+ call_itt_task_notify(destroy, this);
+ return nullptr;
+ }
+
+ task* cancel(execution_data& ed) override {
+ parent_wait_ctx.release(ed);
+ return nullptr;
+ }
+
+ const Function& my_function;
+ WaitObject& parent_wait_ctx;
+}; // struct function_invoker
+
+//! Task object for managing subroots in ternary task trees.
+// Endowed with additional synchronization logic (compatible with wait object interfaces) to support
+// continuation-passing execution. This task spawns two function_invoker tasks for the second and third
+// functors and then executes the first functor itself. Only the functor that finishes last destroys
+// and deallocates the subroot task.
+template<typename F1, typename F2, typename F3>
+struct invoke_subroot_task : public task {
+ wait_context& root_wait_ctx;
+ std::atomic<unsigned> ref_count{0};
+ bool child_spawned = false;
+
+ const F1& self_invoked_functor;
+ function_invoker<F2, invoke_subroot_task<F1, F2, F3>> f2_invoker;
+ function_invoker<F3, invoke_subroot_task<F1, F2, F3>> f3_invoker;
+
+ task_group_context& my_execution_context;
+ small_object_allocator my_allocator;
+
+ invoke_subroot_task(const F1& f1, const F2& f2, const F3& f3, wait_context& wait_ctx, task_group_context& context,
+ small_object_allocator& alloc) :
+ root_wait_ctx(wait_ctx),
+ self_invoked_functor(f1),
+ f2_invoker(f2, *this),
+ f3_invoker(f3, *this),
+ my_execution_context(context),
+ my_allocator(alloc)
+ {
+ root_wait_ctx.reserve();
+ }
+
+ void finalize(const execution_data& ed) {
+ root_wait_ctx.release();
+
+ my_allocator.delete_object(this, ed);
+ }
+
+ void release(const execution_data& ed) {
+ __TBB_ASSERT(ref_count > 0, nullptr);
+ call_itt_task_notify(releasing, this);
+ if( --ref_count == 0 ) {
+ call_itt_task_notify(acquired, this);
+ finalize(ed);
+ }
+ }
+
+ task* execute(execution_data& ed) override {
+ ref_count.fetch_add(3, std::memory_order_relaxed);
+ spawn(f3_invoker, my_execution_context);
+ spawn(f2_invoker, my_execution_context);
+ self_invoked_functor();
+
+ release(ed);
+ return nullptr;
+ }
+
+ task* cancel(execution_data& ed) override {
+ if( ref_count > 0 ) { // detect children spawn
+ release(ed);
+ } else {
+ finalize(ed);
+ }
+ return nullptr;
+ }
+}; // struct invoke_subroot_task
+
+class invoke_root_task {
+public:
+ invoke_root_task(wait_context& wc) : my_wait_context(wc) {}
+ void release(const execution_data&) {
+ my_wait_context.release();
+ }
+private:
+ wait_context& my_wait_context;
+};
+
+template<typename F1>
+void invoke_recursive_separation(wait_context& root_wait_ctx, task_group_context& context, const F1& f1) {
+ root_wait_ctx.reserve(1);
+ invoke_root_task root(root_wait_ctx);
+ function_invoker<F1, invoke_root_task> invoker1(f1, root);
+
+ execute_and_wait(invoker1, context, root_wait_ctx, context);
+}
+
+template<typename F1, typename F2>
+void invoke_recursive_separation(wait_context& root_wait_ctx, task_group_context& context, const F1& f1, const F2& f2) {
+ root_wait_ctx.reserve(2);
+ invoke_root_task root(root_wait_ctx);
+ function_invoker<F1, invoke_root_task> invoker1(f1, root);
+ function_invoker<F2, invoke_root_task> invoker2(f2, root);
+
+ spawn(invoker1, context);
+ execute_and_wait(invoker2, context, root_wait_ctx, context);
+}
+
+template<typename F1, typename F2, typename F3>
+void invoke_recursive_separation(wait_context& root_wait_ctx, task_group_context& context, const F1& f1, const F2& f2, const F3& f3) {
+ root_wait_ctx.reserve(3);
+ invoke_root_task root(root_wait_ctx);
+ function_invoker<F1, invoke_root_task> invoker1(f1, root);
+ function_invoker<F2, invoke_root_task> invoker2(f2, root);
+ function_invoker<F3, invoke_root_task> invoker3(f3, root);
+
+ //TODO: implement sub root for two tasks (measure performance)
+ spawn(invoker1, context);
+ spawn(invoker2, context);
+ execute_and_wait(invoker3, context, root_wait_ctx, context);
+}
+
+template<typename F1, typename F2, typename F3, typename... Fs>
+void invoke_recursive_separation(wait_context& root_wait_ctx, task_group_context& context,
+ const F1& f1, const F2& f2, const F3& f3, const Fs&... fs) {
+ small_object_allocator alloc{};
+ auto sub_root = alloc.new_object<invoke_subroot_task<F1, F2, F3>>(f1, f2, f3, root_wait_ctx, context, alloc);
+ spawn(*sub_root, context);
+
+ invoke_recursive_separation(root_wait_ctx, context, fs...);
+}
+
+template<typename... Fs>
+void parallel_invoke_impl(task_group_context& context, const Fs&... fs) {
+ static_assert(sizeof...(Fs) >= 2, "parallel_invoke must be called with at least two callables");
+ wait_context root_wait_ctx{0};
+
+ invoke_recursive_separation(root_wait_ctx, context, fs...);
+}
+
+template<typename F1, typename... Fs>
+void parallel_invoke_impl(const F1& f1, const Fs&... fs) {
+ static_assert(sizeof...(Fs) >= 1, "parallel_invoke must be called with at least two callables");
+ task_group_context context(PARALLEL_INVOKE);
+ wait_context root_wait_ctx{0};
+
+ invoke_recursive_separation(root_wait_ctx, context, fs..., f1);
+}
+
+//! Passes the last argument of the variadic pack as the first one, so that a user-provided task_group_context can be handled
+template <typename Tuple, typename... Fs>
+struct invoke_helper;
+
+template <typename... Args, typename T, typename... Fs>
+struct invoke_helper<std::tuple<Args...>, T, Fs...> : invoke_helper<std::tuple<Args..., T>, Fs...> {};
+
+template <typename... Fs, typename T/*task_group_context or callable*/>
+struct invoke_helper<std::tuple<Fs...>, T> {
+ void operator()(Fs&&... args, T&& t) {
+ parallel_invoke_impl(std::forward<T>(t), std::forward<Fs>(args)...);
+ }
+};
+
+//! Parallel execution of several function objects
+// The parameter pack is taken by forwarding reference because it may contain a
+// task_group_context, which must be passed as a non-const lvalue reference.
+template<typename... Fs>
+void parallel_invoke(Fs&&... fs) {
+ invoke_helper<std::tuple<>, Fs...>()(std::forward<Fs>(fs)...);
+}
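+
+//! Illustrative usage sketch based on the declarations above: parallel_invoke runs
+//! several callables concurrently; a user-provided task_group_context, if any, is
+//! passed as the trailing argument. The lambda bodies and variables are placeholders.
+/** \code
+    int a = 0, b = 0, c = 0;   // placeholder data
+    tbb::parallel_invoke(
+        [&] { a = 1; },
+        [&] { b = 2; },
+        [&] { c = 3; }
+    );
+
+    // With an explicit context (it must be the last argument):
+    tbb::task_group_context ctx;
+    tbb::parallel_invoke([&] { a += b; }, [&] { c += b; }, ctx);
+    \endcode
+**/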
+
+} // namespace d1
+} // namespace detail
+
+inline namespace v1 {
+using detail::d1::parallel_invoke;
+} // namespace v1
+
+} // namespace tbb
+#endif /* __TBB_parallel_invoke_H */
diff --git a/contrib/libs/tbb/include/oneapi/tbb/parallel_pipeline.h b/contrib/libs/tbb/include/oneapi/tbb/parallel_pipeline.h
new file mode 100644
index 0000000000..87a159c925
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/parallel_pipeline.h
@@ -0,0 +1,153 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_parallel_pipeline_H
+#define __TBB_parallel_pipeline_H
+
+#include "detail/_pipeline_filters.h"
+#include "detail/_config.h"
+#include "detail/_namespace_injection.h"
+#include "task_group.h"
+
+#include <cstddef>
+#include <atomic>
+#include <type_traits>
+
+namespace tbb {
+namespace detail {
+
+namespace r1 {
+void __TBB_EXPORTED_FUNC parallel_pipeline(task_group_context&, std::size_t, const d1::filter_node&);
+}
+
+namespace d1 {
+
+enum class filter_mode : unsigned int
+{
+ //! processes multiple items in parallel and in no particular order
+ parallel = base_filter::filter_is_out_of_order,
+ //! processes items one at a time; all such filters process items in the same order
+ serial_in_order = base_filter::filter_is_serial,
+ //! processes items one at a time and in no particular order
+ serial_out_of_order = base_filter::filter_is_serial | base_filter::filter_is_out_of_order
+};
+//! Class representing a chain of type-safe pipeline filters
+/** @ingroup algorithms */
+template<typename InputType, typename OutputType>
+class filter {
+ filter_node_ptr my_root;
+ filter( filter_node_ptr root ) : my_root(root) {}
+ friend void parallel_pipeline( size_t, const filter<void,void>&, task_group_context& );
+ template<typename T_, typename U_, typename Body>
+ friend filter<T_,U_> make_filter( filter_mode, const Body& );
+ template<typename T_, typename V_, typename U_>
+ friend filter<T_,U_> operator&( const filter<T_,V_>&, const filter<V_,U_>& );
+public:
+ filter() = default;
+ filter( const filter& rhs ) : my_root(rhs.my_root) {}
+ filter( filter&& rhs ) : my_root(std::move(rhs.my_root)) {}
+
+ void operator=(const filter& rhs) {
+ my_root = rhs.my_root;
+ }
+ void operator=( filter&& rhs ) {
+ my_root = std::move(rhs.my_root);
+ }
+
+ template<typename Body>
+ filter( filter_mode mode, const Body& body ) :
+ my_root( new(r1::allocate_memory(sizeof(filter_node_leaf<InputType, OutputType, Body>)))
+ filter_node_leaf<InputType, OutputType, Body>(static_cast<unsigned int>(mode), body) ) {
+ }
+
+ filter& operator&=( const filter<OutputType,OutputType>& right ) {
+ *this = *this & right;
+ return *this;
+ }
+
+ void clear() {
+ // Like operator= with filter() on right side.
+ my_root = nullptr;
+ }
+};
+
+//! Create a filter to participate in parallel_pipeline
+/** @ingroup algorithms */
+template<typename InputType, typename OutputType, typename Body>
+filter<InputType, OutputType> make_filter( filter_mode mode, const Body& body ) {
+ return filter_node_ptr( new(r1::allocate_memory(sizeof(filter_node_leaf<InputType, OutputType, Body>)))
+ filter_node_leaf<InputType, OutputType, Body>(static_cast<unsigned int>(mode), body) );
+}
+
+//! Create a filter to participate in parallel_pipeline
+/** @ingroup algorithms */
+template<typename Body>
+filter<filter_input<Body>, filter_output<Body>> make_filter( filter_mode mode, const Body& body ) {
+ return make_filter<filter_input<Body>, filter_output<Body>>(mode, body);
+}
+
+//! Composition of filters left and right.
+/** @ingroup algorithms */
+template<typename T, typename V, typename U>
+filter<T,U> operator&( const filter<T,V>& left, const filter<V,U>& right ) {
+ __TBB_ASSERT(left.my_root,"cannot use default-constructed filter as left argument of '&'");
+ __TBB_ASSERT(right.my_root,"cannot use default-constructed filter as right argument of '&'");
+ return filter_node_ptr( new (r1::allocate_memory(sizeof(filter_node))) filter_node(left.my_root,right.my_root) );
+}
+
+#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
+template<typename Body>
+filter(filter_mode, Body)
+->filter<filter_input<Body>, filter_output<Body>>;
+#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
+
+//! Parallel pipeline over chain of filters with user-supplied context.
+/** @ingroup algorithms **/
+inline void parallel_pipeline(size_t max_number_of_live_tokens, const filter<void,void>& filter_chain, task_group_context& context) {
+ r1::parallel_pipeline(context, max_number_of_live_tokens, *filter_chain.my_root);
+}
+
+//! Parallel pipeline over chain of filters.
+/** @ingroup algorithms **/
+inline void parallel_pipeline(size_t max_number_of_live_tokens, const filter<void,void>& filter_chain) {
+ task_group_context context;
+ parallel_pipeline(max_number_of_live_tokens, filter_chain, context);
+}
+
+//! Parallel pipeline over sequence of filters.
+/** @ingroup algorithms **/
+template<typename F1, typename F2, typename... FiltersContext>
+void parallel_pipeline(size_t max_number_of_live_tokens,
+ const F1& filter1,
+ const F2& filter2,
+ FiltersContext&&... filters) {
+ parallel_pipeline(max_number_of_live_tokens, filter1 & filter2, std::forward<FiltersContext>(filters)...);
+}
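+
+//! Illustrative usage sketch based on the declarations above: a three-stage pipeline
+//! that generates integers, squares them in parallel, and accumulates the results.
+//! The item limit, token count, and accumulation logic are placeholders.
+/** \code
+    long sum = 0;
+    int count = 0;             // placeholder input source
+    tbb::parallel_pipeline(
+        8, // max_number_of_live_tokens
+        tbb::make_filter<void, int>(tbb::filter_mode::serial_in_order,
+            [&](tbb::flow_control& fc) -> int {
+                if (count >= 1000) { fc.stop(); return 0; }
+                return count++;
+            }) &
+        tbb::make_filter<int, long>(tbb::filter_mode::parallel,
+            [](int x) -> long { return long(x) * x; }) &
+        tbb::make_filter<long, void>(tbb::filter_mode::serial_in_order,
+            [&](long x) { sum += x; })
+    );
+    \endcode
+**/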
+
+} // namespace d1
+} // namespace detail
+
+inline namespace v1
+{
+using detail::d1::parallel_pipeline;
+using detail::d1::filter;
+using detail::d1::make_filter;
+using detail::d1::filter_mode;
+using detail::d1::flow_control;
+}
+} // tbb
+
+#endif /* __TBB_parallel_pipeline_H */
diff --git a/contrib/libs/tbb/include/oneapi/tbb/parallel_reduce.h b/contrib/libs/tbb/include/oneapi/tbb/parallel_reduce.h
new file mode 100644
index 0000000000..6db6369d68
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/parallel_reduce.h
@@ -0,0 +1,689 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_parallel_reduce_H
+#define __TBB_parallel_reduce_H
+
+#include <new>
+#include "detail/_namespace_injection.h"
+#include "detail/_task.h"
+#include "detail/_aligned_space.h"
+#include "detail/_small_object_pool.h"
+
+#include "task_group.h" // task_group_context
+#include "partitioner.h"
+#include "profiling.h"
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+//! Tree node type for parallel_reduce.
+/** @ingroup algorithms */
+//TODO: consider folding the tree via bypass execution (instead of manual folding)
+// for better cancellation and critical task handling (performance measurements required).
+template<typename Body>
+struct reduction_tree_node : public tree_node {
+ tbb::detail::aligned_space<Body> zombie_space;
+ Body& left_body;
+ bool has_right_zombie{false};
+
+ reduction_tree_node(node* parent, int ref_count, Body& input_left_body, small_object_allocator& alloc) :
+ tree_node{parent, ref_count, alloc},
+ left_body(input_left_body) /* gcc4.8 bug - braced-initialization doesn't work for class members of reference type */
+ {}
+
+ void join(task_group_context* context) {
+ if (has_right_zombie && !context->is_group_execution_cancelled())
+ left_body.join(*zombie_space.begin());
+ }
+
+ ~reduction_tree_node() {
+ if( has_right_zombie ) zombie_space.begin()->~Body();
+ }
+};
+
+//! Task type used to split the work of parallel_reduce.
+/** @ingroup algorithms */
+template<typename Range, typename Body, typename Partitioner>
+struct start_reduce : public task {
+ Range my_range;
+ Body* my_body;
+ node* my_parent;
+
+ typename Partitioner::task_partition_type my_partition;
+ small_object_allocator my_allocator;
+ bool is_right_child;
+
+ task* execute(execution_data&) override;
+ task* cancel(execution_data&) override;
+ void finalize(const execution_data&);
+
+ using tree_node_type = reduction_tree_node<Body>;
+
+ //! Constructs the root task of the reduction.
+ start_reduce( const Range& range, Body& body, Partitioner& partitioner, small_object_allocator& alloc ) :
+ my_range(range),
+ my_body(&body),
+ my_partition(partitioner),
+ my_allocator(alloc),
+ is_right_child(false) {}
+ //! Splitting constructor used to generate children.
+ /** parent_ becomes left child. Newly constructed object is right child. */
+ start_reduce( start_reduce& parent_, typename Partitioner::split_type& split_obj, small_object_allocator& alloc ) :
+ my_range(parent_.my_range, get_range_split_object<Range>(split_obj)),
+ my_body(parent_.my_body),
+ my_partition(parent_.my_partition, split_obj),
+ my_allocator(alloc),
+ is_right_child(true)
+ {
+ parent_.is_right_child = false;
+ }
+ //! Constructs the right child from the given range in response to a demand.
+ /** parent_ remains left child. Newly constructed object is right child. */
+ start_reduce( start_reduce& parent_, const Range& r, depth_t d, small_object_allocator& alloc ) :
+ my_range(r),
+ my_body(parent_.my_body),
+ my_partition(parent_.my_partition, split()),
+ my_allocator(alloc),
+ is_right_child(true)
+ {
+ my_partition.align_depth( d );
+ parent_.is_right_child = false;
+ }
+ static void run(const Range& range, Body& body, Partitioner& partitioner, task_group_context& context) {
+ if ( !range.empty() ) {
+ wait_node wn;
+ small_object_allocator alloc{};
+ auto reduce_task = alloc.new_object<start_reduce>(range, body, partitioner, alloc);
+ reduce_task->my_parent = &wn;
+ execute_and_wait(*reduce_task, context, wn.m_wait, context);
+ }
+ }
+ static void run(const Range& range, Body& body, Partitioner& partitioner) {
+ // A bound context prevents exceptions thrown by the body from affecting nesting or sibling algorithms,
+ // and allows users to handle exceptions safely by wrapping parallel_reduce in a try-block.
+ task_group_context context(PARALLEL_REDUCE);
+ run(range, body, partitioner, context);
+ }
+ //! Run body for range, serves as callback for partitioner
+ void run_body( Range &r ) {
+ (*my_body)(r);
+ }
+
+ //! spawn right task, serves as callback for partitioner
+ void offer_work(typename Partitioner::split_type& split_obj, execution_data& ed) {
+ offer_work_impl(ed, *this, split_obj);
+ }
+ //! spawn right task, serves as callback for partitioner
+ void offer_work(const Range& r, depth_t d, execution_data& ed) {
+ offer_work_impl(ed, *this, r, d);
+ }
+
+private:
+ template <typename... Args>
+ void offer_work_impl(execution_data& ed, Args&&... args) {
+ small_object_allocator alloc{};
+ // New right child
+ auto right_child = alloc.new_object<start_reduce>(ed, std::forward<Args>(args)..., alloc);
+
+ // The new tree node serves as a continuation and holds the reference count; the left and right children attach to it as their parent.
+ right_child->my_parent = my_parent = alloc.new_object<tree_node_type>(ed, my_parent, 2, *my_body, alloc);
+
+ // Spawn the right sibling
+ right_child->spawn_self(ed);
+ }
+
+ void spawn_self(execution_data& ed) {
+ my_partition.spawn_task(*this, *context(ed));
+ }
+};
+
+//! Fold the tree and deallocate the task
+template<typename Range, typename Body, typename Partitioner>
+void start_reduce<Range, Body, Partitioner>::finalize(const execution_data& ed) {
+ // Get the current parent and wait object before an object destruction
+ node* parent = my_parent;
+ auto allocator = my_allocator;
+ // Task execution finished - destroy it
+ this->~start_reduce();
+ // Unwind the tree, decrementing the parent's reference count
+ fold_tree<tree_node_type>(parent, ed);
+ allocator.deallocate(this, ed);
+}
+
+//! Execute parallel_reduce task
+template<typename Range, typename Body, typename Partitioner>
+task* start_reduce<Range,Body,Partitioner>::execute(execution_data& ed) {
+ if (!is_same_affinity(ed)) {
+ my_partition.note_affinity(execution_slot(ed));
+ }
+ my_partition.check_being_stolen(*this, ed);
+
+ // The acquire barrier synchronizes with the data pointed to by my_body if the left
+ // task has already finished.
+ if( is_right_child && my_parent->m_ref_count.load(std::memory_order_acquire) == 2 ) {
+ tree_node_type* parent_ptr = static_cast<tree_node_type*>(my_parent);
+ my_body = (Body*) new( parent_ptr->zombie_space.begin() ) Body(*my_body, split());
+ parent_ptr->has_right_zombie = true;
+ }
+ __TBB_ASSERT(my_body != nullptr, "Incorrect body value");
+
+ my_partition.execute(*this, my_range, ed);
+
+ finalize(ed);
+ return nullptr;
+}
+
+//! Cancel parallel_reduce task
+template<typename Range, typename Body, typename Partitioner>
+task* start_reduce<Range, Body, Partitioner>::cancel(execution_data& ed) {
+ finalize(ed);
+ return nullptr;
+}
+
+//! Tree node type for parallel_deterministic_reduce.
+/** @ingroup algorithms */
+template<typename Body>
+struct deterministic_reduction_tree_node : public tree_node {
+ Body right_body;
+ Body& left_body;
+
+ deterministic_reduction_tree_node(node* parent, int ref_count, Body& input_left_body, small_object_allocator& alloc) :
+ tree_node{parent, ref_count, alloc},
+ right_body{input_left_body, detail::split()},
+ left_body(input_left_body)
+ {}
+
+ void join(task_group_context* context) {
+ if (!context->is_group_execution_cancelled())
+ left_body.join(right_body);
+ }
+};
+
+//! Task type used to split the work of parallel_deterministic_reduce.
+/** @ingroup algorithms */
+template<typename Range, typename Body, typename Partitioner>
+struct start_deterministic_reduce : public task {
+ Range my_range;
+ Body& my_body;
+ node* my_parent;
+
+ typename Partitioner::task_partition_type my_partition;
+ small_object_allocator my_allocator;
+
+ task* execute(execution_data&) override;
+ task* cancel(execution_data&) override;
+ void finalize(const execution_data&);
+
+ using tree_node_type = deterministic_reduction_tree_node<Body>;
+
+ //! Constructs the root task of the deterministic reduction.
+ start_deterministic_reduce( const Range& range, Partitioner& partitioner, Body& body, small_object_allocator& alloc ) :
+ my_range(range),
+ my_body(body),
+ my_partition(partitioner),
+ my_allocator(alloc) {}
+ //! Splitting constructor used to generate children.
+ /** parent_ becomes left child. Newly constructed object is right child. */
+ start_deterministic_reduce( start_deterministic_reduce& parent_, typename Partitioner::split_type& split_obj, Body& body,
+ small_object_allocator& alloc ) :
+ my_range(parent_.my_range, get_range_split_object<Range>(split_obj)),
+ my_body(body),
+ my_partition(parent_.my_partition, split_obj),
+ my_allocator(alloc) {}
+ static void run(const Range& range, Body& body, Partitioner& partitioner, task_group_context& context) {
+ if ( !range.empty() ) {
+ wait_node wn;
+ small_object_allocator alloc{};
+ auto deterministic_reduce_task =
+ alloc.new_object<start_deterministic_reduce>(range, partitioner, body, alloc);
+ deterministic_reduce_task->my_parent = &wn;
+ execute_and_wait(*deterministic_reduce_task, context, wn.m_wait, context);
+ }
+ }
+ static void run(const Range& range, Body& body, Partitioner& partitioner) {
+ // A bound context prevents exceptions thrown by the body from affecting nesting or sibling algorithms,
+ // and allows users to handle exceptions safely by wrapping parallel_deterministic_reduce
+ // in a try-block.
+ task_group_context context(PARALLEL_REDUCE);
+ run(range, body, partitioner, context);
+ }
+ //! Run body for range, serves as callback for partitioner
+ void run_body( Range &r ) {
+ my_body( r );
+ }
+ //! Spawn right task, serves as callback for partitioner
+ void offer_work(typename Partitioner::split_type& split_obj, execution_data& ed) {
+ offer_work_impl(ed, *this, split_obj);
+ }
+private:
+ template <typename... Args>
+ void offer_work_impl(execution_data& ed, Args&&... args) {
+ small_object_allocator alloc{};
+ // The new tree node serves as a continuation and holds the reference count; the left and right children attach to it as their parent. The body is split here.
+ auto new_tree_node = alloc.new_object<tree_node_type>(ed, my_parent, 2, my_body, alloc);
+
+ // New right child
+ auto right_child = alloc.new_object<start_deterministic_reduce>(ed, std::forward<Args>(args)..., new_tree_node->right_body, alloc);
+
+ right_child->my_parent = my_parent = new_tree_node;
+
+ // Spawn the right sibling
+ right_child->spawn_self(ed);
+ }
+
+ void spawn_self(execution_data& ed) {
+ my_partition.spawn_task(*this, *context(ed));
+ }
+};
+
+//! Fold the tree and deallocate the task
+template<typename Range, typename Body, typename Partitioner>
+void start_deterministic_reduce<Range, Body, Partitioner>::finalize(const execution_data& ed) {
+ // Get the current parent and wait object before an object destruction
+ node* parent = my_parent;
+
+ auto allocator = my_allocator;
+ // Task execution finished - destroy it
+ this->~start_deterministic_reduce();
+ // Unwind the tree, decrementing the parent's reference count
+ fold_tree<tree_node_type>(parent, ed);
+ allocator.deallocate(this, ed);
+}
+
+//! Execute parallel_deterministic_reduce task
+template<typename Range, typename Body, typename Partitioner>
+task* start_deterministic_reduce<Range,Body,Partitioner>::execute(execution_data& ed) {
+ if (!is_same_affinity(ed)) {
+ my_partition.note_affinity(execution_slot(ed));
+ }
+ my_partition.check_being_stolen(*this, ed);
+
+ my_partition.execute(*this, my_range, ed);
+
+ finalize(ed);
+ return nullptr;
+}
+
+//! Cancel parallel_deterministic_reduce task
+template<typename Range, typename Body, typename Partitioner>
+task* start_deterministic_reduce<Range, Body, Partitioner>::cancel(execution_data& ed) {
+ finalize(ed);
+ return nullptr;
+}
+
+
+//! Auxiliary class for parallel_reduce; for internal use only.
+/** The adaptor class that implements \ref parallel_reduce_body_req "parallel_reduce Body"
+ using given \ref parallel_reduce_lambda_req "anonymous function objects".
+ **/
+/** @ingroup algorithms */
+template<typename Range, typename Value, typename RealBody, typename Reduction>
+class lambda_reduce_body {
+//TODO: decide if my_real_body, my_reduction, and my_identity_element should be copied or referenced
+// (might require some performance measurements)
+
+ const Value& my_identity_element;
+ const RealBody& my_real_body;
+ const Reduction& my_reduction;
+ Value my_value;
+ lambda_reduce_body& operator= ( const lambda_reduce_body& other );
+public:
+ lambda_reduce_body( const Value& identity, const RealBody& body, const Reduction& reduction )
+ : my_identity_element(identity)
+ , my_real_body(body)
+ , my_reduction(reduction)
+ , my_value(identity)
+ { }
+ lambda_reduce_body( const lambda_reduce_body& other ) = default;
+ lambda_reduce_body( lambda_reduce_body& other, tbb::split )
+ : my_identity_element(other.my_identity_element)
+ , my_real_body(other.my_real_body)
+ , my_reduction(other.my_reduction)
+ , my_value(other.my_identity_element)
+ { }
+ void operator()(Range& range) {
+ my_value = my_real_body(range, const_cast<const Value&>(my_value));
+ }
+ void join( lambda_reduce_body& rhs ) {
+ my_value = my_reduction(const_cast<const Value&>(my_value), const_cast<const Value&>(rhs.my_value));
+ }
+ Value result() const {
+ return my_value;
+ }
+};
+
+
+// Requirements on Range concept are documented in blocked_range.h
+
+/** \page parallel_reduce_body_req Requirements on parallel_reduce body
+ Class \c Body implementing the concept of parallel_reduce body must define:
+ - \code Body::Body( Body&, split ); \endcode Splitting constructor.
+ Must be able to run concurrently with operator() and method \c join
+ - \code Body::~Body(); \endcode Destructor
+ - \code void Body::operator()( Range& r ); \endcode Function call operator applying body to range \c r
+ and accumulating the result
+ - \code void Body::join( Body& b ); \endcode Join results.
+ The result in \c b should be merged into the result of \c this
+**/
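+
+//! Illustrative Body sketch that satisfies the requirements above: a summation body
+//! for use with parallel_reduce over a blocked_range<const int*>. The names SumBody,
+//! array, and n are placeholders.
+/** \code
+    struct SumBody {
+        long my_sum = 0;
+        SumBody() = default;
+        SumBody( SumBody&, tbb::split ) {}                   // splitting constructor
+        void operator()( const tbb::blocked_range<const int*>& r ) {
+            for( const int* p = r.begin(); p != r.end(); ++p )
+                my_sum += *p;                                // accumulate over the subrange
+        }
+        void join( SumBody& rhs ) { my_sum += rhs.my_sum; }  // merge the right partial sum
+    };
+
+    SumBody body;
+    tbb::parallel_reduce( tbb::blocked_range<const int*>(array, array + n), body ); // array, n are placeholders
+    long total = body.my_sum;
+    \endcode
+**/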
+
+/** \page parallel_reduce_lambda_req Requirements on parallel_reduce anonymous function objects (lambda functions)
+ TO BE DOCUMENTED
+**/
+
+/** \name parallel_reduce
+ See also requirements on \ref range_req "Range" and \ref parallel_reduce_body_req "parallel_reduce Body". **/
+//@{
+
+//! Parallel iteration with reduction and default partitioner.
+/** @ingroup algorithms **/
+template<typename Range, typename Body>
+void parallel_reduce( const Range& range, Body& body ) {
+ start_reduce<Range,Body, const __TBB_DEFAULT_PARTITIONER>::run( range, body, __TBB_DEFAULT_PARTITIONER() );
+}
+
+//! Parallel iteration with reduction and simple_partitioner
+/** @ingroup algorithms **/
+template<typename Range, typename Body>
+void parallel_reduce( const Range& range, Body& body, const simple_partitioner& partitioner ) {
+ start_reduce<Range,Body,const simple_partitioner>::run( range, body, partitioner );
+}
+
+//! Parallel iteration with reduction and auto_partitioner
+/** @ingroup algorithms **/
+template<typename Range, typename Body>
+void parallel_reduce( const Range& range, Body& body, const auto_partitioner& partitioner ) {
+ start_reduce<Range,Body,const auto_partitioner>::run( range, body, partitioner );
+}
+
+//! Parallel iteration with reduction and static_partitioner
+/** @ingroup algorithms **/
+template<typename Range, typename Body>
+void parallel_reduce( const Range& range, Body& body, const static_partitioner& partitioner ) {
+ start_reduce<Range,Body,const static_partitioner>::run( range, body, partitioner );
+}
+
+//! Parallel iteration with reduction and affinity_partitioner
+/** @ingroup algorithms **/
+template<typename Range, typename Body>
+void parallel_reduce( const Range& range, Body& body, affinity_partitioner& partitioner ) {
+ start_reduce<Range,Body,affinity_partitioner>::run( range, body, partitioner );
+}
+
+//! Parallel iteration with reduction, default partitioner and user-supplied context.
+/** @ingroup algorithms **/
+template<typename Range, typename Body>
+void parallel_reduce( const Range& range, Body& body, task_group_context& context ) {
+ start_reduce<Range,Body,const __TBB_DEFAULT_PARTITIONER>::run( range, body, __TBB_DEFAULT_PARTITIONER(), context );
+}
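+
+//! Illustrative sketch of the user-supplied context overloads: wrapping the call in a
+//! try-block observes an exception thrown by the body, and the context can also be used
+//! to cancel the reduction from outside. SumBody stands for any body satisfying the
+//! parallel_reduce Body requirements; array and n are placeholders.
+/** \code
+    tbb::task_group_context ctx;
+    SumBody body;
+    try {
+        tbb::parallel_reduce( tbb::blocked_range<const int*>(array, array + n), body, ctx );
+    } catch( ... ) {
+        // An exception thrown by the body cancels the work bound to ctx and is
+        // rethrown here once the remaining tasks have drained.
+    }
+    \endcode
+**/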
+
+//! Parallel iteration with reduction, simple partitioner and user-supplied context.
+/** @ingroup algorithms **/
+template<typename Range, typename Body>
+void parallel_reduce( const Range& range, Body& body, const simple_partitioner& partitioner, task_group_context& context ) {
+ start_reduce<Range,Body,const simple_partitioner>::run( range, body, partitioner, context );
+}
+
+//! Parallel iteration with reduction, auto_partitioner and user-supplied context
+/** @ingroup algorithms **/
+template<typename Range, typename Body>
+void parallel_reduce( const Range& range, Body& body, const auto_partitioner& partitioner, task_group_context& context ) {
+ start_reduce<Range,Body,const auto_partitioner>::run( range, body, partitioner, context );
+}
+
+//! Parallel iteration with reduction, static_partitioner and user-supplied context
+/** @ingroup algorithms **/
+template<typename Range, typename Body>
+void parallel_reduce( const Range& range, Body& body, const static_partitioner& partitioner, task_group_context& context ) {
+ start_reduce<Range,Body,const static_partitioner>::run( range, body, partitioner, context );
+}
+
+//! Parallel iteration with reduction, affinity_partitioner and user-supplied context
+/** @ingroup algorithms **/
+template<typename Range, typename Body>
+void parallel_reduce( const Range& range, Body& body, affinity_partitioner& partitioner, task_group_context& context ) {
+ start_reduce<Range,Body,affinity_partitioner>::run( range, body, partitioner, context );
+}
+/** parallel_reduce overloads that work with anonymous function objects
+ (see also \ref parallel_reduce_lambda_req "requirements on parallel_reduce anonymous function objects"). **/
+
+//! Parallel iteration with reduction and default partitioner.
+/** @ingroup algorithms **/
+template<typename Range, typename Value, typename RealBody, typename Reduction>
+Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction ) {
+ lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
+ start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const __TBB_DEFAULT_PARTITIONER>
+ ::run(range, body, __TBB_DEFAULT_PARTITIONER() );
+ return body.result();
+}
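+
+//! Illustrative usage sketch of the functional form declared above: reduction with an
+//! explicit identity value, a range body, and a join functor.
+/** \code
+    // v is a placeholder std::vector<float>
+    float total = tbb::parallel_reduce(
+        tbb::blocked_range<std::size_t>(0, v.size()),
+        0.0f,                                              // identity
+        [&](const tbb::blocked_range<std::size_t>& r, float running) {
+            for( std::size_t i = r.begin(); i != r.end(); ++i )
+                running += v[i];
+            return running;                                // RealBody: accumulate over r
+        },
+        [](float a, float b) { return a + b; }             // Reduction: join partial sums
+    );
+    \endcode
+**/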
+
+//! Parallel iteration with reduction and simple_partitioner.
+/** @ingroup algorithms **/
+template<typename Range, typename Value, typename RealBody, typename Reduction>
+Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
+ const simple_partitioner& partitioner ) {
+ lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
+ start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const simple_partitioner>
+ ::run(range, body, partitioner );
+ return body.result();
+}
+
+//! Parallel iteration with reduction and auto_partitioner
+/** @ingroup algorithms **/
+template<typename Range, typename Value, typename RealBody, typename Reduction>
+Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
+ const auto_partitioner& partitioner ) {
+ lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
+ start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const auto_partitioner>
+ ::run( range, body, partitioner );
+ return body.result();
+}
+
+//! Parallel iteration with reduction and static_partitioner
+/** @ingroup algorithms **/
+template<typename Range, typename Value, typename RealBody, typename Reduction>
+Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
+ const static_partitioner& partitioner ) {
+ lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
+ start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const static_partitioner>
+ ::run( range, body, partitioner );
+ return body.result();
+}
+
+//! Parallel iteration with reduction and affinity_partitioner
+/** @ingroup algorithms **/
+template<typename Range, typename Value, typename RealBody, typename Reduction>
+Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
+ affinity_partitioner& partitioner ) {
+ lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
+ start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,affinity_partitioner>
+ ::run( range, body, partitioner );
+ return body.result();
+}
+
+//! Parallel iteration with reduction, default partitioner and user-supplied context.
+/** @ingroup algorithms **/
+template<typename Range, typename Value, typename RealBody, typename Reduction>
+Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
+ task_group_context& context ) {
+ lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
+ start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const __TBB_DEFAULT_PARTITIONER>
+ ::run( range, body, __TBB_DEFAULT_PARTITIONER(), context );
+ return body.result();
+}
+
+//! Parallel iteration with reduction, simple partitioner and user-supplied context.
+/** @ingroup algorithms **/
+template<typename Range, typename Value, typename RealBody, typename Reduction>
+Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
+ const simple_partitioner& partitioner, task_group_context& context ) {
+ lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
+ start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const simple_partitioner>
+ ::run( range, body, partitioner, context );
+ return body.result();
+}
+
+//! Parallel iteration with reduction, auto_partitioner and user-supplied context
+/** @ingroup algorithms **/
+template<typename Range, typename Value, typename RealBody, typename Reduction>
+Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
+ const auto_partitioner& partitioner, task_group_context& context ) {
+ lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
+ start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const auto_partitioner>
+ ::run( range, body, partitioner, context );
+ return body.result();
+}
+
+//! Parallel iteration with reduction, static_partitioner and user-supplied context
+/** @ingroup algorithms **/
+template<typename Range, typename Value, typename RealBody, typename Reduction>
+Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
+ const static_partitioner& partitioner, task_group_context& context ) {
+ lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
+ start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const static_partitioner>
+ ::run( range, body, partitioner, context );
+ return body.result();
+}
+
+//! Parallel iteration with reduction, affinity_partitioner and user-supplied context
+/** @ingroup algorithms **/
+template<typename Range, typename Value, typename RealBody, typename Reduction>
+Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
+ affinity_partitioner& partitioner, task_group_context& context ) {
+ lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
+ start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,affinity_partitioner>
+ ::run( range, body, partitioner, context );
+ return body.result();
+}
+
+//! Parallel iteration with deterministic reduction and default simple partitioner.
+/** @ingroup algorithms **/
+template<typename Range, typename Body>
+void parallel_deterministic_reduce( const Range& range, Body& body ) {
+ start_deterministic_reduce<Range, Body, const simple_partitioner>::run(range, body, simple_partitioner());
+}
+
+//! Parallel iteration with deterministic reduction and simple partitioner.
+/** @ingroup algorithms **/
+template<typename Range, typename Body>
+void parallel_deterministic_reduce( const Range& range, Body& body, const simple_partitioner& partitioner ) {
+ start_deterministic_reduce<Range, Body, const simple_partitioner>::run(range, body, partitioner);
+}
+
+//! Parallel iteration with deterministic reduction and static partitioner.
+/** @ingroup algorithms **/
+template<typename Range, typename Body>
+void parallel_deterministic_reduce( const Range& range, Body& body, const static_partitioner& partitioner ) {
+ start_deterministic_reduce<Range, Body, const static_partitioner>::run(range, body, partitioner);
+}
+
+//! Parallel iteration with deterministic reduction, default simple partitioner and user-supplied context.
+/** @ingroup algorithms **/
+template<typename Range, typename Body>
+void parallel_deterministic_reduce( const Range& range, Body& body, task_group_context& context ) {
+ start_deterministic_reduce<Range,Body, const simple_partitioner>::run( range, body, simple_partitioner(), context );
+}
+
+//! Parallel iteration with deterministic reduction, simple partitioner and user-supplied context.
+/** @ingroup algorithms **/
+template<typename Range, typename Body>
+void parallel_deterministic_reduce( const Range& range, Body& body, const simple_partitioner& partitioner, task_group_context& context ) {
+ start_deterministic_reduce<Range, Body, const simple_partitioner>::run(range, body, partitioner, context);
+}
+
+//! Parallel iteration with deterministic reduction, static partitioner and user-supplied context.
+/** @ingroup algorithms **/
+template<typename Range, typename Body>
+void parallel_deterministic_reduce( const Range& range, Body& body, const static_partitioner& partitioner, task_group_context& context ) {
+ start_deterministic_reduce<Range, Body, const static_partitioner>::run(range, body, partitioner, context);
+}
+
+/** parallel_reduce overloads that work with anonymous function objects
+ (see also \ref parallel_reduce_lambda_req "requirements on parallel_reduce anonymous function objects"). **/
+
+//! Parallel iteration with deterministic reduction and default simple partitioner.
+// TODO: consider making static_partitioner the default
+/** @ingroup algorithms **/
+template<typename Range, typename Value, typename RealBody, typename Reduction>
+Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction ) {
+ return parallel_deterministic_reduce(range, identity, real_body, reduction, simple_partitioner());
+}
+
+//! Parallel iteration with deterministic reduction and simple partitioner.
+/** @ingroup algorithms **/
+template<typename Range, typename Value, typename RealBody, typename Reduction>
+Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, const simple_partitioner& partitioner ) {
+ lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
+ start_deterministic_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>, const simple_partitioner>
+ ::run(range, body, partitioner);
+ return body.result();
+}
+
+//! Parallel iteration with deterministic reduction and static partitioner.
+/** @ingroup algorithms **/
+template<typename Range, typename Value, typename RealBody, typename Reduction>
+Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, const static_partitioner& partitioner ) {
+ lambda_reduce_body<Range, Value, RealBody, Reduction> body(identity, real_body, reduction);
+ start_deterministic_reduce<Range, lambda_reduce_body<Range, Value, RealBody, Reduction>, const static_partitioner>
+ ::run(range, body, partitioner);
+ return body.result();
+}
+
+//! Parallel iteration with deterministic reduction, default simple partitioner and user-supplied context.
+/** @ingroup algorithms **/
+template<typename Range, typename Value, typename RealBody, typename Reduction>
+Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
+ task_group_context& context ) {
+ return parallel_deterministic_reduce(range, identity, real_body, reduction, simple_partitioner(), context);
+}
+
+//! Parallel iteration with deterministic reduction, simple partitioner and user-supplied context.
+/** @ingroup algorithms **/
+template<typename Range, typename Value, typename RealBody, typename Reduction>
+Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
+ const simple_partitioner& partitioner, task_group_context& context ) {
+ lambda_reduce_body<Range, Value, RealBody, Reduction> body(identity, real_body, reduction);
+ start_deterministic_reduce<Range, lambda_reduce_body<Range, Value, RealBody, Reduction>, const simple_partitioner>
+ ::run(range, body, partitioner, context);
+ return body.result();
+}
+
+//! Parallel iteration with deterministic reduction, static partitioner and user-supplied context.
+/** @ingroup algorithms **/
+template<typename Range, typename Value, typename RealBody, typename Reduction>
+Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
+ const static_partitioner& partitioner, task_group_context& context ) {
+ lambda_reduce_body<Range, Value, RealBody, Reduction> body(identity, real_body, reduction);
+ start_deterministic_reduce<Range, lambda_reduce_body<Range, Value, RealBody, Reduction>, const static_partitioner>
+ ::run(range, body, partitioner, context);
+ return body.result();
+}
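+
+//! Illustrative usage sketch of the deterministic functional form: it takes the same
+//! arguments as parallel_reduce, but splitting and joining follow a fixed pattern
+//! determined by the range and partitioner, so floating-point results do not depend
+//! on the number of worker threads. The vector v and the grainsize are placeholders.
+/** \code
+    float total = tbb::parallel_deterministic_reduce(
+        tbb::blocked_range<std::size_t>(0, v.size(), 1024), // explicit grainsize (placeholder)
+        0.0f,
+        [&](const tbb::blocked_range<std::size_t>& r, float running) {
+            for( std::size_t i = r.begin(); i != r.end(); ++i )
+                running += v[i];
+            return running;
+        },
+        [](float a, float b) { return a + b; }
+    );
+    \endcode
+**/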
+//@}
+
+} // namespace d1
+} // namespace detail
+
+inline namespace v1 {
+using detail::d1::parallel_reduce;
+using detail::d1::parallel_deterministic_reduce;
+// Split types
+using detail::split;
+using detail::proportional_split;
+} // namespace v1
+
+} // namespace tbb
+#endif /* __TBB_parallel_reduce_H */
diff --git a/contrib/libs/tbb/include/oneapi/tbb/parallel_scan.h b/contrib/libs/tbb/include/oneapi/tbb/parallel_scan.h
new file mode 100644
index 0000000000..d5d69ca0b2
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/parallel_scan.h
@@ -0,0 +1,590 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_parallel_scan_H
+#define __TBB_parallel_scan_H
+
+#include <functional>
+
+#include "detail/_config.h"
+#include "detail/_namespace_injection.h"
+#include "detail/_exception.h"
+#include "detail/_task.h"
+
+#include "profiling.h"
+#include "partitioner.h"
+#include "blocked_range.h"
+#include "task_group.h"
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+//! Used to indicate that the initial scan is being performed.
+/** @ingroup algorithms */
+struct pre_scan_tag {
+ static bool is_final_scan() {return false;}
+ operator bool() {return is_final_scan();}
+};
+
+//! Used to indicate that the final scan is being performed.
+/** @ingroup algorithms */
+struct final_scan_tag {
+ static bool is_final_scan() {return true;}
+ operator bool() {return is_final_scan();}
+};
+
+template<typename Range, typename Body>
+struct sum_node;
+
+//! Performs final scan for a leaf
+/** @ingroup algorithms */
+template<typename Range, typename Body>
+struct final_sum : public task {
+private:
+ using sum_node_type = sum_node<Range, Body>;
+ Body m_body;
+ aligned_space<Range> m_range;
+ //! Where to put result of last subrange, or nullptr if not last subrange.
+ Body* m_stuff_last;
+
+ wait_context& m_wait_context;
+ sum_node_type* m_parent = nullptr;
+public:
+ small_object_allocator m_allocator;
+ final_sum( Body& body, wait_context& w_o, small_object_allocator& alloc ) :
+ m_body(body, split()), m_wait_context(w_o), m_allocator(alloc) {
+ poison_pointer(m_stuff_last);
+ }
+
+ final_sum( final_sum& sum, small_object_allocator& alloc ) :
+ m_body(sum.m_body, split()), m_wait_context(sum.m_wait_context), m_allocator(alloc) {
+ poison_pointer(m_stuff_last);
+ }
+
+ ~final_sum() {
+ m_range.begin()->~Range();
+ }
+ void finish_construction( sum_node_type* parent, const Range& range, Body* stuff_last ) {
+ __TBB_ASSERT( m_parent == nullptr, nullptr );
+ m_parent = parent;
+ new( m_range.begin() ) Range(range);
+ m_stuff_last = stuff_last;
+ }
+private:
+ sum_node_type* release_parent() {
+ call_itt_task_notify(releasing, m_parent);
+ if (m_parent) {
+ auto parent = m_parent;
+ m_parent = nullptr;
+ if (parent->ref_count.fetch_sub(1, std::memory_order_relaxed) == 1) {
+ return parent;
+ }
+ }
+ else
+ m_wait_context.release();
+ return nullptr;
+ }
+ sum_node_type* finalize(const execution_data& ed){
+ sum_node_type* next_task = release_parent();
+ m_allocator.delete_object<final_sum>(this, ed);
+ return next_task;
+ }
+
+public:
+ task* execute(execution_data& ed) override {
+ m_body( *m_range.begin(), final_scan_tag() );
+ if( m_stuff_last )
+ m_stuff_last->assign(m_body);
+
+ return finalize(ed);
+ }
+ task* cancel(execution_data& ed) override {
+ return finalize(ed);
+ }
+ template<typename Tag>
+ void operator()( const Range& r, Tag tag ) {
+ m_body( r, tag );
+ }
+ void reverse_join( final_sum& a ) {
+ m_body.reverse_join(a.m_body);
+ }
+ void reverse_join( Body& body ) {
+ m_body.reverse_join(body);
+ }
+ void assign_to( Body& body ) {
+ body.assign(m_body);
+ }
+ void self_destroy(const execution_data& ed) {
+ m_allocator.delete_object<final_sum>(this, ed);
+ }
+};
+
+//! Split work to be done in the scan.
+/** @ingroup algorithms */
+template<typename Range, typename Body>
+struct sum_node : public task {
+private:
+ using final_sum_type = final_sum<Range,Body>;
+public:
+ final_sum_type *m_incoming;
+ final_sum_type *m_body;
+ Body *m_stuff_last;
+private:
+ final_sum_type *m_left_sum;
+ sum_node *m_left;
+ sum_node *m_right;
+ bool m_left_is_final;
+ Range m_range;
+ wait_context& m_wait_context;
+ sum_node* m_parent;
+ small_object_allocator m_allocator;
+public:
+ std::atomic<unsigned int> ref_count{0};
+ sum_node( const Range range, bool left_is_final_, sum_node* parent, wait_context& w_o, small_object_allocator& alloc ) :
+ m_stuff_last(nullptr),
+ m_left_sum(nullptr),
+ m_left(nullptr),
+ m_right(nullptr),
+ m_left_is_final(left_is_final_),
+ m_range(range),
+ m_wait_context(w_o),
+ m_parent(parent),
+ m_allocator(alloc)
+ {
+ if( m_parent )
+ m_parent->ref_count.fetch_add(1, std::memory_order_relaxed);
+ // Poison fields that will be set by second pass.
+ poison_pointer(m_body);
+ poison_pointer(m_incoming);
+ }
+
+ ~sum_node() {
+ if (m_parent)
+ m_parent->ref_count.fetch_sub(1, std::memory_order_relaxed);
+ }
+private:
+ sum_node* release_parent() {
+ call_itt_task_notify(releasing, m_parent);
+ if (m_parent) {
+ auto parent = m_parent;
+ m_parent = nullptr;
+ if (parent->ref_count.fetch_sub(1, std::memory_order_relaxed) == 1) {
+ return parent;
+ }
+ }
+ else
+ m_wait_context.release();
+ return nullptr;
+ }
+ task* create_child( const Range& range, final_sum_type& body, sum_node* child, final_sum_type* incoming, Body* stuff_last ) {
+ if( child ) {
+ __TBB_ASSERT( is_poisoned(child->m_body) && is_poisoned(child->m_incoming), nullptr );
+ child->prepare_for_execution(body, incoming, stuff_last);
+ return child;
+ } else {
+ body.finish_construction(this, range, stuff_last);
+ return &body;
+ }
+ }
+
+ sum_node* finalize(const execution_data& ed) {
+ sum_node* next_task = release_parent();
+ m_allocator.delete_object<sum_node>(this, ed);
+ return next_task;
+ }
+
+public:
+ void prepare_for_execution(final_sum_type& body, final_sum_type* incoming, Body *stuff_last) {
+ this->m_body = &body;
+ this->m_incoming = incoming;
+ this->m_stuff_last = stuff_last;
+ }
+ task* execute(execution_data& ed) override {
+ if( m_body ) {
+ if( m_incoming )
+ m_left_sum->reverse_join( *m_incoming );
+ task* right_child = this->create_child(Range(m_range,split()), *m_left_sum, m_right, m_left_sum, m_stuff_last);
+ task* left_child = m_left_is_final ? nullptr : this->create_child(m_range, *m_body, m_left, m_incoming, nullptr);
+ ref_count = (left_child != nullptr) + (right_child != nullptr);
+ m_body = nullptr;
+ if( left_child ) {
+ spawn(*right_child, *ed.context);
+ return left_child;
+ } else {
+ return right_child;
+ }
+ } else {
+ return finalize(ed);
+ }
+ }
+ task* cancel(execution_data& ed) override {
+ return finalize(ed);
+ }
+ void self_destroy(const execution_data& ed) {
+ m_allocator.delete_object<sum_node>(this, ed);
+ }
+ template<typename range,typename body,typename partitioner>
+ friend struct start_scan;
+
+ template<typename range,typename body>
+ friend struct finish_scan;
+};
+
+//! Combine partial results
+/** @ingroup algorithms */
+template<typename Range, typename Body>
+struct finish_scan : public task {
+private:
+ using sum_node_type = sum_node<Range,Body>;
+ using final_sum_type = final_sum<Range,Body>;
+ final_sum_type** const m_sum_slot;
+ sum_node_type*& m_return_slot;
+ small_object_allocator m_allocator;
+public:
+ final_sum_type* m_right_zombie;
+ sum_node_type& m_result;
+ std::atomic<unsigned int> ref_count{2};
+ finish_scan* m_parent;
+ wait_context& m_wait_context;
+ task* execute(execution_data& ed) override {
+ __TBB_ASSERT( m_result.ref_count.load() == static_cast<unsigned int>((m_result.m_left!=nullptr)+(m_result.m_right!=nullptr)), nullptr );
+ if( m_result.m_left )
+ m_result.m_left_is_final = false;
+ if( m_right_zombie && m_sum_slot )
+ (*m_sum_slot)->reverse_join(*m_result.m_left_sum);
+ __TBB_ASSERT( !m_return_slot, nullptr );
+ if( m_right_zombie || m_result.m_right ) {
+ m_return_slot = &m_result;
+ } else {
+ m_result.self_destroy(ed);
+ }
+ if( m_right_zombie && !m_sum_slot && !m_result.m_right ) {
+ m_right_zombie->self_destroy(ed);
+ m_right_zombie = nullptr;
+ }
+ return finalize(ed);
+ }
+ task* cancel(execution_data& ed) override {
+ return finalize(ed);
+ }
+ finish_scan(sum_node_type*& return_slot, final_sum_type** sum, sum_node_type& result_, finish_scan* parent, wait_context& w_o, small_object_allocator& alloc) :
+ m_sum_slot(sum),
+ m_return_slot(return_slot),
+ m_allocator(alloc),
+ m_right_zombie(nullptr),
+ m_result(result_),
+ m_parent(parent),
+ m_wait_context(w_o)
+ {
+ __TBB_ASSERT( !m_return_slot, nullptr );
+ }
+private:
+ finish_scan* release_parent() {
+ call_itt_task_notify(releasing, m_parent);
+ if (m_parent) {
+ auto parent = m_parent;
+ m_parent = nullptr;
+ if (parent->ref_count.fetch_sub(1, std::memory_order_relaxed) == 1) {
+ return parent;
+ }
+ }
+ else
+ m_wait_context.release();
+ return nullptr;
+ }
+ finish_scan* finalize(const execution_data& ed) {
+ finish_scan* next_task = release_parent();
+ m_allocator.delete_object<finish_scan>(this, ed);
+ return next_task;
+ }
+};
+
+//! Initial task to split the work
+/** @ingroup algorithms */
+template<typename Range, typename Body, typename Partitioner>
+struct start_scan : public task {
+private:
+ using sum_node_type = sum_node<Range,Body>;
+ using final_sum_type = final_sum<Range,Body>;
+ using finish_pass1_type = finish_scan<Range,Body>;
+ std::reference_wrapper<sum_node_type*> m_return_slot;
+ Range m_range;
+ std::reference_wrapper<final_sum_type> m_body;
+ typename Partitioner::partition_type m_partition;
+ /** Non-null if caller is requesting total. */
+ final_sum_type** m_sum_slot;
+ bool m_is_final;
+ bool m_is_right_child;
+
+ finish_pass1_type* m_parent;
+ small_object_allocator m_allocator;
+ wait_context& m_wait_context;
+
+ finish_pass1_type* release_parent() {
+ call_itt_task_notify(releasing, m_parent);
+ if (m_parent) {
+ auto parent = m_parent;
+ m_parent = nullptr;
+ if (parent->ref_count.fetch_sub(1, std::memory_order_relaxed) == 1) {
+ return parent;
+ }
+ }
+ else
+ m_wait_context.release();
+ return nullptr;
+ }
+
+ finish_pass1_type* finalize( const execution_data& ed ) {
+ finish_pass1_type* next_task = release_parent();
+ m_allocator.delete_object<start_scan>(this, ed);
+ return next_task;
+ }
+
+public:
+ task* execute( execution_data& ) override;
+ task* cancel( execution_data& ed ) override {
+ return finalize(ed);
+ }
+ start_scan( sum_node_type*& return_slot, start_scan& parent, small_object_allocator& alloc ) :
+ m_return_slot(return_slot),
+ m_range(parent.m_range,split()),
+ m_body(parent.m_body),
+ m_partition(parent.m_partition,split()),
+ m_sum_slot(parent.m_sum_slot),
+ m_is_final(parent.m_is_final),
+ m_is_right_child(true),
+ m_parent(parent.m_parent),
+ m_allocator(alloc),
+ m_wait_context(parent.m_wait_context)
+ {
+ __TBB_ASSERT( !m_return_slot, nullptr );
+ parent.m_is_right_child = false;
+ }
+
+ start_scan( sum_node_type*& return_slot, const Range& range, final_sum_type& body, const Partitioner& partitioner, wait_context& w_o, small_object_allocator& alloc ) :
+ m_return_slot(return_slot),
+ m_range(range),
+ m_body(body),
+ m_partition(partitioner),
+ m_sum_slot(nullptr),
+ m_is_final(true),
+ m_is_right_child(false),
+ m_parent(nullptr),
+ m_allocator(alloc),
+ m_wait_context(w_o)
+ {
+ __TBB_ASSERT( !m_return_slot, nullptr );
+ }
+
+ static void run( const Range& range, Body& body, const Partitioner& partitioner ) {
+ if( !range.empty() ) {
+ task_group_context context(PARALLEL_SCAN);
+
+ using start_pass1_type = start_scan<Range,Body,Partitioner>;
+ sum_node_type* root = nullptr;
+ wait_context w_ctx{1};
+ small_object_allocator alloc{};
+
+ auto& temp_body = *alloc.new_object<final_sum_type>(body, w_ctx, alloc);
+ temp_body.reverse_join(body);
+
+ auto& pass1 = *alloc.new_object<start_pass1_type>(/*m_return_slot=*/root, range, temp_body, partitioner, w_ctx, alloc);
+
+ execute_and_wait(pass1, context, w_ctx, context);
+ if( root ) {
+ root->prepare_for_execution(temp_body, nullptr, &body);
+ w_ctx.reserve();
+ execute_and_wait(*root, context, w_ctx, context);
+ } else {
+ temp_body.assign_to(body);
+ temp_body.finish_construction(nullptr, range, nullptr);
+ alloc.delete_object<final_sum_type>(&temp_body);
+ }
+ }
+ }
+};
+
+template<typename Range, typename Body, typename Partitioner>
+task* start_scan<Range,Body,Partitioner>::execute( execution_data& ed ) {
+ // Inspecting m_parent->m_result.m_left_sum would ordinarily be a race condition.
+ // But we inspect it only if we are not a stolen task, in which case we
+ // know that the task assigning to m_parent->m_result.m_left_sum has completed.
+ __TBB_ASSERT(!m_is_right_child || m_parent, "right child is never an orphan");
+ bool treat_as_stolen = m_is_right_child && (is_stolen(ed) || &m_body.get()!=m_parent->m_result.m_left_sum);
+ if( treat_as_stolen ) {
+ // Invocation is for right child that has been really stolen or needs to be virtually stolen
+ small_object_allocator alloc{};
+ m_parent->m_right_zombie = alloc.new_object<final_sum_type>(m_body, alloc);
+ m_body = *m_parent->m_right_zombie;
+ m_is_final = false;
+ }
+ task* next_task = nullptr;
+ if( (m_is_right_child && !treat_as_stolen) || !m_range.is_divisible() || m_partition.should_execute_range(ed) ) {
+ if( m_is_final )
+ m_body(m_range, final_scan_tag());
+ else if( m_sum_slot )
+ m_body(m_range, pre_scan_tag());
+ if( m_sum_slot )
+ *m_sum_slot = &m_body.get();
+ __TBB_ASSERT( !m_return_slot, nullptr );
+
+ next_task = finalize(ed);
+ } else {
+ small_object_allocator alloc{};
+ auto result = alloc.new_object<sum_node_type>(m_range,/*m_left_is_final=*/m_is_final, m_parent? &m_parent->m_result: nullptr, m_wait_context, alloc);
+
+ auto new_parent = alloc.new_object<finish_pass1_type>(m_return_slot, m_sum_slot, *result, m_parent, m_wait_context, alloc);
+ m_parent = new_parent;
+
+ // Split off right child
+ auto& right_child = *alloc.new_object<start_scan>(/*m_return_slot=*/result->m_right, *this, alloc);
+
+ spawn(right_child, *ed.context);
+
+ m_sum_slot = &result->m_left_sum;
+ m_return_slot = result->m_left;
+
+ __TBB_ASSERT( !m_return_slot, nullptr );
+ next_task = this;
+ }
+ return next_task;
+}
+
+template<typename Range, typename Value, typename Scan, typename ReverseJoin>
+class lambda_scan_body {
+ Value m_sum_slot;
+ const Value& identity_element;
+ const Scan& m_scan;
+ const ReverseJoin& m_reverse_join;
+public:
+ void operator=(const lambda_scan_body&) = delete;
+ lambda_scan_body(const lambda_scan_body&) = default;
+
+ lambda_scan_body( const Value& identity, const Scan& scan, const ReverseJoin& rev_join )
+ : m_sum_slot(identity)
+ , identity_element(identity)
+ , m_scan(scan)
+ , m_reverse_join(rev_join) {}
+
+ lambda_scan_body( lambda_scan_body& b, split )
+ : m_sum_slot(b.identity_element)
+ , identity_element(b.identity_element)
+ , m_scan(b.m_scan)
+ , m_reverse_join(b.m_reverse_join) {}
+
+ template<typename Tag>
+ void operator()( const Range& r, Tag tag ) {
+ m_sum_slot = m_scan(r, m_sum_slot, tag);
+ }
+
+ void reverse_join( lambda_scan_body& a ) {
+ m_sum_slot = m_reverse_join(a.m_sum_slot, m_sum_slot);
+ }
+
+ void assign( lambda_scan_body& b ) {
+ m_sum_slot = b.m_sum_slot;
+ }
+
+ Value result() const {
+ return m_sum_slot;
+ }
+};
+
+// Requirements on Range concept are documented in blocked_range.h
+
+/** \page parallel_scan_body_req Requirements on parallel_scan body
+ Class \c Body implementing the concept of parallel_scan body must define:
+ - \code Body::Body( Body&, split ); \endcode Splitting constructor.
+ Split \c b so that \c this and \c b can accumulate separately
+ - \code Body::~Body(); \endcode Destructor
+ - \code void Body::operator()( const Range& r, pre_scan_tag ); \endcode
+ Preprocess iterations for range \c r
+ - \code void Body::operator()( const Range& r, final_scan_tag ); \endcode
+ Do final processing for iterations of range \c r
+ - \code void Body::reverse_join( Body& a ); \endcode
+ Merge preprocessing state of \c a into \c this, where \c a was
+ created earlier from \c b by b's splitting constructor
+    - \code void Body::assign( Body& b ); \endcode
+        Assign state of \c b to \c this (used to report the final result)
+**/
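+
+//! Illustrative Body sketch that satisfies the requirements above: a running-sum
+//! (prefix sum) body. The arrays x and y and the length n are placeholders.
+/** \code
+    struct PrefixSumBody {
+        int running_total = 0;
+        const int* x;
+        int* y;
+        PrefixSumBody( const int* x_, int* y_ ) : x(x_), y(y_) {}
+        PrefixSumBody( PrefixSumBody& b, tbb::split ) : x(b.x), y(b.y) {}
+        template<typename Tag>
+        void operator()( const tbb::blocked_range<std::size_t>& r, Tag ) {
+            int temp = running_total;
+            for( std::size_t i = r.begin(); i != r.end(); ++i ) {
+                temp += x[i];
+                if( Tag::is_final_scan() )
+                    y[i] = temp;                     // write prefix sums on the final pass only
+            }
+            running_total = temp;
+        }
+        void reverse_join( PrefixSumBody& a ) { running_total += a.running_total; }
+        void assign( PrefixSumBody& b ) { running_total = b.running_total; }
+    };
+
+    PrefixSumBody body(x, y);                        // x, y, n are placeholders
+    tbb::parallel_scan( tbb::blocked_range<std::size_t>(0, n), body );
+    \endcode
+**/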
+
+/** \name parallel_scan
+ See also requirements on \ref range_req "Range" and \ref parallel_scan_body_req "parallel_scan Body". **/
+//@{
+
+//! Parallel prefix with default partitioner
+/** @ingroup algorithms **/
+template<typename Range, typename Body>
+void parallel_scan( const Range& range, Body& body ) {
+ start_scan<Range, Body, auto_partitioner>::run(range,body,__TBB_DEFAULT_PARTITIONER());
+}
+
+//! Parallel prefix with simple_partitioner
+/** @ingroup algorithms **/
+template<typename Range, typename Body>
+void parallel_scan( const Range& range, Body& body, const simple_partitioner& partitioner ) {
+ start_scan<Range, Body, simple_partitioner>::run(range, body, partitioner);
+}
+
+//! Parallel prefix with auto_partitioner
+/** @ingroup algorithms **/
+template<typename Range, typename Body>
+void parallel_scan( const Range& range, Body& body, const auto_partitioner& partitioner ) {
+ start_scan<Range,Body,auto_partitioner>::run(range, body, partitioner);
+}
+
+//! Parallel prefix with default partitioner
+/** @ingroup algorithms **/
+template<typename Range, typename Value, typename Scan, typename ReverseJoin>
+Value parallel_scan( const Range& range, const Value& identity, const Scan& scan, const ReverseJoin& reverse_join ) {
+ lambda_scan_body<Range, Value, Scan, ReverseJoin> body(identity, scan, reverse_join);
+ parallel_scan(range, body, __TBB_DEFAULT_PARTITIONER());
+ return body.result();
+}
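+
+//! Illustrative usage sketch of the functional form declared above: a prefix sum that
+//! writes cumulative totals into y. The arrays x and y and the length n are placeholders.
+/** \code
+    int grand_total = tbb::parallel_scan(
+        tbb::blocked_range<std::size_t>(0, n),
+        0,                                                   // identity
+        [&](const tbb::blocked_range<std::size_t>& r, int sum, bool is_final) {
+            for( std::size_t i = r.begin(); i != r.end(); ++i ) {
+                sum += x[i];
+                if( is_final )
+                    y[i] = sum;
+            }
+            return sum;                                      // Scan functor
+        },
+        [](int left, int right) { return left + right; }     // ReverseJoin
+    );
+    \endcode
+**/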
+
+//! Parallel prefix with simple_partitioner
+/** @ingroup algorithms **/
+template<typename Range, typename Value, typename Scan, typename ReverseJoin>
+Value parallel_scan( const Range& range, const Value& identity, const Scan& scan, const ReverseJoin& reverse_join,
+ const simple_partitioner& partitioner ) {
+ lambda_scan_body<Range, Value, Scan, ReverseJoin> body(identity, scan, reverse_join);
+ parallel_scan(range, body, partitioner);
+ return body.result();
+}
+
+//! Parallel prefix with auto_partitioner
+/** @ingroup algorithms **/
+template<typename Range, typename Value, typename Scan, typename ReverseJoin>
+Value parallel_scan( const Range& range, const Value& identity, const Scan& scan, const ReverseJoin& reverse_join,
+ const auto_partitioner& partitioner ) {
+ lambda_scan_body<Range, Value, Scan, ReverseJoin> body(identity, scan, reverse_join);
+ parallel_scan(range, body, partitioner);
+ return body.result();
+}
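+
+/** A hedged usage sketch of the functional form above (the arrays \c x and \c y and the
+ length \c n are assumptions for the example): computing prefix sums with 0 as the identity
+ element and addition as both the scan and the reverse-join operation.
+ \code
+ float total = parallel_scan(
+     blocked_range<std::size_t>(0, n), 0.0f,
+     [&]( const blocked_range<std::size_t>& r, float sum, bool is_final_scan ) {
+         for( std::size_t i = r.begin(); i != r.end(); ++i ) {
+             sum += x[i];
+             if( is_final_scan ) y[i] = sum; // store prefix sums only on the final pass
+         }
+         return sum;
+     },
+     []( float left, float right ) { return left + right; } );
+ \endcode
+**/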
+
+} // namespace d1
+} // namespace detail
+
+inline namespace v1 {
+ using detail::d1::parallel_scan;
+ using detail::d1::pre_scan_tag;
+ using detail::d1::final_scan_tag;
+
+} // namespace v1
+
+} // namespace tbb
+
+#endif /* __TBB_parallel_scan_H */
+
diff --git a/contrib/libs/tbb/include/oneapi/tbb/parallel_sort.h b/contrib/libs/tbb/include/oneapi/tbb/parallel_sort.h
new file mode 100644
index 0000000000..0e7be5e25b
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/parallel_sort.h
@@ -0,0 +1,247 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_parallel_sort_H
+#define __TBB_parallel_sort_H
+
+#include "detail/_namespace_injection.h"
+#include "parallel_for.h"
+#include "blocked_range.h"
+#include "profiling.h"
+
+#include <algorithm>
+#include <iterator>
+#include <functional>
+#include <cstddef>
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+//! Range used in quicksort to split elements into subranges based on a value.
+/** The split operation selects a splitter and places all elements less than or equal
+ to the value in the first range and the remaining elements in the second range.
+ @ingroup algorithms */
+template<typename RandomAccessIterator, typename Compare>
+class quick_sort_range {
+ std::size_t median_of_three( const RandomAccessIterator& array, std::size_t l, std::size_t m, std::size_t r ) const {
+ return comp(array[l], array[m]) ? ( comp(array[m], array[r]) ? m : ( comp(array[l], array[r]) ? r : l ) )
+ : ( comp(array[r], array[m]) ? m : ( comp(array[r], array[l]) ? r : l ) );
+ }
+
+ std::size_t pseudo_median_of_nine( const RandomAccessIterator& array, const quick_sort_range& range ) const {
+ std::size_t offset = range.size / 8u;
+ return median_of_three(array,
+ median_of_three(array, 0 , offset, offset * 2),
+ median_of_three(array, offset * 3, offset * 4, offset * 5),
+ median_of_three(array, offset * 6, offset * 7, range.size - 1));
+
+ }
+
+ std::size_t split_range( quick_sort_range& range ) {
+ RandomAccessIterator array = range.begin;
+ RandomAccessIterator first_element = range.begin;
+ std::size_t m = pseudo_median_of_nine(array, range);
+ if( m != 0 ) std::iter_swap(array, array + m);
+
+ std::size_t i = 0;
+ std::size_t j = range.size;
+ // Partition interval [i + 1,j - 1] with key *first_element.
+ for(;;) {
+ __TBB_ASSERT( i < j, nullptr );
+ // Loop must terminate since array[l] == *first_element.
+ do {
+ --j;
+ __TBB_ASSERT( i <= j, "bad ordering relation?" );
+ } while( comp(*first_element, array[j]) );
+ do {
+ __TBB_ASSERT( i <= j, nullptr );
+ if( i == j ) goto partition;
+ ++i;
+ } while( comp(array[i], *first_element) );
+ if( i == j ) goto partition;
+ std::iter_swap(array + i, array + j);
+ }
+partition:
+ // Put the partition key where it belongs
+ std::iter_swap(array + j, first_element);
+ // array[l..j) is less or equal to key.
+ // array(j..r) is greater or equal to key.
+ // array[j] is equal to key
+ i = j + 1;
+ std::size_t new_range_size = range.size - i;
+ range.size = j;
+ return new_range_size;
+ }
+
+public:
+ quick_sort_range() = default;
+ quick_sort_range( const quick_sort_range& ) = default;
+ void operator=( const quick_sort_range& ) = delete;
+
+ static constexpr std::size_t grainsize = 500;
+ const Compare& comp;
+ std::size_t size;
+ RandomAccessIterator begin;
+
+ quick_sort_range( RandomAccessIterator begin_, std::size_t size_, const Compare& comp_ ) :
+ comp(comp_), size(size_), begin(begin_) {}
+
+ bool empty() const { return size == 0; }
+ bool is_divisible() const { return size >= grainsize; }
+
+ quick_sort_range( quick_sort_range& range, split )
+ : comp(range.comp)
+ , size(split_range(range))
+ // +1 accounts for the pivot element, which is at its correct place
+ // already and, therefore, is not included into subranges.
+ , begin(range.begin + range.size + 1) {}
+};
+
+//! Body class used to test if elements in a range are presorted
+/** @ingroup algorithms */
+template<typename RandomAccessIterator, typename Compare>
+class quick_sort_pretest_body {
+ const Compare& comp;
+ task_group_context& context;
+
+public:
+ quick_sort_pretest_body() = default;
+ quick_sort_pretest_body( const quick_sort_pretest_body& ) = default;
+ void operator=( const quick_sort_pretest_body& ) = delete;
+
+ quick_sort_pretest_body( const Compare& _comp, task_group_context& _context ) : comp(_comp), context(_context) {}
+
+ void operator()( const blocked_range<RandomAccessIterator>& range ) const {
+ RandomAccessIterator my_end = range.end();
+
+ int i = 0;
+ //TODO: consider using std::is_sorted() every 64 iterations (requires performance measurements)
+ for( RandomAccessIterator k = range.begin(); k != my_end; ++k, ++i ) {
+ if( i % 64 == 0 && context.is_group_execution_cancelled() ) break;
+
+ // The k - 1 is never out-of-range because the first chunk starts at begin+serial_cutoff+1
+ if( comp(*(k), *(k - 1)) ) {
+ context.cancel_group_execution();
+ break;
+ }
+ }
+ }
+};
+
+//! Body class used to sort elements in a range that is smaller than the grainsize.
+/** @ingroup algorithms */
+template<typename RandomAccessIterator, typename Compare>
+struct quick_sort_body {
+ void operator()( const quick_sort_range<RandomAccessIterator,Compare>& range ) const {
+ std::sort(range.begin, range.begin + range.size, range.comp);
+ }
+};
+
+//! Method to perform parallel_for based quick sort.
+/** @ingroup algorithms */
+template<typename RandomAccessIterator, typename Compare>
+void do_parallel_quick_sort( RandomAccessIterator begin, RandomAccessIterator end, const Compare& comp ) {
+ parallel_for(quick_sort_range<RandomAccessIterator,Compare>(begin, end - begin, comp),
+ quick_sort_body<RandomAccessIterator,Compare>(),
+ auto_partitioner());
+}
+
+//! Wrapper method to initiate the sort by calling parallel_for.
+/** @ingroup algorithms */
+template<typename RandomAccessIterator, typename Compare>
+void parallel_quick_sort( RandomAccessIterator begin, RandomAccessIterator end, const Compare& comp ) {
+ task_group_context my_context(PARALLEL_SORT);
+ constexpr int serial_cutoff = 9;
+
+ __TBB_ASSERT( begin + serial_cutoff < end, "min_parallel_size is smaller than serial cutoff?" );
+ RandomAccessIterator k = begin;
+ for( ; k != begin + serial_cutoff; ++k ) {
+ if( comp(*(k + 1), *k) ) {
+ do_parallel_quick_sort(begin, end, comp);
+ return;
+ }
+ }
+
+ // Check if the input range is already sorted
+ parallel_for(blocked_range<RandomAccessIterator>(k + 1, end),
+ quick_sort_pretest_body<RandomAccessIterator, Compare>(comp, my_context),
+ auto_partitioner(),
+ my_context);
+
+ if( my_context.is_group_execution_cancelled() )
+ do_parallel_quick_sort(begin, end, comp);
+}
+
+/** \page parallel_sort_iter_req Requirements on iterators for parallel_sort
+ Requirements on the iterator type \c It and its value type \c T for \c parallel_sort:
+
+ - \code void iter_swap( It a, It b ) \endcode Swaps the values of the elements that the given
+ iterators \c a and \c b point to. \c It should be a random access iterator.
+
+ - \code bool Compare::operator()( const T& x, const T& y ) \endcode Returns true if \c x comes before \c y.
+**/
+
+/** \name parallel_sort
+ See also requirements on \ref parallel_sort_iter_req "iterators for parallel_sort". **/
+//@{
+
+//! Sorts the data in [begin,end) using the given comparator
+/** The compare function object is used for all comparisons between elements during sorting.
+ The compare object must define a bool operator() function.
+ @ingroup algorithms **/
+template<typename RandomAccessIterator, typename Compare>
+void parallel_sort( RandomAccessIterator begin, RandomAccessIterator end, const Compare& comp ) {
+ constexpr int min_parallel_size = 500;
+ if( end > begin ) {
+ if( end - begin < min_parallel_size ) {
+ std::sort(begin, end, comp);
+ } else {
+ parallel_quick_sort(begin, end, comp);
+ }
+ }
+}
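+
+/** A brief usage sketch (the vector \c v is an assumption for the example): sorting in
+ descending order with a lambda comparator.
+ \code
+ std::vector<int> v = {3, 1, 2};
+ parallel_sort(v.begin(), v.end(), []( int a, int b ) { return a > b; });
+ \endcode
+**/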
+
+//! Sorts the data in [begin,end) with the default comparator \c std::less for the iterator's value type
+/** @ingroup algorithms **/
+template<typename RandomAccessIterator>
+void parallel_sort( RandomAccessIterator begin, RandomAccessIterator end ) {
+ parallel_sort(begin, end, std::less<typename std::iterator_traits<RandomAccessIterator>::value_type>());
+}
+
+//! Sorts the data in rng using the given comparator
+/** @ingroup algorithms **/
+template<typename Range, typename Compare>
+void parallel_sort( Range& rng, const Compare& comp ) {
+ parallel_sort(std::begin(rng), std::end(rng), comp);
+}
+
+//! Sorts the data in rng with the default comparator \c std::less for the range's value type
+/** @ingroup algorithms **/
+template<typename Range>
+void parallel_sort( Range& rng ) {
+ parallel_sort(std::begin(rng), std::end(rng));
+}
+//@}
+
+} // namespace d1
+} // namespace detail
+
+inline namespace v1 {
+ using detail::d1::parallel_sort;
+} // namespace v1
+} // namespace tbb
+
+#endif /*__TBB_parallel_sort_H*/
diff --git a/contrib/libs/tbb/include/oneapi/tbb/partitioner.h b/contrib/libs/tbb/include/oneapi/tbb/partitioner.h
new file mode 100644
index 0000000000..37ac0a09d9
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/partitioner.h
@@ -0,0 +1,688 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_partitioner_H
+#define __TBB_partitioner_H
+
+#ifndef __TBB_INITIAL_CHUNKS
+// initial task divisions per thread
+#define __TBB_INITIAL_CHUNKS 2
+#endif
+#ifndef __TBB_RANGE_POOL_CAPACITY
+// maximum number of elements in range pool
+#define __TBB_RANGE_POOL_CAPACITY 8
+#endif
+#ifndef __TBB_INIT_DEPTH
+// initial value for depth of range pool
+#define __TBB_INIT_DEPTH 5
+#endif
+#ifndef __TBB_DEMAND_DEPTH_ADD
+// when imbalance is detected, the range is split this many more times
+#define __TBB_DEMAND_DEPTH_ADD 1
+#endif
+
+#include "detail/_config.h"
+#include "detail/_namespace_injection.h"
+#include "detail/_aligned_space.h"
+#include "detail/_utils.h"
+#include "detail/_template_helpers.h"
+#include "detail/_range_common.h"
+#include "detail/_task.h"
+#include "detail/_small_object_pool.h"
+
+#include "cache_aligned_allocator.h"
+#include "task_group.h" // task_group_context
+#include "task_arena.h"
+
+#include <algorithm>
+#include <atomic>
+#include <type_traits>
+
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+ // Workaround for overzealous compiler warnings
+ #pragma warning (push)
+ #pragma warning (disable: 4244)
+#endif
+
+namespace tbb {
+namespace detail {
+
+namespace d1 {
+class auto_partitioner;
+class simple_partitioner;
+class static_partitioner;
+class affinity_partitioner;
+class affinity_partition_type;
+class affinity_partitioner_base;
+
+inline std::size_t get_initial_auto_partitioner_divisor() {
+ const std::size_t factor = 4;
+ return factor * max_concurrency();
+}
+
+//! Defines entry point for affinity partitioner into oneTBB run-time library.
+class affinity_partitioner_base: no_copy {
+ friend class affinity_partitioner;
+ friend class affinity_partition_type;
+ //! Array that remembers affinities of tree positions to affinity_id.
+ /** NULL if my_size==0. */
+ slot_id* my_array;
+ //! Number of elements in my_array.
+ std::size_t my_size;
+ //! Zeros the fields.
+ affinity_partitioner_base() : my_array(nullptr), my_size(0) {}
+ //! Deallocates my_array.
+ ~affinity_partitioner_base() { resize(0); }
+ //! Resize my_array.
+ /** Retains values if resulting size is the same. */
+ void resize(unsigned factor) {
+ // Check factor to avoid asking for number of workers while there might be no arena.
+ unsigned max_threads_in_arena = max_concurrency();
+ std::size_t new_size = factor ? factor * max_threads_in_arena : 0;
+ if (new_size != my_size) {
+ if (my_array) {
+ r1::cache_aligned_deallocate(my_array);
+ // Following two assignments must be done here for sake of exception safety.
+ my_array = nullptr;
+ my_size = 0;
+ }
+ if (new_size) {
+ my_array = static_cast<slot_id*>(r1::cache_aligned_allocate(new_size * sizeof(slot_id)));
+ std::fill_n(my_array, new_size, no_slot);
+ my_size = new_size;
+ }
+ }
+ }
+};
+
+template<typename Range, typename Body, typename Partitioner> struct start_for;
+template<typename Range, typename Body, typename Partitioner> struct start_scan;
+template<typename Range, typename Body, typename Partitioner> struct start_reduce;
+template<typename Range, typename Body, typename Partitioner> struct start_deterministic_reduce;
+
+struct node {
+ node* my_parent{};
+ std::atomic<int> m_ref_count{};
+
+ node() = default;
+ node(node* parent, int ref_count) :
+ my_parent{parent}, m_ref_count{ref_count} {
+ __TBB_ASSERT(ref_count > 0, "The ref count must be positive");
+ }
+};
+
+struct wait_node : node {
+ wait_node() : node{ nullptr, 1 } {}
+ wait_context m_wait{1};
+};
+
+//! Join task node that contains shared flag for stealing feedback
+struct tree_node : public node {
+ small_object_allocator m_allocator;
+ std::atomic<bool> m_child_stolen{false};
+
+ tree_node(node* parent, int ref_count, small_object_allocator& alloc)
+ : node{parent, ref_count}
+ , m_allocator{alloc} {}
+
+ void join(task_group_context*) {/*dummy, required only for reduction algorithms*/}
+
+ template <typename Task>
+ static void mark_task_stolen(Task &t) {
+ std::atomic<bool> &flag = static_cast<tree_node*>(t.my_parent)->m_child_stolen;
+#if TBB_USE_PROFILING_TOOLS
+ // Threading tools respect lock prefix but report false-positive data-race via plain store
+ flag.exchange(true);
+#else
+ flag.store(true, std::memory_order_relaxed);
+#endif // TBB_USE_PROFILING_TOOLS
+ }
+ template <typename Task>
+ static bool is_peer_stolen(Task &t) {
+ return static_cast<tree_node*>(t.my_parent)->m_child_stolen.load(std::memory_order_relaxed);
+ }
+};
+
+// Context used to check cancellation state during reduction join process
+template<typename TreeNodeType>
+void fold_tree(node* n, const execution_data& ed) {
+ for (;;) {
+ __TBB_ASSERT(n->m_ref_count.load(std::memory_order_relaxed) > 0, "The refcount must be positive.");
+ call_itt_task_notify(releasing, n);
+ if (--n->m_ref_count > 0) {
+ return;
+ }
+ node* parent = n->my_parent;
+ if (!parent) {
+ break;
+ }
+
+ call_itt_task_notify(acquired, n);
+ TreeNodeType* self = static_cast<TreeNodeType*>(n);
+ self->join(ed.context);
+ self->m_allocator.delete_object(self, ed);
+ n = parent;
+ }
+ // Finish parallel for execution when the root (last node) is reached
+ static_cast<wait_node*>(n)->m_wait.release();
+}
+
+//! Depth is the relative depth of recursive division inside a range pool. Relative depth allows
+//! unbounded absolute recursion depth for heavily unbalanced workloads whose range is represented
+//! by a number that cannot fit into a machine word.
+typedef unsigned char depth_t;
+
+//! Range pool stores ranges of type T in a circular buffer with MaxCapacity
+template <typename T, depth_t MaxCapacity>
+class range_vector {
+ depth_t my_head;
+ depth_t my_tail;
+ depth_t my_size;
+ depth_t my_depth[MaxCapacity]; // relative depths of stored ranges
+ tbb::detail::aligned_space<T, MaxCapacity> my_pool;
+
+public:
+ //! initialize via first range in pool
+ range_vector(const T& elem) : my_head(0), my_tail(0), my_size(1) {
+ my_depth[0] = 0;
+ new( static_cast<void *>(my_pool.begin()) ) T(elem);//TODO: std::move?
+ }
+ ~range_vector() {
+ while( !empty() ) pop_back();
+ }
+ bool empty() const { return my_size == 0; }
+ depth_t size() const { return my_size; }
+ //! Populates the range pool with ranges up to the maximum depth or while they are divisible
+ //! max_depth starts from 0, e.g. a value of 2 makes 3 ranges in the pool, including up to two 1/4 pieces
+ void split_to_fill(depth_t max_depth) {
+ while( my_size < MaxCapacity && is_divisible(max_depth) ) {
+ depth_t prev = my_head;
+ my_head = (my_head + 1) % MaxCapacity;
+ new(my_pool.begin()+my_head) T(my_pool.begin()[prev]); // copy TODO: std::move?
+ my_pool.begin()[prev].~T(); // instead of assignment
+ new(my_pool.begin()+prev) T(my_pool.begin()[my_head], detail::split()); // do 'inverse' split
+ my_depth[my_head] = ++my_depth[prev];
+ my_size++;
+ }
+ }
+ void pop_back() {
+ __TBB_ASSERT(my_size > 0, "range_vector::pop_back() with empty size");
+ my_pool.begin()[my_head].~T();
+ my_size--;
+ my_head = (my_head + MaxCapacity - 1) % MaxCapacity;
+ }
+ void pop_front() {
+ __TBB_ASSERT(my_size > 0, "range_vector::pop_front() with empty size");
+ my_pool.begin()[my_tail].~T();
+ my_size--;
+ my_tail = (my_tail + 1) % MaxCapacity;
+ }
+ T& back() {
+ __TBB_ASSERT(my_size > 0, "range_vector::back() with empty size");
+ return my_pool.begin()[my_head];
+ }
+ T& front() {
+ __TBB_ASSERT(my_size > 0, "range_vector::front() with empty size");
+ return my_pool.begin()[my_tail];
+ }
+ //! similarly to front(), returns depth of the first range in the pool
+ depth_t front_depth() {
+ __TBB_ASSERT(my_size > 0, "range_vector::front_depth() with empty size");
+ return my_depth[my_tail];
+ }
+ depth_t back_depth() {
+ __TBB_ASSERT(my_size > 0, "range_vector::back_depth() with empty size");
+ return my_depth[my_head];
+ }
+ bool is_divisible(depth_t max_depth) {
+ return back_depth() < max_depth && back().is_divisible();
+ }
+};
+
+//! Provides default methods for partition objects and common algorithm blocks.
+template <typename Partition>
+struct partition_type_base {
+ typedef detail::split split_type;
+ // decision makers
+ void note_affinity( slot_id ) {}
+ template <typename Task>
+ bool check_being_stolen(Task&, const execution_data&) { return false; } // part of old should_execute_range()
+ template <typename Range> split_type get_split() { return split(); }
+ Partition& self() { return *static_cast<Partition*>(this); } // CRTP helper
+
+ template<typename StartType, typename Range>
+ void work_balance(StartType &start, Range &range, const execution_data&) {
+ start.run_body( range ); // the simple partitioner always goes here
+ }
+
+ template<typename StartType, typename Range>
+ void execute(StartType &start, Range &range, execution_data& ed) {
+ // The algorithm in a few words ([]-denotes calls to decision methods of partitioner):
+ // [If this task is stolen, adjust depth and divisions if necessary, set flag].
+ // If range is divisible {
+ // Spread the work while [initial divisions left];
+ // Create trap task [if necessary];
+ // }
+ // If not divisible or [max depth is reached], execute, else do the range pool part
+ if ( range.is_divisible() ) {
+ if ( self().is_divisible() ) {
+ do { // split while the range is divisible
+ typename Partition::split_type split_obj = self().template get_split<Range>();
+ start.offer_work( split_obj, ed );
+ } while ( range.is_divisible() && self().is_divisible() );
+ }
+ }
+ self().work_balance(start, range, ed);
+ }
+};
+
+//! Provides default splitting strategy for partition objects.
+template <typename Partition>
+struct adaptive_mode : partition_type_base<Partition> {
+ typedef Partition my_partition;
+ std::size_t my_divisor;
+ // For affinity_partitioner, my_divisor indicates the number of affinity array indices the task reserves.
+ // A task that has only one index must produce the right split without a reserved index, to prevent
+ // that index from being overwritten in note_affinity() of the created (right) task.
+ // I.e. a task created deeper than the affinity array can remember must not save its affinity (LIFO order).
+ static const unsigned factor = 1;
+ adaptive_mode() : my_divisor(get_initial_auto_partitioner_divisor() / 4 * my_partition::factor) {}
+ adaptive_mode(adaptive_mode &src, split) : my_divisor(do_split(src, split())) {}
+ /*! Override do_split methods in order to specify splitting strategy */
+ std::size_t do_split(adaptive_mode &src, split) {
+ return src.my_divisor /= 2u;
+ }
+};
+
+//! Helper type for checking availability of proportional_split constructor
+template <typename T> using supports_proportional_splitting = typename std::is_constructible<T, T&, proportional_split&>;
+
+//! A helper class to create a proportional_split object for a given type of Range.
+/** If the Range has a proportional_split constructor,
+ then the created object splits a provided value in an implementation-defined proportion;
+ otherwise it represents an equal-size split. */
+// TODO: check if this helper can be a nested class of proportional_mode.
+template <typename Range, typename = void>
+struct proportion_helper {
+ static proportional_split get_split(std::size_t) { return proportional_split(1,1); }
+};
+
+template <typename Range>
+struct proportion_helper<Range, typename std::enable_if<supports_proportional_splitting<Range>::value>::type> {
+ static proportional_split get_split(std::size_t n) {
+ std::size_t right = n / 2;
+ std::size_t left = n - right;
+ return proportional_split(left, right);
+ }
+};
+
+//! Provides proportional splitting strategy for partition objects
+template <typename Partition>
+struct proportional_mode : adaptive_mode<Partition> {
+ typedef Partition my_partition;
+ using partition_type_base<Partition>::self; // CRTP helper to get access to derived classes
+
+ proportional_mode() : adaptive_mode<Partition>() {}
+ proportional_mode(proportional_mode &src, split) : adaptive_mode<Partition>(src, split()) {}
+ proportional_mode(proportional_mode &src, const proportional_split& split_obj) { self().my_divisor = do_split(src, split_obj); }
+ std::size_t do_split(proportional_mode &src, const proportional_split& split_obj) {
+ std::size_t portion = split_obj.right() * my_partition::factor;
+ portion = (portion + my_partition::factor/2) & (0ul - my_partition::factor);
+ src.my_divisor -= portion;
+ return portion;
+ }
+ bool is_divisible() { // part of old should_execute_range()
+ return self().my_divisor > my_partition::factor;
+ }
+ template <typename Range>
+ proportional_split get_split() {
+ // Create a proportion for the number of threads expected to handle "this" subrange
+ return proportion_helper<Range>::get_split( self().my_divisor / my_partition::factor );
+ }
+};
+
+static std::size_t get_initial_partition_head() {
+ int current_index = tbb::this_task_arena::current_thread_index();
+ if (current_index == tbb::task_arena::not_initialized)
+ current_index = 0;
+ return size_t(current_index);
+}
+
+//! Provides default linear indexing of partitioner's sequence
+template <typename Partition>
+struct linear_affinity_mode : proportional_mode<Partition> {
+ std::size_t my_head;
+ std::size_t my_max_affinity;
+ using proportional_mode<Partition>::self;
+ linear_affinity_mode() : proportional_mode<Partition>(), my_head(get_initial_partition_head()),
+ my_max_affinity(self().my_divisor) {}
+ linear_affinity_mode(linear_affinity_mode &src, split) : proportional_mode<Partition>(src, split())
+ , my_head((src.my_head + src.my_divisor) % src.my_max_affinity), my_max_affinity(src.my_max_affinity) {}
+ linear_affinity_mode(linear_affinity_mode &src, const proportional_split& split_obj) : proportional_mode<Partition>(src, split_obj)
+ , my_head((src.my_head + src.my_divisor) % src.my_max_affinity), my_max_affinity(src.my_max_affinity) {}
+ void spawn_task(task& t, task_group_context& ctx) {
+ if (self().my_divisor) {
+ spawn(t, ctx, slot_id(my_head));
+ } else {
+ spawn(t, ctx);
+ }
+ }
+};
+
+static bool is_stolen_task(const execution_data& ed) {
+ return execution_slot(ed) != original_slot(ed);
+}
+
+/*! Determine work-balance phase implementing splitting & stealing actions */
+template<class Mode>
+struct dynamic_grainsize_mode : Mode {
+ using Mode::self;
+ enum {
+ begin = 0,
+ run,
+ pass
+ } my_delay;
+ depth_t my_max_depth;
+ static const unsigned range_pool_size = __TBB_RANGE_POOL_CAPACITY;
+ dynamic_grainsize_mode(): Mode()
+ , my_delay(begin)
+ , my_max_depth(__TBB_INIT_DEPTH) {}
+ dynamic_grainsize_mode(dynamic_grainsize_mode& p, split)
+ : Mode(p, split())
+ , my_delay(pass)
+ , my_max_depth(p.my_max_depth) {}
+ dynamic_grainsize_mode(dynamic_grainsize_mode& p, const proportional_split& split_obj)
+ : Mode(p, split_obj)
+ , my_delay(begin)
+ , my_max_depth(p.my_max_depth) {}
+ template <typename Task>
+ bool check_being_stolen(Task &t, const execution_data& ed) { // part of old should_execute_range()
+ if( !(self().my_divisor / Mode::my_partition::factor) ) { // if not from the top P tasks of binary tree
+ self().my_divisor = 1; // TODO: replace by on-stack flag (partition_state's member)?
+ if( is_stolen_task(ed) && t.my_parent->m_ref_count >= 2 ) { // runs concurrently with the left task
+#if __TBB_USE_OPTIONAL_RTTI
+ // RTTI is available, check whether the cast is valid
+ // TODO: TBB_REVAMP_TODO __TBB_ASSERT(dynamic_cast<tree_node*>(t.m_parent), 0);
+ // correctness of the cast relies on avoiding the root task for which:
+ // - initial value of my_divisor != 0 (protected by separate assertion)
+ // - is_stolen_task() always returns false for the root task.
+#endif
+ tree_node::mark_task_stolen(t);
+ if( !my_max_depth ) my_max_depth++;
+ my_max_depth += __TBB_DEMAND_DEPTH_ADD;
+ return true;
+ }
+ }
+ return false;
+ }
+ depth_t max_depth() { return my_max_depth; }
+ void align_depth(depth_t base) {
+ __TBB_ASSERT(base <= my_max_depth, 0);
+ my_max_depth -= base;
+ }
+ template<typename StartType, typename Range>
+ void work_balance(StartType &start, Range &range, execution_data& ed) {
+ if( !range.is_divisible() || !self().max_depth() ) {
+ start.run_body( range ); // the simple partitioner always goes here
+ }
+ else { // do range pool
+ range_vector<Range, range_pool_size> range_pool(range);
+ do {
+ range_pool.split_to_fill(self().max_depth()); // fill range pool
+ if( self().check_for_demand( start ) ) {
+ if( range_pool.size() > 1 ) {
+ start.offer_work( range_pool.front(), range_pool.front_depth(), ed );
+ range_pool.pop_front();
+ continue;
+ }
+ if( range_pool.is_divisible(self().max_depth()) ) // was not enough depth to fork a task
+ continue; // note: next split_to_fill() should split range at least once
+ }
+ start.run_body( range_pool.back() );
+ range_pool.pop_back();
+ } while( !range_pool.empty() && !ed.context->is_group_execution_cancelled() );
+ }
+ }
+ template <typename Task>
+ bool check_for_demand(Task& t) {
+ if ( pass == my_delay ) {
+ if ( self().my_divisor > 1 ) // produce affinitized tasks while they have a slot in the array
+ return true; // do not do my_max_depth++ here, but be sure range_pool is splittable once more
+ else if ( self().my_divisor && my_max_depth ) { // make balancing task
+ self().my_divisor = 0; // once for each task; depth will be decreased in align_depth()
+ return true;
+ }
+ else if ( tree_node::is_peer_stolen(t) ) {
+ my_max_depth += __TBB_DEMAND_DEPTH_ADD;
+ return true;
+ }
+ } else if( begin == my_delay ) {
+ my_delay = pass;
+ }
+ return false;
+ }
+};
+
+class auto_partition_type: public dynamic_grainsize_mode<adaptive_mode<auto_partition_type> > {
+public:
+ auto_partition_type( const auto_partitioner& )
+ : dynamic_grainsize_mode<adaptive_mode<auto_partition_type> >() {
+ my_divisor *= __TBB_INITIAL_CHUNKS;
+ }
+ auto_partition_type( auto_partition_type& src, split)
+ : dynamic_grainsize_mode<adaptive_mode<auto_partition_type> >(src, split()) {}
+ bool is_divisible() { // part of old should_execute_range()
+ if( my_divisor > 1 ) return true;
+ if( my_divisor && my_max_depth ) { // can split the task. TODO: on-stack flag instead
+ // keep same fragmentation while splitting for the local task pool
+ my_max_depth--;
+ my_divisor = 0; // decrease max_depth once per task
+ return true;
+ } else return false;
+ }
+ template <typename Task>
+ bool check_for_demand(Task& t) {
+ if (tree_node::is_peer_stolen(t)) {
+ my_max_depth += __TBB_DEMAND_DEPTH_ADD;
+ return true;
+ } else return false;
+ }
+ void spawn_task(task& t, task_group_context& ctx) {
+ spawn(t, ctx);
+ }
+};
+
+class simple_partition_type: public partition_type_base<simple_partition_type> {
+public:
+ simple_partition_type( const simple_partitioner& ) {}
+ simple_partition_type( const simple_partition_type&, split ) {}
+ //! simplified algorithm
+ template<typename StartType, typename Range>
+ void execute(StartType &start, Range &range, execution_data& ed) {
+ split_type split_obj = split(); // start.offer_work accepts split_type as reference
+ while( range.is_divisible() )
+ start.offer_work( split_obj, ed );
+ start.run_body( range );
+ }
+ void spawn_task(task& t, task_group_context& ctx) {
+ spawn(t, ctx);
+ }
+};
+
+class static_partition_type : public linear_affinity_mode<static_partition_type> {
+public:
+ typedef detail::proportional_split split_type;
+ static_partition_type( const static_partitioner& )
+ : linear_affinity_mode<static_partition_type>() {}
+ static_partition_type( static_partition_type& p, const proportional_split& split_obj )
+ : linear_affinity_mode<static_partition_type>(p, split_obj) {}
+};
+
+class affinity_partition_type : public dynamic_grainsize_mode<linear_affinity_mode<affinity_partition_type> > {
+ static const unsigned factor_power = 4; // TODO: get a unified formula based on number of computing units
+ slot_id* my_array;
+public:
+ static const unsigned factor = 1 << factor_power; // number of slots in affinity array per task
+ typedef detail::proportional_split split_type;
+ affinity_partition_type( affinity_partitioner_base& ap )
+ : dynamic_grainsize_mode<linear_affinity_mode<affinity_partition_type> >() {
+ __TBB_ASSERT( (factor&(factor-1))==0, "factor must be power of two" );
+ ap.resize(factor);
+ my_array = ap.my_array;
+ my_max_depth = factor_power + 1;
+ __TBB_ASSERT( my_max_depth < __TBB_RANGE_POOL_CAPACITY, 0 );
+ }
+ affinity_partition_type(affinity_partition_type& p, split)
+ : dynamic_grainsize_mode<linear_affinity_mode<affinity_partition_type> >(p, split())
+ , my_array(p.my_array) {}
+ affinity_partition_type(affinity_partition_type& p, const proportional_split& split_obj)
+ : dynamic_grainsize_mode<linear_affinity_mode<affinity_partition_type> >(p, split_obj)
+ , my_array(p.my_array) {}
+ void note_affinity(slot_id id) {
+ if( my_divisor )
+ my_array[my_head] = id;
+ }
+ void spawn_task(task& t, task_group_context& ctx) {
+ if (my_divisor) {
+ if (!my_array[my_head]) {
+ // TODO: consider new ideas with my_array for both affinity and static partitioner's, then code reuse
+ spawn(t, ctx, slot_id(my_head / factor));
+ } else {
+ spawn(t, ctx, my_array[my_head]);
+ }
+ } else {
+ spawn(t, ctx);
+ }
+ }
+};
+
+//! A simple partitioner
+/** Divides the range until it is no longer divisible.
+ @ingroup algorithms */
+class simple_partitioner {
+public:
+ simple_partitioner() {}
+private:
+ template<typename Range, typename Body, typename Partitioner> friend struct start_for;
+ template<typename Range, typename Body, typename Partitioner> friend struct start_reduce;
+ template<typename Range, typename Body, typename Partitioner> friend struct start_deterministic_reduce;
+ template<typename Range, typename Body, typename Partitioner> friend struct start_scan;
+ // new implementation just extends existing interface
+ typedef simple_partition_type task_partition_type;
+ // TODO: consider to make split_type public
+ typedef simple_partition_type::split_type split_type;
+
+ // for parallel_scan only
+ class partition_type {
+ public:
+ bool should_execute_range(const execution_data& ) {return false;}
+ partition_type( const simple_partitioner& ) {}
+ partition_type( const partition_type&, split ) {}
+ };
+};
+
+//! An auto partitioner
+/** The range is initially divided into several large chunks.
+ Chunks are further subdivided into smaller pieces if demand is detected and they are divisible.
+ @ingroup algorithms */
+class auto_partitioner {
+public:
+ auto_partitioner() {}
+
+private:
+ template<typename Range, typename Body, typename Partitioner> friend struct start_for;
+ template<typename Range, typename Body, typename Partitioner> friend struct start_reduce;
+ template<typename Range, typename Body, typename Partitioner> friend struct start_deterministic_reduce;
+ template<typename Range, typename Body, typename Partitioner> friend struct start_scan;
+ // new implementation just extends existing interface
+ typedef auto_partition_type task_partition_type;
+ // TODO: consider to make split_type public
+ typedef auto_partition_type::split_type split_type;
+
+ //! Backward-compatible partition for auto and affinity partition objects.
+ class partition_type {
+ size_t num_chunks;
+ static const size_t VICTIM_CHUNKS = 4;
+ public:
+ bool should_execute_range(const execution_data& ed) {
+ if( num_chunks<VICTIM_CHUNKS && is_stolen_task(ed) )
+ num_chunks = VICTIM_CHUNKS;
+ return num_chunks==1;
+ }
+ partition_type( const auto_partitioner& )
+ : num_chunks(get_initial_auto_partitioner_divisor()*__TBB_INITIAL_CHUNKS/4) {}
+ partition_type( partition_type& pt, split ) {
+ num_chunks = pt.num_chunks = (pt.num_chunks+1u) / 2u;
+ }
+ };
+};
+
+//! A static partitioner
+class static_partitioner {
+public:
+ static_partitioner() {}
+private:
+ template<typename Range, typename Body, typename Partitioner> friend struct start_for;
+ template<typename Range, typename Body, typename Partitioner> friend struct start_reduce;
+ template<typename Range, typename Body, typename Partitioner> friend struct start_deterministic_reduce;
+ template<typename Range, typename Body, typename Partitioner> friend struct start_scan;
+ // new implementation just extends existing interface
+ typedef static_partition_type task_partition_type;
+ // TODO: consider to make split_type public
+ typedef static_partition_type::split_type split_type;
+};
+
+//! An affinity partitioner
+class affinity_partitioner : affinity_partitioner_base {
+public:
+ affinity_partitioner() {}
+
+private:
+ template<typename Range, typename Body, typename Partitioner> friend struct start_for;
+ template<typename Range, typename Body, typename Partitioner> friend struct start_reduce;
+ template<typename Range, typename Body, typename Partitioner> friend struct start_deterministic_reduce;
+ template<typename Range, typename Body, typename Partitioner> friend struct start_scan;
+ // new implementation just extends existing interface
+ typedef affinity_partition_type task_partition_type;
+ // TODO: consider to make split_type public
+ typedef affinity_partition_type::split_type split_type;
+};
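+
+/** An illustrative sketch (not part of this header) of how the partitioners above are passed
+ to a parallel algorithm; \c n and \c body are assumptions for the example. Note that an
+ affinity_partitioner is stateful and must outlive the calls so that recorded affinities can
+ be replayed on repeated invocations over the same data.
+ \code
+ affinity_partitioner ap; // keep alive across repeated calls
+ parallel_for(blocked_range<int>(0, n), body, ap);
+ parallel_for(blocked_range<int>(0, n), body, auto_partitioner());
+ parallel_for(blocked_range<int>(0, n), body, simple_partitioner());
+ parallel_for(blocked_range<int>(0, n), body, static_partitioner());
+ \endcode
+**/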
+
+} // namespace d1
+} // namespace detail
+
+inline namespace v1 {
+// Partitioners
+using detail::d1::auto_partitioner;
+using detail::d1::simple_partitioner;
+using detail::d1::static_partitioner;
+using detail::d1::affinity_partitioner;
+// Split types
+using detail::split;
+using detail::proportional_split;
+} // namespace v1
+
+} // namespace tbb
+
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+ #pragma warning (pop)
+#endif // warning 4244 is back
+
+#undef __TBB_INITIAL_CHUNKS
+#undef __TBB_RANGE_POOL_CAPACITY
+#undef __TBB_INIT_DEPTH
+
+#endif /* __TBB_partitioner_H */
diff --git a/contrib/libs/tbb/include/oneapi/tbb/profiling.h b/contrib/libs/tbb/include/oneapi/tbb/profiling.h
new file mode 100644
index 0000000000..4b62da2060
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/profiling.h
@@ -0,0 +1,243 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_profiling_H
+#define __TBB_profiling_H
+
+#include "detail/_config.h"
+#include <cstdint>
+
+#include <string>
+
+namespace tbb {
+namespace detail {
+inline namespace d0 {
+ // include list of index names
+ #define TBB_STRING_RESOURCE(index_name,str) index_name,
+ enum string_resource_index : std::uintptr_t {
+ #include "detail/_string_resource.h"
+ NUM_STRINGS
+ };
+ #undef TBB_STRING_RESOURCE
+
+ enum itt_relation
+ {
+ __itt_relation_is_unknown = 0,
+ __itt_relation_is_dependent_on, /**< "A is dependent on B" means that A cannot start until B completes */
+ __itt_relation_is_sibling_of, /**< "A is sibling of B" means that A and B were created as a group */
+ __itt_relation_is_parent_of, /**< "A is parent of B" means that A created B */
+ __itt_relation_is_continuation_of, /**< "A is continuation of B" means that A assumes the dependencies of B */
+ __itt_relation_is_child_of, /**< "A is child of B" means that A was created by B (inverse of is_parent_of) */
+ __itt_relation_is_continued_by, /**< "A is continued by B" means that B assumes the dependencies of A (inverse of is_continuation_of) */
+ __itt_relation_is_predecessor_to /**< "A is predecessor to B" means that B cannot start until A completes (inverse of is_dependent_on) */
+ };
+
+//! Unicode support
+#if (_WIN32||_WIN64) && !__MINGW32__
+ //! Unicode character type. Always wchar_t on Windows.
+ using tchar = wchar_t;
+#else /* !WIN */
+ using tchar = char;
+#endif /* !WIN */
+
+} // namespace d0
+} // namespace detail
+} // namespace tbb
+
+#include <atomic>
+#if _WIN32||_WIN64
+#include <stdlib.h> /* mbstowcs_s */
+#endif
+// Need these to work regardless of tools support
+namespace tbb {
+namespace detail {
+namespace d1 {
+ enum notify_type {prepare=0, cancel, acquired, releasing, destroy};
+ enum itt_domain_enum { ITT_DOMAIN_FLOW=0, ITT_DOMAIN_MAIN=1, ITT_DOMAIN_ALGO=2, ITT_NUM_DOMAINS };
+} // namespace d1
+
+namespace r1 {
+ void __TBB_EXPORTED_FUNC call_itt_notify(int t, void* ptr);
+ void __TBB_EXPORTED_FUNC create_itt_sync(void* ptr, const tchar* objtype, const tchar* objname);
+ void __TBB_EXPORTED_FUNC itt_make_task_group(d1::itt_domain_enum domain, void* group, unsigned long long group_extra,
+ void* parent, unsigned long long parent_extra, string_resource_index name_index);
+ void __TBB_EXPORTED_FUNC itt_task_begin(d1::itt_domain_enum domain, void* task, unsigned long long task_extra,
+ void* parent, unsigned long long parent_extra, string_resource_index name_index);
+ void __TBB_EXPORTED_FUNC itt_task_end(d1::itt_domain_enum domain);
+ void __TBB_EXPORTED_FUNC itt_set_sync_name(void* obj, const tchar* name);
+ void __TBB_EXPORTED_FUNC itt_metadata_str_add(d1::itt_domain_enum domain, void* addr, unsigned long long addr_extra,
+ string_resource_index key, const char* value);
+ void __TBB_EXPORTED_FUNC itt_metadata_ptr_add(d1::itt_domain_enum domain, void* addr, unsigned long long addr_extra,
+ string_resource_index key, void* value);
+ void __TBB_EXPORTED_FUNC itt_relation_add(d1::itt_domain_enum domain, void* addr0, unsigned long long addr0_extra,
+ itt_relation relation, void* addr1, unsigned long long addr1_extra);
+ void __TBB_EXPORTED_FUNC itt_region_begin(d1::itt_domain_enum domain, void* region, unsigned long long region_extra,
+ void* parent, unsigned long long parent_extra, string_resource_index /* name_index */);
+ void __TBB_EXPORTED_FUNC itt_region_end(d1::itt_domain_enum domain, void* region, unsigned long long region_extra);
+} // namespace r1
+
+namespace d1 {
+#if TBB_USE_PROFILING_TOOLS && (_WIN32||_WIN64) && !__MINGW32__
+ inline std::size_t multibyte_to_widechar(wchar_t* wcs, const char* mbs, std::size_t bufsize) {
+ std::size_t len;
+ mbstowcs_s(&len, wcs, bufsize, mbs, _TRUNCATE);
+ return len; // mbstowcs_s counts null terminator
+ }
+#endif
+
+#if TBB_USE_PROFILING_TOOLS
+ inline void create_itt_sync(void *ptr, const char *objtype, const char *objname) {
+#if (_WIN32||_WIN64) && !__MINGW32__
+ std::size_t len_type = multibyte_to_widechar(nullptr, objtype, 0);
+ wchar_t *type = new wchar_t[len_type];
+ multibyte_to_widechar(type, objtype, len_type);
+ std::size_t len_name = multibyte_to_widechar(nullptr, objname, 0);
+ wchar_t *name = new wchar_t[len_name];
+ multibyte_to_widechar(name, objname, len_name);
+#else // WIN
+ const char *type = objtype;
+ const char *name = objname;
+#endif
+ r1::create_itt_sync(ptr, type, name);
+
+#if (_WIN32||_WIN64) && !__MINGW32__
+ delete[] type;
+ delete[] name;
+#endif // WIN
+ }
+
+// Distinguish notifications on task for reducing overheads
+#if TBB_USE_PROFILING_TOOLS == 2
+ inline void call_itt_task_notify(d1::notify_type t, void *ptr) {
+ r1::call_itt_notify((int)t, ptr);
+ }
+#else
+ inline void call_itt_task_notify(d1::notify_type, void *) {}
+#endif // TBB_USE_PROFILING_TOOLS
+
+ inline void call_itt_notify(d1::notify_type t, void *ptr) {
+ r1::call_itt_notify((int)t, ptr);
+ }
+
+#if (_WIN32||_WIN64) && !__MINGW32__
+ inline void itt_set_sync_name(void* obj, const wchar_t* name) {
+ r1::itt_set_sync_name(obj, name);
+ }
+ inline void itt_set_sync_name(void* obj, const char* name) {
+ std::size_t len_name = multibyte_to_widechar(nullptr, name, 0);
+ wchar_t *obj_name = new wchar_t[len_name];
+ multibyte_to_widechar(obj_name, name, len_name);
+ r1::itt_set_sync_name(obj, obj_name);
+ delete[] obj_name;
+ }
+#else
+ inline void itt_set_sync_name( void* obj, const char* name) {
+ r1::itt_set_sync_name(obj, name);
+ }
+#endif //WIN
+
+ inline void itt_make_task_group(itt_domain_enum domain, void* group, unsigned long long group_extra,
+ void* parent, unsigned long long parent_extra, string_resource_index name_index) {
+ r1::itt_make_task_group(domain, group, group_extra, parent, parent_extra, name_index);
+ }
+
+ inline void itt_metadata_str_add( itt_domain_enum domain, void *addr, unsigned long long addr_extra,
+ string_resource_index key, const char *value ) {
+ r1::itt_metadata_str_add( domain, addr, addr_extra, key, value );
+ }
+
+ inline void register_node_addr(itt_domain_enum domain, void *addr, unsigned long long addr_extra,
+ string_resource_index key, void *value) {
+ r1::itt_metadata_ptr_add(domain, addr, addr_extra, key, value);
+ }
+
+ inline void itt_relation_add( itt_domain_enum domain, void *addr0, unsigned long long addr0_extra,
+ itt_relation relation, void *addr1, unsigned long long addr1_extra ) {
+ r1::itt_relation_add( domain, addr0, addr0_extra, relation, addr1, addr1_extra );
+ }
+
+ inline void itt_task_begin( itt_domain_enum domain, void *task, unsigned long long task_extra,
+ void *parent, unsigned long long parent_extra, string_resource_index name_index ) {
+ r1::itt_task_begin( domain, task, task_extra, parent, parent_extra, name_index );
+ }
+
+ inline void itt_task_end( itt_domain_enum domain ) {
+ r1::itt_task_end( domain );
+ }
+
+ inline void itt_region_begin( itt_domain_enum domain, void *region, unsigned long long region_extra,
+ void *parent, unsigned long long parent_extra, string_resource_index name_index ) {
+ r1::itt_region_begin( domain, region, region_extra, parent, parent_extra, name_index );
+ }
+
+ inline void itt_region_end( itt_domain_enum domain, void *region, unsigned long long region_extra ) {
+ r1::itt_region_end( domain, region, region_extra );
+ }
+#else
+ inline void create_itt_sync(void* /*ptr*/, const char* /*objtype*/, const char* /*objname*/) {}
+
+ inline void call_itt_notify(notify_type /*t*/, void* /*ptr*/) {}
+
+ inline void call_itt_task_notify(notify_type /*t*/, void* /*ptr*/) {}
+#endif // TBB_USE_PROFILING_TOOLS
+
+#if TBB_USE_PROFILING_TOOLS && !(TBB_USE_PROFILING_TOOLS == 2)
+class event {
+/** This class supports user event traces through ITT.
+ A common use case is tagging data flow graph tasks (data-id)
+ for visualization by Intel Advisor Flow Graph Analyzer (FGA). **/
+// TODO: Replace implementation by itt user event api.
+
+ const std::string my_name;
+
+ static void emit_trace(const std::string &input) {
+ itt_metadata_str_add( ITT_DOMAIN_FLOW, NULL, FLOW_NULL, USER_EVENT, ( "FGA::DATAID::" + input ).c_str() );
+ }
+
+public:
+ event(const std::string &input)
+ : my_name( input )
+ { }
+
+ void emit() {
+ emit_trace(my_name);
+ }
+
+ static void emit(const std::string &description) {
+ emit_trace(description);
+ }
+
+};
+#else // TBB_USE_PROFILING_TOOLS && !(TBB_USE_PROFILING_TOOLS == 2)
+// Using empty struct if user event tracing is disabled:
+struct event {
+ event(const std::string &) { }
+
+ void emit() { }
+
+ static void emit(const std::string &) { }
+};
+#endif // TBB_USE_PROFILING_TOOLS && !(TBB_USE_PROFILING_TOOLS == 2)
+} // namespace d1
+} // namespace detail
+
+namespace profiling {
+ using detail::d1::event;
+}
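+
+// A hedged usage sketch: tagging work with a data id so that tools such as Flow Graph
+// Analyzer can correlate it (the id string is an assumption for the example).
+//
+//     tbb::profiling::event e("FGA-item-42");
+//     e.emit();
+//     // or, without keeping an object around:
+//     tbb::profiling::event::emit("FGA-item-42");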
+} // namespace tbb
+
+
+#endif /* __TBB_profiling_H */
diff --git a/contrib/libs/tbb/include/oneapi/tbb/queuing_mutex.h b/contrib/libs/tbb/include/oneapi/tbb/queuing_mutex.h
new file mode 100644
index 0000000000..6c3f1fe1e9
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/queuing_mutex.h
@@ -0,0 +1,197 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_queuing_mutex_H
+#define __TBB_queuing_mutex_H
+
+#include "detail/_namespace_injection.h"
+#include "detail/_assert.h"
+#include "detail/_utils.h"
+
+#include "profiling.h"
+
+#include <atomic>
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+//! Queuing mutex with local-only spinning.
+/** @ingroup synchronization */
+class queuing_mutex {
+public:
+ //! Construct unacquired mutex.
+ queuing_mutex() noexcept {
+ create_itt_sync(this, "tbb::queuing_mutex", "");
+ };
+
+ queuing_mutex(const queuing_mutex&) = delete;
+ queuing_mutex& operator=(const queuing_mutex&) = delete;
+
+ //! The scoped locking pattern
+ /** It helps to avoid the common problem of forgetting to release the lock.
+ It also nicely provides the "node" for queuing locks. */
+ class scoped_lock {
+ //! Reset fields to mean "no lock held".
+ void reset() {
+ m_mutex = nullptr;
+ }
+
+ public:
+ //! Construct lock that has not acquired a mutex.
+ /** Equivalent to zero-initialization of *this. */
+ scoped_lock() = default;
+
+ //! Acquire lock on given mutex.
+ scoped_lock(queuing_mutex& m) {
+ acquire(m);
+ }
+
+ //! Release lock (if lock is held).
+ ~scoped_lock() {
+ if (m_mutex) release();
+ }
+
+ //! No Copy
+ scoped_lock( const scoped_lock& ) = delete;
+ scoped_lock& operator=( const scoped_lock& ) = delete;
+
+ //! Acquire lock on given mutex.
+ void acquire( queuing_mutex& m ) {
+ __TBB_ASSERT(!m_mutex, "scoped_lock is already holding a mutex");
+
+ // Must set all fields before the exchange, because once the
+ // exchange executes, *this becomes accessible to other threads.
+ m_mutex = &m;
+ m_next.store(nullptr, std::memory_order_relaxed);
+ m_going.store(0U, std::memory_order_relaxed);
+
+ // x86 compare exchange operation always has a strong fence
+ // "sending" the fields initialized above to other processors.
+ scoped_lock* pred = m.q_tail.exchange(this);
+ if (pred) {
+ call_itt_notify(prepare, &m);
+ __TBB_ASSERT(pred->m_next.load(std::memory_order_relaxed) == nullptr, "the predecessor has another successor!");
+
+ pred->m_next.store(this, std::memory_order_relaxed);
+ spin_wait_while_eq(m_going, 0U);
+ }
+ call_itt_notify(acquired, &m);
+
+ // Force acquire so that user's critical section receives correct values
+ // from processor that was previously in the user's critical section.
+ atomic_fence(std::memory_order_acquire);
+ }
+
+ //! Acquire lock on given mutex if free (i.e. non-blocking)
+ bool try_acquire( queuing_mutex& m ) {
+ __TBB_ASSERT(!m_mutex, "scoped_lock is already holding a mutex");
+
+ // Must set all fields before the compare_exchange_strong, because once the
+ // compare_exchange_strong executes, *this becomes accessible to other threads.
+ m_next.store(nullptr, std::memory_order_relaxed);
+ m_going.store(0U, std::memory_order_relaxed);
+
+ scoped_lock* expected = nullptr;
+ // The compare_exchange_strong must have release semantics, because we are
+ // "sending" the fields initialized above to other processors.
+ // x86 compare exchange operation always has a strong fence
+ if (!m.q_tail.compare_exchange_strong(expected, this))
+ return false;
+
+ m_mutex = &m;
+
+ // Force acquire so that user's critical section receives correct values
+ // from processor that was previously in the user's critical section.
+ atomic_fence(std::memory_order_acquire);
+ call_itt_notify(acquired, &m);
+ return true;
+ }
+
+ //! Release lock.
+ void release()
+ {
+ __TBB_ASSERT(this->m_mutex, "no lock acquired");
+
+ call_itt_notify(releasing, this->m_mutex);
+
+ if (m_next.load(std::memory_order_relaxed) == nullptr) {
+ scoped_lock* expected = this;
+ if (m_mutex->q_tail.compare_exchange_strong(expected, nullptr)) {
+ // this was the only item in the queue, and the queue is now empty.
+ reset();
+ return;
+ }
+ // Someone in the queue
+ spin_wait_while_eq(m_next, nullptr);
+ }
+ m_next.load(std::memory_order_relaxed)->m_going.store(1U, std::memory_order_release);
+
+ reset();
+ }
+
+ private:
+ //! The pointer to the mutex owned, or NULL if not holding a mutex.
+ queuing_mutex* m_mutex{nullptr};
+
+ //! The pointer to the next competitor for a mutex
+ std::atomic<scoped_lock*> m_next{nullptr};
+
+ //! The local spin-wait variable
+ /** Inverted (0 - blocked, 1 - acquired the mutex) for the sake of
+ zero-initialization. Defining it as an entire word instead of
+ a byte seems to help performance slightly. */
+ std::atomic<uintptr_t> m_going{0U};
+ };
+
+ // Mutex traits
+ static constexpr bool is_rw_mutex = false;
+ static constexpr bool is_recursive_mutex = false;
+ static constexpr bool is_fair_mutex = true;
+
+private:
+ //! The last competitor requesting the lock
+ std::atomic<scoped_lock*> q_tail{nullptr};
+
+};
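+
+/** A minimal usage sketch (the counter being protected is an assumption for the example):
+ a scoped_lock holds the mutex for the duration of its enclosing scope.
+ \code
+ queuing_mutex m;
+ int counter = 0;
+ // ... executed concurrently by several threads:
+ {
+     queuing_mutex::scoped_lock lock(m);
+     ++counter; // protected region
+ } // lock released here
+ \endcode
+**/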
+
+#if TBB_USE_PROFILING_TOOLS
+inline void set_name(queuing_mutex& obj, const char* name) {
+ itt_set_sync_name(&obj, name);
+}
+#if (_WIN32||_WIN64) && !__MINGW32__
+inline void set_name(queuing_mutex& obj, const wchar_t* name) {
+ itt_set_sync_name(&obj, name);
+}
+#endif //WIN
+#else
+inline void set_name(queuing_mutex&, const char*) {}
+#if (_WIN32||_WIN64) && !__MINGW32__
+inline void set_name(queuing_mutex&, const wchar_t*) {}
+#endif //WIN
+#endif
+} // namespace d1
+} // namespace detail
+
+inline namespace v1 {
+using detail::d1::queuing_mutex;
+} // namespace v1
+namespace profiling {
+ using detail::d1::set_name;
+}
+} // namespace tbb
+
+#endif /* __TBB_queuing_mutex_H */
diff --git a/contrib/libs/tbb/include/oneapi/tbb/queuing_rw_mutex.h b/contrib/libs/tbb/include/oneapi/tbb/queuing_rw_mutex.h
new file mode 100644
index 0000000000..6bb748f8a3
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/queuing_rw_mutex.h
@@ -0,0 +1,199 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_queuing_rw_mutex_H
+#define __TBB_queuing_rw_mutex_H
+
+#include "detail/_config.h"
+#include "detail/_namespace_injection.h"
+#include "detail/_assert.h"
+
+#include "profiling.h"
+
+#include <cstring>
+#include <atomic>
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+struct queuing_rw_mutex_impl;
+}
+namespace d1 {
+
+//! Queuing reader-writer mutex with local-only spinning.
+/** Adapted from Krieger, Stumm, et al. pseudocode at
+ https://www.researchgate.net/publication/221083709_A_Fair_Fast_Scalable_Reader-Writer_Lock
+ @ingroup synchronization */
+class queuing_rw_mutex {
+ friend r1::queuing_rw_mutex_impl;
+public:
+ //! Construct unacquired mutex.
+ queuing_rw_mutex() noexcept {
+ create_itt_sync(this, "tbb::queuing_rw_mutex", "");
+ }
+
+ //! Destructor asserts if the mutex is acquired, i.e. q_tail is non-NULL
+ ~queuing_rw_mutex() {
+ __TBB_ASSERT(q_tail.load(std::memory_order_relaxed) == nullptr, "destruction of an acquired mutex");
+ }
+
+ //! No Copy
+ queuing_rw_mutex(const queuing_rw_mutex&) = delete;
+ queuing_rw_mutex& operator=(const queuing_rw_mutex&) = delete;
+
+ //! The scoped locking pattern
+ /** It helps to avoid the common problem of forgetting to release the lock.
+ It also nicely provides the "node" for queuing locks. */
+ class scoped_lock {
+ friend r1::queuing_rw_mutex_impl;
+ //! Initialize fields to mean "no lock held".
+ void initialize() {
+ my_mutex = nullptr;
+ my_internal_lock.store(0, std::memory_order_relaxed);
+ my_going.store(0, std::memory_order_relaxed);
+#if TBB_USE_ASSERT
+ my_state = 0xFF; // Set to invalid state
+ my_next.store(reinterpret_cast<uintptr_t>(reinterpret_cast<void*>(-1)), std::memory_order_relaxed);
+ my_prev.store(reinterpret_cast<uintptr_t>(reinterpret_cast<void*>(-1)), std::memory_order_relaxed);
+#endif /* TBB_USE_ASSERT */
+ }
+
+ public:
+ //! Construct lock that has not acquired a mutex.
+ /** Equivalent to zero-initialization of *this. */
+ scoped_lock() {initialize();}
+
+ //! Acquire lock on given mutex.
+ scoped_lock( queuing_rw_mutex& m, bool write=true ) {
+ initialize();
+ acquire(m,write);
+ }
+
+ //! Release lock (if lock is held).
+ ~scoped_lock() {
+ if( my_mutex ) release();
+ }
+
+ //! No Copy
+ scoped_lock(const scoped_lock&) = delete;
+ scoped_lock& operator=(const scoped_lock&) = delete;
+
+ //! Acquire lock on given mutex.
+ void acquire( queuing_rw_mutex& m, bool write=true );
+
+ //! Acquire lock on given mutex if free (i.e. non-blocking)
+ bool try_acquire( queuing_rw_mutex& m, bool write=true );
+
+ //! Release lock.
+ void release();
+
+ //! Upgrade reader to become a writer.
+ /** Returns whether the upgrade happened without releasing and re-acquiring the lock */
+ bool upgrade_to_writer();
+
+ //! Downgrade writer to become a reader.
+ bool downgrade_to_reader();
+
+ private:
+ //! The pointer to the mutex owned, or NULL if not holding a mutex.
+ queuing_rw_mutex* my_mutex;
+
+ //! The 'pointer' to the previous and next competitors for a mutex
+ std::atomic<uintptr_t> my_prev;
+ std::atomic<uintptr_t> my_next;
+
+ using state_t = unsigned char ;
+
+ //! State of the request: reader, writer, active reader, other service states
+ std::atomic<state_t> my_state;
+
+ //! The local spin-wait variable
+ /** Corresponds to "spin" in the pseudocode but inverted for the sake of zero-initialization */
+ std::atomic<unsigned char> my_going;
+
+ //! A tiny internal lock
+ std::atomic<unsigned char> my_internal_lock;
+ };
+
+ // Mutex traits
+ static constexpr bool is_rw_mutex = true;
+ static constexpr bool is_recursive_mutex = false;
+ static constexpr bool is_fair_mutex = true;
+
+private:
+ //! The last competitor requesting the lock
+ std::atomic<scoped_lock*> q_tail{nullptr};
+};
+#if TBB_USE_PROFILING_TOOLS
+inline void set_name(queuing_rw_mutex& obj, const char* name) {
+ itt_set_sync_name(&obj, name);
+}
+#if (_WIN32||_WIN64) && !__MINGW32__
+inline void set_name(queuing_rw_mutex& obj, const wchar_t* name) {
+ itt_set_sync_name(&obj, name);
+}
+#endif //WIN
+#else
+inline void set_name(queuing_rw_mutex&, const char*) {}
+#if (_WIN32||_WIN64) && !__MINGW32__
+inline void set_name(queuing_rw_mutex&, const wchar_t*) {}
+#endif //WIN
+#endif
+} // namespace d1
+
+namespace r1 {
+void acquire(d1::queuing_rw_mutex&, d1::queuing_rw_mutex::scoped_lock&, bool);
+bool try_acquire(d1::queuing_rw_mutex&, d1::queuing_rw_mutex::scoped_lock&, bool);
+void release(d1::queuing_rw_mutex::scoped_lock&);
+bool upgrade_to_writer(d1::queuing_rw_mutex::scoped_lock&);
+bool downgrade_to_reader(d1::queuing_rw_mutex::scoped_lock&);
+} // namespace r1
+
+namespace d1 {
+
+
+inline void queuing_rw_mutex::scoped_lock::acquire(queuing_rw_mutex& m,bool write) {
+ r1::acquire(m, *this, write);
+}
+
+inline bool queuing_rw_mutex::scoped_lock::try_acquire(queuing_rw_mutex& m, bool write) {
+ return r1::try_acquire(m, *this, write);
+}
+
+inline void queuing_rw_mutex::scoped_lock::release() {
+ r1::release(*this);
+}
+
+inline bool queuing_rw_mutex::scoped_lock::upgrade_to_writer() {
+ return r1::upgrade_to_writer(*this);
+}
+
+inline bool queuing_rw_mutex::scoped_lock::downgrade_to_reader() {
+ return r1::downgrade_to_reader(*this);
+}
+} // namespace d1
+
+} // namespace detail
+
+inline namespace v1 {
+using detail::d1::queuing_rw_mutex;
+} // namespace v1
+namespace profiling {
+ using detail::d1::set_name;
+}
+} // namespace tbb
+
+#endif /* __TBB_queuing_rw_mutex_H */
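+// --- Editor's illustrative sketch (not part of the header above) ---------------------
+// A minimal usage example of the scoped locking pattern and the reader-to-writer
+// upgrade documented above. The names `example_mutex`, `example_table` and
+// `find_or_insert` are hypothetical and used only for illustration; the sketch
+// assumes the public oneTBB API declared in this header.
+#if 0 // usage sketch only, not compiled
+#include "oneapi/tbb/queuing_rw_mutex.h"
+#include <map>
+#include <string>
+
+static tbb::queuing_rw_mutex example_mutex;
+static std::map<std::string, int> example_table;
+
+int find_or_insert(const std::string& key) {
+    // Start as a reader; most lookups do not modify the table.
+    tbb::queuing_rw_mutex::scoped_lock lock(example_mutex, /*write=*/false);
+    auto it = example_table.find(key);
+    if (it != example_table.end()) return it->second;
+
+    // Upgrade to a writer. If upgrade_to_writer() returns false, the lock was
+    // released and re-acquired, so another thread may have inserted the key in
+    // the meantime and the lookup must be repeated.
+    if (!lock.upgrade_to_writer()) {
+        it = example_table.find(key);
+        if (it != example_table.end()) return it->second;
+    }
+    return example_table.emplace(key, 0).first->second;
+}   // the lock is released by ~scoped_lock()
+#endif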
diff --git a/contrib/libs/tbb/include/oneapi/tbb/scalable_allocator.h b/contrib/libs/tbb/include/oneapi/tbb/scalable_allocator.h
new file mode 100644
index 0000000000..daab02f324
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/scalable_allocator.h
@@ -0,0 +1,332 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_scalable_allocator_H
+#define __TBB_scalable_allocator_H
+
+#ifdef __cplusplus
+#include "oneapi/tbb/detail/_config.h"
+#include "oneapi/tbb/detail/_utils.h"
+#include <cstdlib>
+#include <utility>
+#else
+#include <stddef.h> /* Need ptrdiff_t and size_t from here. */
+#if !_MSC_VER
+#include <stdint.h> /* Need intptr_t from here. */
+#endif
+#endif
+
+#if __TBB_CPP17_MEMORY_RESOURCE_PRESENT
+#error #include <memory_resource>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+#if _MSC_VER
+ #define __TBB_EXPORTED_FUNC __cdecl
+#else
+ #define __TBB_EXPORTED_FUNC
+#endif
+
+/** The "malloc" analogue to allocate block of memory of size bytes.
+ * @ingroup memory_allocation */
+void* __TBB_EXPORTED_FUNC scalable_malloc(size_t size);
+
+/** The "free" analogue to discard a previously allocated piece of memory.
+ @ingroup memory_allocation */
+void __TBB_EXPORTED_FUNC scalable_free(void* ptr);
+
+/** The "realloc" analogue complementing scalable_malloc.
+ @ingroup memory_allocation */
+void* __TBB_EXPORTED_FUNC scalable_realloc(void* ptr, size_t size);
+
+/** The "calloc" analogue complementing scalable_malloc.
+ @ingroup memory_allocation */
+void* __TBB_EXPORTED_FUNC scalable_calloc(size_t nobj, size_t size);
+
+/** The "posix_memalign" analogue.
+ @ingroup memory_allocation */
+int __TBB_EXPORTED_FUNC scalable_posix_memalign(void** memptr, size_t alignment, size_t size);
+
+/** The "_aligned_malloc" analogue.
+ @ingroup memory_allocation */
+void* __TBB_EXPORTED_FUNC scalable_aligned_malloc(size_t size, size_t alignment);
+
+/** The "_aligned_realloc" analogue.
+ @ingroup memory_allocation */
+void* __TBB_EXPORTED_FUNC scalable_aligned_realloc(void* ptr, size_t size, size_t alignment);
+
+/** The "_aligned_free" analogue.
+ @ingroup memory_allocation */
+void __TBB_EXPORTED_FUNC scalable_aligned_free(void* ptr);
+
+/** The analogue of _msize/malloc_size/malloc_usable_size.
+ Returns the usable size of a memory block previously allocated by scalable_*,
+ or 0 (zero) if ptr does not point to such a block.
+ @ingroup memory_allocation */
+size_t __TBB_EXPORTED_FUNC scalable_msize(void* ptr);
+
+/* Results for scalable_allocation_* functions */
+typedef enum {
+ TBBMALLOC_OK,
+ TBBMALLOC_INVALID_PARAM,
+ TBBMALLOC_UNSUPPORTED,
+ TBBMALLOC_NO_MEMORY,
+ TBBMALLOC_NO_EFFECT
+} ScalableAllocationResult;
+
+/* Setting the TBB_MALLOC_USE_HUGE_PAGES environment variable to 1 enables huge pages.
+   A scalable_allocation_mode call takes priority over the environment variable. */
+typedef enum {
+    TBBMALLOC_USE_HUGE_PAGES, /* the value turns usage of huge pages on and off */
+    /* deprecated, kept for backward compatibility only */
+    USE_HUGE_PAGES = TBBMALLOC_USE_HUGE_PAGES,
+    /* try to limit memory consumption to the given value (bytes); clean internal
+       buffers if the limit is exceeded, but do not prevent requesting more memory from the OS */
+    TBBMALLOC_SET_SOFT_HEAP_LIMIT,
+    /* Lower bound for the size (bytes) that is interpreted as huge
+     * and is not released during regular cleanup operations. */
+    TBBMALLOC_SET_HUGE_SIZE_THRESHOLD
+} AllocationModeParam;
+
+/** Set TBB allocator-specific allocation modes.
+ @ingroup memory_allocation */
+int __TBB_EXPORTED_FUNC scalable_allocation_mode(int param, intptr_t value);
+
+typedef enum {
+    /* Clean internal allocator buffers for all threads.
+       Returns TBBMALLOC_NO_EFFECT if no buffers were cleaned,
+       TBBMALLOC_OK if some memory was released from the buffers. */
+    TBBMALLOC_CLEAN_ALL_BUFFERS,
+    /* Clean the internal allocator buffer for the current thread only.
+       Return values are the same as for TBBMALLOC_CLEAN_ALL_BUFFERS. */
+    TBBMALLOC_CLEAN_THREAD_BUFFERS
+} ScalableAllocationCmd;
+
+/** Call TBB allocator-specific commands.
+ @ingroup memory_allocation */
+int __TBB_EXPORTED_FUNC scalable_allocation_command(int cmd, void *param);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
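+
+/* --- Editor's illustrative sketch (not part of the header) -----------------------------
+   A small example of the C-level API declared above: plain allocation, an allocator mode
+   tweak, and an explicit buffer cleanup. The 100 MB soft heap limit is an arbitrary value
+   chosen only for illustration; `example_c_api` is a hypothetical name. */
+#if 0 /* usage sketch only, not compiled */
+#include "oneapi/tbb/scalable_allocator.h"
+#include <cstring>
+
+void example_c_api() {
+    /* Ask the allocator to try to keep its footprint below roughly 100 MB. */
+    scalable_allocation_mode(TBBMALLOC_SET_SOFT_HEAP_LIMIT, 100 * 1024 * 1024);
+
+    void* p = scalable_malloc(1024);
+    if (p) {
+        std::memset(p, 0, scalable_msize(p)); /* usable size may exceed the requested 1024 */
+        scalable_free(p);
+    }
+
+    /* Return memory cached in per-thread buffers back to the underlying heap. */
+    scalable_allocation_command(TBBMALLOC_CLEAN_ALL_BUFFERS, nullptr);
+}
+#endif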
+
+#ifdef __cplusplus
+
+//! The namespace rml contains components of low-level memory pool interface.
+namespace rml {
+class MemoryPool;
+
+typedef void *(*rawAllocType)(std::intptr_t pool_id, std::size_t &bytes);
+// returns non-zero in case of error
+typedef int (*rawFreeType)(std::intptr_t pool_id, void* raw_ptr, std::size_t raw_bytes);
+
+struct MemPoolPolicy {
+ enum {
+ TBBMALLOC_POOL_VERSION = 1
+ };
+
+ rawAllocType pAlloc;
+ rawFreeType pFree;
+    // granularity of pAlloc allocations; 0 means the default is used
+ std::size_t granularity;
+ int version;
+    // all memory is consumed at the first pAlloc call and never returned;
+    // no further pAlloc calls are made after the first one
+    unsigned fixedPool : 1,
+    // memory is consumed but returned only at pool termination
+ keepAllMemory : 1,
+ reserved : 30;
+
+ MemPoolPolicy(rawAllocType pAlloc_, rawFreeType pFree_,
+ std::size_t granularity_ = 0, bool fixedPool_ = false,
+ bool keepAllMemory_ = false) :
+ pAlloc(pAlloc_), pFree(pFree_), granularity(granularity_), version(TBBMALLOC_POOL_VERSION),
+ fixedPool(fixedPool_), keepAllMemory(keepAllMemory_),
+ reserved(0) {}
+};
+
+// the enumerators have the same values as the corresponding ScalableAllocationResult values
+// TODO: use ScalableAllocationResult in pool_create directly
+enum MemPoolError {
+ // pool created successfully
+ POOL_OK = TBBMALLOC_OK,
+ // invalid policy parameters found
+ INVALID_POLICY = TBBMALLOC_INVALID_PARAM,
+ // requested pool policy is not supported by allocator library
+ UNSUPPORTED_POLICY = TBBMALLOC_UNSUPPORTED,
+ // lack of memory during pool creation
+ NO_MEMORY = TBBMALLOC_NO_MEMORY,
+ // action takes no effect
+ NO_EFFECT = TBBMALLOC_NO_EFFECT
+};
+
+MemPoolError pool_create_v1(std::intptr_t pool_id, const MemPoolPolicy *policy,
+ rml::MemoryPool **pool);
+
+bool pool_destroy(MemoryPool* memPool);
+void *pool_malloc(MemoryPool* memPool, std::size_t size);
+void *pool_realloc(MemoryPool* memPool, void *object, std::size_t size);
+void *pool_aligned_malloc(MemoryPool* mPool, std::size_t size, std::size_t alignment);
+void *pool_aligned_realloc(MemoryPool* mPool, void *ptr, std::size_t size, std::size_t alignment);
+bool pool_reset(MemoryPool* memPool);
+bool pool_free(MemoryPool *memPool, void *object);
+MemoryPool *pool_identify(void *object);
+std::size_t pool_msize(MemoryPool *memPool, void *object);
+
+} // namespace rml
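+
+// --- Editor's illustrative sketch (not part of the header) ---------------------------
+// A minimal memory pool built on the rml interface declared above, using plain
+// malloc/free as the backing "raw" allocator. `example_raw_alloc`, `example_raw_free`
+// and `example_pool_usage` are hypothetical names used only for illustration.
+#if 0 // usage sketch only, not compiled
+#include "oneapi/tbb/scalable_allocator.h"
+#include <cstdlib>
+
+static void* example_raw_alloc(std::intptr_t /*pool_id*/, std::size_t& bytes) {
+    return std::malloc(bytes); // the callee may adjust `bytes` if it allocates more
+}
+static int example_raw_free(std::intptr_t /*pool_id*/, void* raw_ptr, std::size_t /*raw_bytes*/) {
+    std::free(raw_ptr);
+    return 0; // non-zero would indicate an error
+}
+
+void example_pool_usage() {
+    rml::MemPoolPolicy policy(&example_raw_alloc, &example_raw_free);
+    rml::MemoryPool* pool = nullptr;
+    if (rml::pool_create_v1(/*pool_id=*/0, &policy, &pool) != rml::POOL_OK) return;
+
+    void* p = rml::pool_malloc(pool, 256);
+    rml::pool_free(pool, p);
+    rml::pool_destroy(pool); // releases all memory still owned by the pool
+}
+#endif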
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+// keep throw in a separate function to prevent code bloat
+template<typename E>
+void throw_exception(const E &e) {
+#if TBB_USE_EXCEPTIONS
+ throw e;
+#else
+ suppress_unused_warning(e);
+#endif
+}
+
+template<typename T>
+class scalable_allocator {
+public:
+ using value_type = T;
+ using propagate_on_container_move_assignment = std::true_type;
+
+ //! Always defined for TBB containers
+ using is_always_equal = std::true_type;
+
+ scalable_allocator() = default;
+ template<typename U> scalable_allocator(const scalable_allocator<U>&) noexcept {}
+
+ //! Allocate space for n objects.
+ __TBB_nodiscard T* allocate(std::size_t n) {
+ T* p = static_cast<T*>(scalable_malloc(n * sizeof(value_type)));
+ if (!p) {
+ throw_exception(std::bad_alloc());
+ }
+ return p;
+ }
+
+    //! Free a previously allocated block of memory
+ void deallocate(T* p, std::size_t) {
+ scalable_free(p);
+ }
+
+#if TBB_ALLOCATOR_TRAITS_BROKEN
+ using pointer = value_type*;
+ using const_pointer = const value_type*;
+ using reference = value_type&;
+ using const_reference = const value_type&;
+ using difference_type = std::ptrdiff_t;
+ using size_type = std::size_t;
+ template<typename U> struct rebind {
+ using other = scalable_allocator<U>;
+ };
+ //! Largest value for which method allocate might succeed.
+ size_type max_size() const noexcept {
+ size_type absolutemax = static_cast<size_type>(-1) / sizeof (value_type);
+ return (absolutemax > 0 ? absolutemax : 1);
+ }
+ template<typename U, typename... Args>
+ void construct(U *p, Args&&... args)
+ { ::new((void *)p) U(std::forward<Args>(args)...); }
+ void destroy(pointer p) { p->~value_type(); }
+ pointer address(reference x) const { return &x; }
+ const_pointer address(const_reference x) const { return &x; }
+#endif // TBB_ALLOCATOR_TRAITS_BROKEN
+
+};
+
+#if TBB_ALLOCATOR_TRAITS_BROKEN
+ template<>
+ class scalable_allocator<void> {
+ public:
+ using pointer = void*;
+ using const_pointer = const void*;
+ using value_type = void;
+ template<typename U> struct rebind {
+ using other = scalable_allocator<U>;
+ };
+ };
+#endif
+
+template<typename T, typename U>
+inline bool operator==(const scalable_allocator<T>&, const scalable_allocator<U>&) noexcept { return true; }
+
+#if !__TBB_CPP20_COMPARISONS_PRESENT
+template<typename T, typename U>
+inline bool operator!=(const scalable_allocator<T>&, const scalable_allocator<U>&) noexcept { return false; }
+#endif
+
+#if __TBB_CPP17_MEMORY_RESOURCE_PRESENT
+
+//! C++17 memory resource implementation for scalable allocator
+//! ISO C++ Section 23.12.2
+class scalable_resource_impl : public std::pmr::memory_resource {
+private:
+ void* do_allocate(std::size_t bytes, std::size_t alignment) override {
+ void* p = scalable_aligned_malloc(bytes, alignment);
+ if (!p) {
+ throw_exception(std::bad_alloc());
+ }
+ return p;
+ }
+
+ void do_deallocate(void* ptr, std::size_t /*bytes*/, std::size_t /*alignment*/) override {
+ scalable_free(ptr);
+ }
+
+    //! Memory allocated by one instance of scalable_resource_impl can be deallocated by any
+    //! other instance of this class
+ bool do_is_equal(const std::pmr::memory_resource& other) const noexcept override {
+ return this == &other ||
+#if __TBB_USE_OPTIONAL_RTTI
+ dynamic_cast<const scalable_resource_impl*>(&other) != nullptr;
+#else
+ false;
+#endif
+ }
+};
+
+//! Global scalable allocator memory resource provider
+inline std::pmr::memory_resource* scalable_memory_resource() noexcept {
+ static tbb::detail::d1::scalable_resource_impl scalable_res;
+ return &scalable_res;
+}
+
+#endif // __TBB_CPP17_MEMORY_RESOURCE_PRESENT
+
+} // namespace d1
+} // namespace detail
+
+inline namespace v1 {
+using detail::d1::scalable_allocator;
+#if __TBB_CPP17_MEMORY_RESOURCE_PRESENT
+using detail::d1::scalable_memory_resource;
+#endif
+} // namespace v1
+
+} // namespace tbb
+
+#endif /* __cplusplus */
+
+#endif /* __TBB_scalable_allocator_H */
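+
+// --- Editor's illustrative sketch (not part of the header above) ---------------------
+// Using the C++ allocator and, when available, the std::pmr resource declared above
+// with standard containers. `example_cpp_allocators` is a hypothetical name.
+#if 0 // usage sketch only, not compiled
+#include "oneapi/tbb/scalable_allocator.h"
+#include <vector>
+#if __TBB_CPP17_MEMORY_RESOURCE_PRESENT
+#include <memory_resource>
+#endif
+
+void example_cpp_allocators() {
+    // Drop-in replacement for std::allocator backed by the TBB scalable heap.
+    std::vector<int, tbb::scalable_allocator<int>> v;
+    v.assign(1000, 42);
+
+#if __TBB_CPP17_MEMORY_RESOURCE_PRESENT
+    // The same heap exposed as a polymorphic memory resource.
+    std::pmr::vector<int> pv(tbb::scalable_memory_resource());
+    pv.assign(1000, 42);
+#endif
+}
+#endif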
diff --git a/contrib/libs/tbb/include/oneapi/tbb/spin_mutex.h b/contrib/libs/tbb/include/oneapi/tbb/spin_mutex.h
new file mode 100644
index 0000000000..7fde7e15af
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/spin_mutex.h
@@ -0,0 +1,179 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_spin_mutex_H
+#define __TBB_spin_mutex_H
+
+#include "detail/_namespace_injection.h"
+
+#include "profiling.h"
+
+#include "detail/_assert.h"
+#include "detail/_utils.h"
+
+#include <atomic>
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+#if __TBB_TSX_INTRINSICS_PRESENT
+class rtm_mutex;
+#endif
+
+/** A spin_mutex is a low-level synchronization primitive.
+ While locked, it causes the waiting threads to spin in a loop until the lock is released.
+ It should be used only for locking short critical sections
+ (typically less than 20 instructions) when fairness is not an issue.
+ If zero-initialized, the mutex is considered unheld.
+ @ingroup synchronization */
+class spin_mutex {
+public:
+    //! Constructor
+    spin_mutex() noexcept : m_flag(false) {
+        create_itt_sync(this, "tbb::spin_mutex", "");
+    }
+
+ //! Destructor
+ ~spin_mutex() = default;
+
+ //! No Copy
+ spin_mutex(const spin_mutex&) = delete;
+ spin_mutex& operator=(const spin_mutex&) = delete;
+
+ //! Represents acquisition of a mutex.
+ class scoped_lock {
+ //! Points to currently held mutex, or NULL if no lock is held.
+ spin_mutex* m_mutex;
+
+ public:
+ //! Construct without acquiring a mutex.
+ constexpr scoped_lock() noexcept : m_mutex(nullptr) {}
+
+ //! Construct and acquire lock on a mutex.
+ scoped_lock(spin_mutex& m) {
+ acquire(m);
+ }
+
+ //! No Copy
+ scoped_lock(const scoped_lock&) = delete;
+ scoped_lock& operator=(const scoped_lock&) = delete;
+
+ //! Acquire lock.
+ void acquire(spin_mutex& m) {
+ m_mutex = &m;
+ m.lock();
+ }
+
+ //! Try acquiring lock (non-blocking)
+ /** Return true if lock acquired; false otherwise. */
+ bool try_acquire(spin_mutex& m) {
+ bool result = m.try_lock();
+ if (result) {
+ m_mutex = &m;
+ }
+ return result;
+ }
+
+ //! Release lock
+ void release() {
+ __TBB_ASSERT(m_mutex, "release on spin_mutex::scoped_lock that is not holding a lock");
+ m_mutex->unlock();
+ m_mutex = nullptr;
+ }
+
+ //! Destroy lock. If holding a lock, releases the lock first.
+ ~scoped_lock() {
+ if (m_mutex) {
+ release();
+ }
+ }
+ };
+
+ //! Mutex traits
+ static constexpr bool is_rw_mutex = false;
+ static constexpr bool is_recursive_mutex = false;
+ static constexpr bool is_fair_mutex = false;
+
+ //! Acquire lock
+ /** Spin if the lock is taken */
+ void lock() {
+ atomic_backoff backoff;
+ call_itt_notify(prepare, this);
+ while (m_flag.exchange(true)) backoff.pause();
+ call_itt_notify(acquired, this);
+ }
+
+ //! Try acquiring lock (non-blocking)
+ /** Return true if lock acquired; false otherwise. */
+ bool try_lock() {
+ bool result = !m_flag.exchange(true);
+ if (result) {
+ call_itt_notify(acquired, this);
+ }
+ return result;
+ }
+
+ //! Release lock
+ void unlock() {
+ call_itt_notify(releasing, this);
+ m_flag.store(false, std::memory_order_release);
+ }
+
+protected:
+ std::atomic<bool> m_flag;
+}; // class spin_mutex
+
+#if TBB_USE_PROFILING_TOOLS
+inline void set_name(spin_mutex& obj, const char* name) {
+ itt_set_sync_name(&obj, name);
+}
+#if (_WIN32||_WIN64) && !__MINGW32__
+inline void set_name(spin_mutex& obj, const wchar_t* name) {
+ itt_set_sync_name(&obj, name);
+}
+#endif //WIN
+#else
+inline void set_name(spin_mutex&, const char*) {}
+#if (_WIN32||_WIN64) && !__MINGW32__
+inline void set_name(spin_mutex&, const wchar_t*) {}
+#endif // WIN
+#endif
+} // namespace d1
+} // namespace detail
+
+inline namespace v1 {
+using detail::d1::spin_mutex;
+} // namespace v1
+namespace profiling {
+ using detail::d1::set_name;
+}
+} // namespace tbb
+
+#include "detail/_rtm_mutex.h"
+
+namespace tbb {
+inline namespace v1 {
+#if __TBB_TSX_INTRINSICS_PRESENT
+ using speculative_spin_mutex = detail::d1::rtm_mutex;
+#else
+ using speculative_spin_mutex = detail::d1::spin_mutex;
+#endif
+}
+}
+
+#endif /* __TBB_spin_mutex_H */
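+
+// --- Editor's illustrative sketch (not part of the header above) ---------------------
+// A short critical section protected with spin_mutex::scoped_lock, matching the
+// "short critical sections" guidance in the class comment above. `example_lock`,
+// `example_counter` and `example_increment` are hypothetical names.
+#if 0 // usage sketch only, not compiled
+#include "oneapi/tbb/spin_mutex.h"
+
+static tbb::spin_mutex example_lock;
+static long example_counter = 0;
+
+void example_increment() {
+    // The lock is held only for a couple of instructions.
+    tbb::spin_mutex::scoped_lock guard(example_lock);
+    ++example_counter;
+}   // released by ~scoped_lock()
+#endif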
+
diff --git a/contrib/libs/tbb/include/oneapi/tbb/spin_rw_mutex.h b/contrib/libs/tbb/include/oneapi/tbb/spin_rw_mutex.h
new file mode 100644
index 0000000000..baf6b24b56
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/spin_rw_mutex.h
@@ -0,0 +1,307 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_spin_rw_mutex_H
+#define __TBB_spin_rw_mutex_H
+
+#include "detail/_namespace_injection.h"
+
+#include "profiling.h"
+
+#include "detail/_assert.h"
+#include "detail/_utils.h"
+
+#include <atomic>
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+#if __TBB_TSX_INTRINSICS_PRESENT
+class rtm_rw_mutex;
+#endif
+
+//! Fast, unfair, spinning reader-writer lock with backoff and writer-preference
+/** @ingroup synchronization */
+class spin_rw_mutex {
+public:
+    //! Constructor
+ spin_rw_mutex() noexcept : m_state(0) {
+ create_itt_sync(this, "tbb::spin_rw_mutex", "");
+ }
+
+ //! Destructor
+ ~spin_rw_mutex() {
+ __TBB_ASSERT(!m_state, "destruction of an acquired mutex");
+ }
+
+ //! No Copy
+ spin_rw_mutex(const spin_rw_mutex&) = delete;
+ spin_rw_mutex& operator=(const spin_rw_mutex&) = delete;
+
+ //! The scoped locking pattern
+    /** It helps to avoid the common problem of forgetting to release the lock.
+ It also nicely provides the "node" for queuing locks. */
+ class scoped_lock {
+ public:
+ //! Construct lock that has not acquired a mutex.
+ /** Equivalent to zero-initialization of *this. */
+ constexpr scoped_lock() noexcept : m_mutex(nullptr), m_is_writer(false) {}
+
+ //! Acquire lock on given mutex.
+ scoped_lock(spin_rw_mutex& m, bool write = true) : m_mutex(nullptr) {
+ acquire(m, write);
+ }
+
+ //! Release lock (if lock is held).
+ ~scoped_lock() {
+ if (m_mutex) {
+ release();
+ }
+ }
+
+ //! No Copy
+ scoped_lock(const scoped_lock&) = delete;
+ scoped_lock& operator=(const scoped_lock&) = delete;
+
+ //! Acquire lock on given mutex.
+ void acquire(spin_rw_mutex& m, bool write = true) {
+ m_is_writer = write;
+ m_mutex = &m;
+ if (write) {
+ m_mutex->lock();
+ } else {
+ m_mutex->lock_shared();
+ }
+ }
+
+ //! Try acquire lock on given mutex.
+ bool try_acquire(spin_rw_mutex& m, bool write = true) {
+ m_is_writer = write;
+ bool result = write ? m.try_lock() : m.try_lock_shared();
+ if (result) {
+ m_mutex = &m;
+ }
+ return result;
+ }
+
+ //! Release lock.
+ void release() {
+ spin_rw_mutex* m = m_mutex;
+ m_mutex = nullptr;
+
+ if (m_is_writer) {
+ m->unlock();
+ } else {
+ m->unlock_shared();
+ }
+ }
+
+ //! Upgrade reader to become a writer.
+ /** Returns whether the upgrade happened without releasing and re-acquiring the lock */
+ bool upgrade_to_writer() {
+ if (m_is_writer) return true; // Already a writer
+ m_is_writer = true;
+ return m_mutex->upgrade();
+ }
+
+ //! Downgrade writer to become a reader.
+ bool downgrade_to_reader() {
+ if (!m_is_writer) return true; // Already a reader
+ m_mutex->downgrade();
+ m_is_writer = false;
+ return true;
+ }
+
+ protected:
+ //! The pointer to the current mutex that is held, or nullptr if no mutex is held.
+ spin_rw_mutex* m_mutex;
+
+        //! If m_mutex != nullptr, then m_is_writer is true if holding a writer lock, false if holding a reader lock.
+ /** Not defined if not holding a lock. */
+ bool m_is_writer;
+ };
+
+ //! Mutex traits
+ static constexpr bool is_rw_mutex = true;
+ static constexpr bool is_recursive_mutex = false;
+ static constexpr bool is_fair_mutex = false;
+
+ //! Acquire lock
+ void lock() {
+ call_itt_notify(prepare, this);
+ for (atomic_backoff backoff; ; backoff.pause()) {
+ state_type s = m_state.load(std::memory_order_relaxed);
+ if (!(s & BUSY)) { // no readers, no writers
+ if (m_state.compare_exchange_strong(s, WRITER))
+ break; // successfully stored writer flag
+                backoff.reset(); // we might be very close to completing the operation
+ } else if (!(s & WRITER_PENDING)) { // no pending writers
+ m_state |= WRITER_PENDING;
+ }
+ }
+ call_itt_notify(acquired, this);
+ }
+
+ //! Try acquiring lock (non-blocking)
+ /** Return true if lock acquired; false otherwise. */
+ bool try_lock() {
+ // for a writer: only possible to acquire if no active readers or writers
+ state_type s = m_state.load(std::memory_order_relaxed);
+ if (!(s & BUSY)) { // no readers, no writers; mask is 1..1101
+ if (m_state.compare_exchange_strong(s, WRITER)) {
+ call_itt_notify(acquired, this);
+ return true; // successfully stored writer flag
+ }
+ }
+ return false;
+ }
+
+ //! Release lock
+ void unlock() {
+ call_itt_notify(releasing, this);
+ m_state &= READERS;
+ }
+
+ //! Lock shared ownership mutex
+ void lock_shared() {
+ call_itt_notify(prepare, this);
+ for (atomic_backoff b; ; b.pause()) {
+ state_type s = m_state.load(std::memory_order_relaxed);
+ if (!(s & (WRITER | WRITER_PENDING))) { // no writer or write requests
+ state_type prev_state = m_state.fetch_add(ONE_READER);
+ if (!(prev_state & WRITER)) {
+ break; // successfully stored increased number of readers
+ }
+ // writer got there first, undo the increment
+ m_state -= ONE_READER;
+ }
+ }
+ call_itt_notify(acquired, this);
+ __TBB_ASSERT(m_state & READERS, "invalid state of a read lock: no readers");
+ }
+
+ //! Try lock shared ownership mutex
+ bool try_lock_shared() {
+ // for a reader: acquire if no active or waiting writers
+ state_type s = m_state.load(std::memory_order_relaxed);
+ if (!(s & (WRITER | WRITER_PENDING))) { // no writers
+ state_type prev_state = m_state.fetch_add(ONE_READER);
+ if (!(prev_state & WRITER)) { // got the lock
+ call_itt_notify(acquired, this);
+ return true; // successfully stored increased number of readers
+ }
+ // writer got there first, undo the increment
+ m_state -= ONE_READER;
+ }
+ return false;
+ }
+
+ //! Unlock shared ownership mutex
+ void unlock_shared() {
+ __TBB_ASSERT(m_state & READERS, "invalid state of a read lock: no readers");
+ call_itt_notify(releasing, this);
+ m_state -= ONE_READER;
+ }
+
+protected:
+    /** Internal API that is not part of the ISO C++ standard mutex interface. **/
+    //! This API is used through the scoped_lock class.
+
+ //! Upgrade reader to become a writer.
+ /** Returns whether the upgrade happened without releasing and re-acquiring the lock */
+ bool upgrade() {
+ state_type s = m_state.load(std::memory_order_relaxed);
+ __TBB_ASSERT(s & READERS, "invalid state before upgrade: no readers ");
+ // Check and set writer-pending flag.
+ // Required conditions: either no pending writers, or we are the only reader
+ // (with multiple readers and pending writer, another upgrade could have been requested)
+ while ((s & READERS) == ONE_READER || !(s & WRITER_PENDING)) {
+ if (m_state.compare_exchange_strong(s, s | WRITER | WRITER_PENDING)) {
+ atomic_backoff backoff;
+ while ((m_state.load(std::memory_order_relaxed) & READERS) != ONE_READER) backoff.pause();
+ __TBB_ASSERT((m_state & (WRITER_PENDING|WRITER)) == (WRITER_PENDING | WRITER), "invalid state when upgrading to writer");
+ // Both new readers and writers are blocked at this time
+ m_state -= (ONE_READER + WRITER_PENDING);
+ return true; // successfully upgraded
+ }
+ }
+ // Slow reacquire
+ unlock_shared();
+ lock();
+ return false;
+ }
+
+ //! Downgrade writer to a reader
+ void downgrade() {
+ call_itt_notify(releasing, this);
+ m_state += (ONE_READER - WRITER);
+ __TBB_ASSERT(m_state & READERS, "invalid state after downgrade: no readers");
+ }
+
+ using state_type = std::intptr_t;
+ static constexpr state_type WRITER = 1;
+ static constexpr state_type WRITER_PENDING = 2;
+ static constexpr state_type READERS = ~(WRITER | WRITER_PENDING);
+ static constexpr state_type ONE_READER = 4;
+ static constexpr state_type BUSY = WRITER | READERS;
+ //! State of lock
+ /** Bit 0 = writer is holding lock
+ Bit 1 = request by a writer to acquire lock (hint to readers to wait)
+ Bit 2..N = number of readers holding lock */
+ std::atomic<state_type> m_state;
+}; // class spin_rw_mutex
+
+#if TBB_USE_PROFILING_TOOLS
+inline void set_name(spin_rw_mutex& obj, const char* name) {
+ itt_set_sync_name(&obj, name);
+}
+#if (_WIN32||_WIN64) && !__MINGW32__
+inline void set_name(spin_rw_mutex& obj, const wchar_t* name) {
+ itt_set_sync_name(&obj, name);
+}
+#endif // WIN
+#else
+inline void set_name(spin_rw_mutex&, const char*) {}
+#if (_WIN32||_WIN64) && !__MINGW32__
+inline void set_name(spin_rw_mutex&, const wchar_t*) {}
+#endif // WIN
+#endif
+} // namespace d1
+} // namespace detail
+
+inline namespace v1 {
+using detail::d1::spin_rw_mutex;
+} // namespace v1
+namespace profiling {
+ using detail::d1::set_name;
+}
+} // namespace tbb
+
+#include "detail/_rtm_rw_mutex.h"
+
+namespace tbb {
+inline namespace v1 {
+#if __TBB_TSX_INTRINSICS_PRESENT
+ using speculative_spin_rw_mutex = detail::d1::rtm_rw_mutex;
+#else
+ using speculative_spin_rw_mutex = detail::d1::spin_rw_mutex;
+#endif
+}
+}
+
+#endif /* __TBB_spin_rw_mutex_H */
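+
+// --- Editor's illustrative sketch (not part of the header above) ---------------------
+// Reader and writer sections with spin_rw_mutex::scoped_lock, including the upgrade
+// path documented above. `example_rw`, `example_value` and the two functions are
+// hypothetical names used only for illustration.
+#if 0 // usage sketch only, not compiled
+#include "oneapi/tbb/spin_rw_mutex.h"
+
+static tbb::spin_rw_mutex example_rw;
+static int example_value = 0;
+
+int example_read() {
+    tbb::spin_rw_mutex::scoped_lock lock(example_rw, /*write=*/false);
+    return example_value; // multiple readers may be here concurrently
+}
+
+void example_conditional_write(int threshold) {
+    tbb::spin_rw_mutex::scoped_lock lock(example_rw, /*write=*/false);
+    if (example_value < threshold) {
+        // If the upgrade was not atomic, the lock was temporarily released,
+        // so the protected state must be re-checked.
+        if (!lock.upgrade_to_writer() && example_value >= threshold) return;
+        example_value = threshold;
+    }
+}
+#endif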
+
diff --git a/contrib/libs/tbb/include/oneapi/tbb/task.h b/contrib/libs/tbb/include/oneapi/tbb/task.h
new file mode 100644
index 0000000000..82ce1df6cd
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/task.h
@@ -0,0 +1,37 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_task_H
+#define __TBB_task_H
+
+#include "detail/_config.h"
+#include "detail/_namespace_injection.h"
+#include "detail/_task.h"
+
+namespace tbb {
+inline namespace v1 {
+namespace task {
+#if __TBB_RESUMABLE_TASKS
+ using detail::d1::suspend_point;
+ using detail::d1::resume;
+ using detail::d1::suspend;
+#endif /* __TBB_RESUMABLE_TASKS */
+ using detail::d1::current_context;
+} // namespace task
+} // namespace v1
+} // namespace tbb
+
+#endif /* __TBB_task_H */
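+
+// --- Editor's illustrative sketch (not part of the header above) ---------------------
+// How the resumable-tasks entry points re-exported above are typically used: a task
+// suspends itself, hands its suspend_point to some asynchronous activity, and is
+// resumed once that activity completes. `async_start` is a hypothetical registration
+// function, not part of oneTBB; the sketch assumes the preview __TBB_RESUMABLE_TASKS API.
+#if 0 // usage sketch only, not compiled
+#include "oneapi/tbb/task.h"
+
+void example_wait_for_async_work() {
+#if __TBB_RESUMABLE_TASKS
+    // Hypothetical helper that starts asynchronous work and remembers the tag.
+    extern void async_start(tbb::task::suspend_point tag);
+
+    tbb::task::suspend([](tbb::task::suspend_point tag) {
+        // Runs after the current task is suspended; the worker thread is free to
+        // execute other tasks until tbb::task::resume(tag) is called by the
+        // asynchronous activity started here.
+        async_start(tag);
+    });
+    // Execution continues here once tbb::task::resume(tag) has been called.
+#endif
+}
+#endif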
diff --git a/contrib/libs/tbb/include/oneapi/tbb/task_arena.h b/contrib/libs/tbb/include/oneapi/tbb/task_arena.h
new file mode 100644
index 0000000000..f1d0f9dea3
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/task_arena.h
@@ -0,0 +1,452 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_task_arena_H
+#define __TBB_task_arena_H
+
+#include "detail/_namespace_injection.h"
+#include "detail/_task.h"
+#include "detail/_exception.h"
+#include "detail/_aligned_space.h"
+#include "detail/_small_object_pool.h"
+
+#if __TBB_ARENA_BINDING
+#include "info.h"
+#endif /*__TBB_ARENA_BINDING*/
+
+namespace tbb {
+namespace detail {
+
+namespace d1 {
+
+template<typename F, typename R>
+class task_arena_function : public delegate_base {
+ F &my_func;
+ aligned_space<R> my_return_storage;
+ bool my_constructed{false};
+ // The function should be called only once.
+ bool operator()() const override {
+ new (my_return_storage.begin()) R(my_func());
+ return true;
+ }
+public:
+ task_arena_function(F& f) : my_func(f) {}
+ // The function can be called only after operator() and only once.
+ R consume_result() {
+ my_constructed = true;
+ return std::move(*(my_return_storage.begin()));
+ }
+ ~task_arena_function() override {
+ if (my_constructed) {
+ my_return_storage.begin()->~R();
+ }
+ }
+};
+
+template<typename F>
+class task_arena_function<F,void> : public delegate_base {
+ F &my_func;
+ bool operator()() const override {
+ my_func();
+ return true;
+ }
+public:
+ task_arena_function(F& f) : my_func(f) {}
+ void consume_result() const {}
+
+ friend class task_arena_base;
+};
+
+class task_arena_base;
+class task_scheduler_observer;
+} // namespace d1
+
+namespace r1 {
+class arena;
+struct task_arena_impl;
+
+void __TBB_EXPORTED_FUNC observe(d1::task_scheduler_observer&, bool);
+void __TBB_EXPORTED_FUNC initialize(d1::task_arena_base&);
+void __TBB_EXPORTED_FUNC terminate(d1::task_arena_base&);
+bool __TBB_EXPORTED_FUNC attach(d1::task_arena_base&);
+void __TBB_EXPORTED_FUNC execute(d1::task_arena_base&, d1::delegate_base&);
+void __TBB_EXPORTED_FUNC wait(d1::task_arena_base&);
+int __TBB_EXPORTED_FUNC max_concurrency(const d1::task_arena_base*);
+void __TBB_EXPORTED_FUNC isolate_within_arena(d1::delegate_base& d, std::intptr_t);
+
+void __TBB_EXPORTED_FUNC enqueue(d1::task&, d1::task_arena_base*);
+void __TBB_EXPORTED_FUNC submit(d1::task&, d1::task_group_context&, arena*, std::uintptr_t);
+} // namespace r1
+
+namespace d1 {
+
+static constexpr int priority_stride = INT_MAX / 4;
+
+class task_arena_base {
+ friend struct r1::task_arena_impl;
+ friend void r1::observe(d1::task_scheduler_observer&, bool);
+public:
+ enum class priority : int {
+ low = 1 * priority_stride,
+ normal = 2 * priority_stride,
+ high = 3 * priority_stride
+ };
+#if __TBB_ARENA_BINDING
+ using constraints = tbb::detail::d1::constraints;
+#endif /*__TBB_ARENA_BINDING*/
+protected:
+ //! Special settings
+ intptr_t my_version_and_traits;
+
+ std::atomic<do_once_state> my_initialization_state;
+
+ //! NULL if not currently initialized.
+ std::atomic<r1::arena*> my_arena;
+ static_assert(sizeof(std::atomic<r1::arena*>) == sizeof(r1::arena*),
+ "To preserve backward compatibility we need the equal size of an atomic pointer and a pointer");
+
+ //! Concurrency level for deferred initialization
+ int my_max_concurrency;
+
+ //! Reserved slots for external threads
+ unsigned my_num_reserved_slots;
+
+ //! Arena priority
+ priority my_priority;
+
+ //! The NUMA node index to which the arena will be attached
+ numa_node_id my_numa_id;
+
+ //! The core type index to which arena will be attached
+ core_type_id my_core_type;
+
+ //! Number of threads per core
+ int my_max_threads_per_core;
+
+ // Backward compatibility checks.
+ core_type_id core_type() const {
+ return (my_version_and_traits & core_type_support_flag) == core_type_support_flag ? my_core_type : automatic;
+ }
+ int max_threads_per_core() const {
+ return (my_version_and_traits & core_type_support_flag) == core_type_support_flag ? my_max_threads_per_core : automatic;
+ }
+
+ enum {
+ default_flags = 0
+ , core_type_support_flag = 1
+ };
+
+ task_arena_base(int max_concurrency, unsigned reserved_for_masters, priority a_priority)
+ : my_version_and_traits(default_flags | core_type_support_flag)
+ , my_initialization_state(do_once_state::uninitialized)
+ , my_arena(nullptr)
+ , my_max_concurrency(max_concurrency)
+ , my_num_reserved_slots(reserved_for_masters)
+ , my_priority(a_priority)
+ , my_numa_id(automatic)
+ , my_core_type(automatic)
+ , my_max_threads_per_core(automatic)
+ {}
+
+#if __TBB_ARENA_BINDING
+ task_arena_base(const constraints& constraints_, unsigned reserved_for_masters, priority a_priority)
+ : my_version_and_traits(default_flags | core_type_support_flag)
+ , my_initialization_state(do_once_state::uninitialized)
+ , my_arena(nullptr)
+ , my_max_concurrency(constraints_.max_concurrency)
+ , my_num_reserved_slots(reserved_for_masters)
+ , my_priority(a_priority)
+ , my_numa_id(constraints_.numa_id)
+#if __TBB_PREVIEW_TASK_ARENA_CONSTRAINTS_EXTENSION_PRESENT
+ , my_core_type(constraints_.core_type)
+ , my_max_threads_per_core(constraints_.max_threads_per_core)
+#else
+ , my_core_type(automatic)
+ , my_max_threads_per_core(automatic)
+#endif
+ {}
+#endif /*__TBB_ARENA_BINDING*/
+public:
+ //! Typedef for number of threads that is automatic.
+ static const int automatic = -1;
+ static const int not_initialized = -2;
+};
+
+template<typename R, typename F>
+R isolate_impl(F& f) {
+ task_arena_function<F, R> func(f);
+ r1::isolate_within_arena(func, /*isolation*/ 0);
+ return func.consume_result();
+}
+
+/** A 1-to-1 proxy representation class of the scheduler's arena.
+ * Constructors set up the settings only; real construction is deferred until the first method invocation.
+ * The destructor only removes one of the references to the inner arena representation.
+ * Final destruction happens when all the references (and the work) are gone.
+ */
+class task_arena : public task_arena_base {
+
+ template <typename F>
+ class enqueue_task : public task {
+ small_object_allocator m_allocator;
+ const F m_func;
+
+ void finalize(const execution_data& ed) {
+ m_allocator.delete_object(this, ed);
+ }
+ task* execute(execution_data& ed) override {
+ m_func();
+ finalize(ed);
+ return nullptr;
+ }
+ task* cancel(execution_data&) override {
+ __TBB_ASSERT_RELEASE(false, "Unhandled exception from enqueue task is caught");
+ return nullptr;
+ }
+ public:
+ enqueue_task(const F& f, small_object_allocator& alloc) : m_allocator(alloc), m_func(f) {}
+ enqueue_task(F&& f, small_object_allocator& alloc) : m_allocator(alloc), m_func(std::move(f)) {}
+ };
+
+ void mark_initialized() {
+ __TBB_ASSERT( my_arena.load(std::memory_order_relaxed), "task_arena initialization is incomplete" );
+ my_initialization_state.store(do_once_state::initialized, std::memory_order_release);
+ }
+
+ template<typename F>
+ void enqueue_impl(F&& f) {
+ initialize();
+ small_object_allocator alloc{};
+ r1::enqueue(*alloc.new_object<enqueue_task<typename std::decay<F>::type>>(std::forward<F>(f), alloc), this);
+ }
+
+ template<typename R, typename F>
+ R execute_impl(F& f) {
+ initialize();
+ task_arena_function<F, R> func(f);
+ r1::execute(*this, func);
+ return func.consume_result();
+ }
+public:
+    //! Creates a task_arena with certain concurrency limits
+    /** Sets up the settings only; real construction is deferred until the first method invocation.
+     * @arg max_concurrency specifies the total number of slots in the arena where threads work
+     * @arg reserved_for_masters specifies the number of slots to be used by external threads only.
+     * The default value of 1 reflects the behavior of implicit arenas.
+     **/
+ task_arena(int max_concurrency_ = automatic, unsigned reserved_for_masters = 1,
+ priority a_priority = priority::normal)
+ : task_arena_base(max_concurrency_, reserved_for_masters, a_priority)
+ {}
+
+#if __TBB_ARENA_BINDING
+    //! Creates a task arena pinned to a certain NUMA node
+ task_arena(const constraints& constraints_, unsigned reserved_for_masters = 1,
+ priority a_priority = priority::normal)
+ : task_arena_base(constraints_, reserved_for_masters, a_priority)
+ {}
+
+ //! Copies settings from another task_arena
+ task_arena(const task_arena &s) // copy settings but not the reference or instance
+ : task_arena_base(
+ constraints{}
+ .set_numa_id(s.my_numa_id)
+ .set_max_concurrency(s.my_max_concurrency)
+#if __TBB_PREVIEW_TASK_ARENA_CONSTRAINTS_EXTENSION_PRESENT
+ .set_core_type(s.my_core_type)
+ .set_max_threads_per_core(s.my_max_threads_per_core)
+#endif
+ , s.my_num_reserved_slots, s.my_priority)
+ {}
+#else
+ //! Copies settings from another task_arena
+ task_arena(const task_arena& a) // copy settings but not the reference or instance
+ : task_arena_base(a.my_max_concurrency, a.my_num_reserved_slots, a.my_priority)
+ {}
+#endif /*__TBB_ARENA_BINDING*/
+
+ //! Tag class used to indicate the "attaching" constructor
+ struct attach {};
+
+ //! Creates an instance of task_arena attached to the current arena of the thread
+ explicit task_arena( attach )
+ : task_arena_base(automatic, 1, priority::normal) // use default settings if attach fails
+ {
+ if (r1::attach(*this)) {
+ mark_initialized();
+ }
+ }
+
+ //! Forces allocation of the resources for the task_arena as specified in constructor arguments
+ void initialize() {
+ atomic_do_once([this]{ r1::initialize(*this); }, my_initialization_state);
+ }
+
+ //! Overrides concurrency level and forces initialization of internal representation
+ void initialize(int max_concurrency_, unsigned reserved_for_masters = 1,
+ priority a_priority = priority::normal)
+ {
+ __TBB_ASSERT(!my_arena.load(std::memory_order_relaxed), "Impossible to modify settings of an already initialized task_arena");
+ if( !is_active() ) {
+ my_max_concurrency = max_concurrency_;
+ my_num_reserved_slots = reserved_for_masters;
+ my_priority = a_priority;
+ r1::initialize(*this);
+ mark_initialized();
+ }
+ }
+
+#if __TBB_ARENA_BINDING
+ void initialize(constraints constraints_, unsigned reserved_for_masters = 1,
+ priority a_priority = priority::normal)
+ {
+ __TBB_ASSERT(!my_arena.load(std::memory_order_relaxed), "Impossible to modify settings of an already initialized task_arena");
+ if( !is_active() ) {
+ my_numa_id = constraints_.numa_id;
+ my_max_concurrency = constraints_.max_concurrency;
+#if __TBB_PREVIEW_TASK_ARENA_CONSTRAINTS_EXTENSION_PRESENT
+ my_core_type = constraints_.core_type;
+ my_max_threads_per_core = constraints_.max_threads_per_core;
+#endif
+ my_num_reserved_slots = reserved_for_masters;
+ my_priority = a_priority;
+ r1::initialize(*this);
+ mark_initialized();
+ }
+ }
+#endif /*__TBB_ARENA_BINDING*/
+
+ //! Attaches this instance to the current arena of the thread
+ void initialize(attach) {
+ // TODO: decide if this call must be thread-safe
+ __TBB_ASSERT(!my_arena.load(std::memory_order_relaxed), "Impossible to modify settings of an already initialized task_arena");
+ if( !is_active() ) {
+ if ( !r1::attach(*this) ) {
+ r1::initialize(*this);
+ }
+ mark_initialized();
+ }
+ }
+
+ //! Removes the reference to the internal arena representation.
+ //! Not thread safe wrt concurrent invocations of other methods.
+ void terminate() {
+ if( is_active() ) {
+ r1::terminate(*this);
+ my_initialization_state.store(do_once_state::uninitialized, std::memory_order_relaxed);
+ }
+ }
+
+ //! Removes the reference to the internal arena representation, and destroys the external object.
+ //! Not thread safe wrt concurrent invocations of other methods.
+ ~task_arena() {
+ terminate();
+ }
+
+ //! Returns true if the arena is active (initialized); false otherwise.
+ //! The name was chosen to match a task_scheduler_init method with the same semantics.
+ bool is_active() const {
+ return my_initialization_state.load(std::memory_order_acquire) == do_once_state::initialized;
+ }
+
+ //! Enqueues a task into the arena to process a functor, and immediately returns.
+ //! Does not require the calling thread to join the arena
+
+ template<typename F>
+ void enqueue(F&& f) {
+ enqueue_impl(std::forward<F>(f));
+ }
+
+    //! Joins the arena and executes a mutable functor, then returns.
+    //! If it is not possible to join, wraps the functor into a task, enqueues it, and waits for the task to complete.
+    //! May decrement the arena's demand for workers, causing a worker to leave and free a slot for the calling thread.
+    //! Since C++11, the method returns the value returned by the functor (prior to C++11 it returns void).
+ template<typename F>
+ auto execute(F&& f) -> decltype(f()) {
+ return execute_impl<decltype(f())>(f);
+ }
+
+#if __TBB_EXTRA_DEBUG
+ //! Returns my_num_reserved_slots
+ int debug_reserved_slots() const {
+ // Handle special cases inside the library
+ return my_num_reserved_slots;
+ }
+
+ //! Returns my_max_concurrency
+ int debug_max_concurrency() const {
+ // Handle special cases inside the library
+ return my_max_concurrency;
+ }
+
+ //! Wait for all work in the arena to be completed
+ //! Even submitted by other application threads
+ //! Joins arena if/when possible (in the same way as execute())
+ void debug_wait_until_empty() {
+ initialize();
+ r1::wait(*this);
+ }
+#endif //__TBB_EXTRA_DEBUG
+
+ //! Returns the maximal number of threads that can work inside the arena
+ int max_concurrency() const {
+ // Handle special cases inside the library
+ return (my_max_concurrency > 1) ? my_max_concurrency : r1::max_concurrency(this);
+ }
+
+ friend void submit(task& t, task_arena& ta, task_group_context& ctx, bool as_critical) {
+ __TBB_ASSERT(ta.is_active(), nullptr);
+ call_itt_task_notify(releasing, &t);
+ r1::submit(t, ctx, ta.my_arena.load(std::memory_order_relaxed), as_critical ? 1 : 0);
+ }
+};
+
+//! Executes a mutable functor in isolation within the current task arena.
+//! Since C++11, the method returns the value returned by the functor (prior to C++11 it returns void).
+template<typename F>
+inline auto isolate(F&& f) -> decltype(f()) {
+ return isolate_impl<decltype(f())>(f);
+}
+
+//! Returns the index, aka slot number, of the calling thread in its current arena
+inline int current_thread_index() {
+ int idx = r1::execution_slot(nullptr);
+ return idx == -1 ? task_arena_base::not_initialized : idx;
+}
+
+//! Returns the maximal number of threads that can work inside the arena
+inline int max_concurrency() {
+ return r1::max_concurrency(nullptr);
+}
+
+using r1::submit;
+
+} // namespace d1
+} // namespace detail
+
+inline namespace v1 {
+using detail::d1::task_arena;
+
+namespace this_task_arena {
+using detail::d1::current_thread_index;
+using detail::d1::max_concurrency;
+using detail::d1::isolate;
+} // namespace this_task_arena
+
+} // inline namespace v1
+
+} // namespace tbb
+#endif /* __TBB_task_arena_H */
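+
+// --- Editor's illustrative sketch (not part of the header above) ---------------------
+// Typical use of the deferred-initialization proxy described above: create an arena
+// with a concurrency limit, run work inside it with execute(), and submit detached
+// work with enqueue(). `example_arena_usage` is a hypothetical name.
+#if 0 // usage sketch only, not compiled
+#include "oneapi/tbb/task_arena.h"
+#include <atomic>
+
+void example_arena_usage() {
+    // Settings only; the real arena is created on the first use (execute below).
+    tbb::task_arena arena(/*max_concurrency=*/4, /*reserved_for_masters=*/1);
+
+    std::atomic<int> sum{0};
+
+    // The calling thread joins the arena and runs the functor there;
+    // the functor's return value is propagated back to the caller.
+    int observed = arena.execute([&] {
+        sum += 1;
+        return tbb::this_task_arena::max_concurrency();
+    });
+
+    // Fire-and-forget: the functor is queued for a worker thread of this arena.
+    // Avoid capturing locals by reference here, since enqueue() does not wait.
+    arena.enqueue([] { /* asynchronous work */ });
+
+    (void)observed;
+}
+#endif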
diff --git a/contrib/libs/tbb/include/oneapi/tbb/task_group.h b/contrib/libs/tbb/include/oneapi/tbb/task_group.h
new file mode 100644
index 0000000000..e82553076a
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/task_group.h
@@ -0,0 +1,556 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_task_group_H
+#define __TBB_task_group_H
+
+#include "detail/_config.h"
+#include "detail/_namespace_injection.h"
+#include "detail/_template_helpers.h"
+#include "detail/_utils.h"
+#include "detail/_exception.h"
+#include "detail/_task.h"
+#include "detail/_small_object_pool.h"
+
+#include "profiling.h"
+
+#include <functional>
+
+#if _MSC_VER && !defined(__INTEL_COMPILER)
+ // Suppress warning: structure was padded due to alignment specifier
+ #pragma warning(push)
+ #pragma warning(disable:4324)
+#endif
+
+namespace tbb {
+namespace detail {
+
+namespace d1 {
+class delegate_base;
+class task_arena_base;
+class task_group_context;
+}
+
+namespace r1 {
+// Forward declarations
+class tbb_exception_ptr;
+class market;
+class thread_data;
+class task_dispatcher;
+template <bool>
+class context_guard_helper;
+struct task_arena_impl;
+
+void __TBB_EXPORTED_FUNC execute(d1::task_arena_base&, d1::delegate_base&);
+void __TBB_EXPORTED_FUNC isolate_within_arena(d1::delegate_base&, std::intptr_t);
+
+void __TBB_EXPORTED_FUNC initialize(d1::task_group_context&);
+void __TBB_EXPORTED_FUNC destroy(d1::task_group_context&);
+void __TBB_EXPORTED_FUNC reset(d1::task_group_context&);
+bool __TBB_EXPORTED_FUNC cancel_group_execution(d1::task_group_context&);
+bool __TBB_EXPORTED_FUNC is_group_execution_cancelled(d1::task_group_context&);
+void __TBB_EXPORTED_FUNC capture_fp_settings(d1::task_group_context&);
+
+struct task_group_context_impl;
+}
+
+namespace d1 {
+
+struct context_list_node {
+ std::atomic<context_list_node*> prev{};
+ std::atomic<context_list_node*> next{};
+
+ void remove_relaxed() {
+ context_list_node* p = prev.load(std::memory_order_relaxed);
+ context_list_node* n = next.load(std::memory_order_relaxed);
+ p->next.store(n, std::memory_order_relaxed);
+ n->prev.store(p, std::memory_order_relaxed);
+ }
+};
+
+//! Used to form groups of tasks
+/** @ingroup task_scheduling
+    The context services explicit cancellation requests from user code, and unhandled
+    exceptions intercepted during task execution. Intercepting an exception results
+    in generating internal cancellation requests (which are processed in exactly the
+    same way as external ones).
+
+ The context is associated with one or more root tasks and defines the cancellation
+ group that includes all the descendants of the corresponding root task(s). Association
+ is established when a context object is passed as an argument to the task::allocate_root()
+ method. See task_group_context::task_group_context for more details.
+
+ The context can be bound to another one, and other contexts can be bound to it,
+ forming a tree-like structure: parent -> this -> children. Arrows here designate
+ cancellation propagation direction. If a task in a cancellation group is cancelled
+ all the other tasks in this group and groups bound to it (as children) get cancelled too.
+**/
+class task_group_context : no_copy {
+public:
+ enum traits_type {
+ fp_settings = 1 << 1,
+ concurrent_wait = 1 << 2,
+ default_traits = 0
+ };
+ enum kind_type {
+ isolated,
+ bound
+ };
+private:
+ //! Space for platform-specific FPU settings.
+ /** Must only be accessed inside TBB binaries, and never directly in user
+ code or inline methods. */
+ std::uint64_t my_cpu_ctl_env;
+
+ //! Specifies whether cancellation was requested for this task group.
+ std::atomic<std::uint32_t> my_cancellation_requested;
+
+ //! Version for run-time checks and behavioral traits of the context.
+ std::uint8_t my_version;
+
+ //! The context traits.
+ struct context_traits {
+ bool fp_settings : 1;
+ bool concurrent_wait : 1;
+ bool bound : 1;
+ } my_traits;
+
+ static_assert(sizeof(context_traits) == 1, "Traits shall fit into one byte.");
+
+ static constexpr std::uint8_t may_have_children = 1;
+ //! The context internal state (currently only may_have_children).
+ std::atomic<std::uint8_t> my_state;
+
+ enum class lifetime_state : std::uint8_t {
+ created,
+ locked,
+ isolated,
+ bound,
+ detached,
+ dying
+ };
+
+ //! The synchronization machine state to manage lifetime.
+ std::atomic<lifetime_state> my_lifetime_state;
+
+ //! Pointer to the context of the parent cancellation group. NULL for isolated contexts.
+ task_group_context* my_parent;
+
+ //! Thread data instance that registered this context in its list.
+ std::atomic<r1::thread_data*> my_owner;
+
+ //! Used to form the thread specific list of contexts without additional memory allocation.
+ /** A context is included into the list of the current thread when its binding to
+ its parent happens. Any context can be present in the list of one thread only. **/
+ context_list_node my_node;
+
+ //! Pointer to the container storing exception being propagated across this task group.
+ r1::tbb_exception_ptr* my_exception;
+
+ //! Used to set and maintain stack stitching point for Intel Performance Tools.
+ void* my_itt_caller;
+
+ //! Description of algorithm for scheduler based instrumentation.
+ string_resource_index my_name;
+
+ char padding[max_nfs_size
+ - sizeof(std::uint64_t) // my_cpu_ctl_env
+ - sizeof(std::atomic<std::uint32_t>) // my_cancellation_requested
+ - sizeof(std::uint8_t) // my_version
+ - sizeof(context_traits) // my_traits
+ - sizeof(std::atomic<std::uint8_t>) // my_state
+ - sizeof(std::atomic<lifetime_state>) // my_lifetime_state
+ - sizeof(task_group_context*) // my_parent
+ - sizeof(std::atomic<r1::thread_data*>) // my_owner
+ - sizeof(context_list_node) // my_node
+ - sizeof(r1::tbb_exception_ptr*) // my_exception
+ - sizeof(void*) // my_itt_caller
+ - sizeof(string_resource_index) // my_name
+ ];
+
+ task_group_context(context_traits t, string_resource_index name)
+ : my_version{}, my_name{ name } {
+ my_traits = t; // GCC4.8 issues warning list initialization for bitset (missing-field-initializers)
+ r1::initialize(*this);
+ }
+
+ static context_traits make_traits(kind_type relation_with_parent, std::uintptr_t user_traits) {
+ context_traits ct;
+ ct.bound = relation_with_parent == bound;
+ ct.fp_settings = (user_traits & fp_settings) == fp_settings;
+ ct.concurrent_wait = (user_traits & concurrent_wait) == concurrent_wait;
+ return ct;
+ }
+
+public:
+ //! Default & binding constructor.
+    /** By default a bound context is created. That is, this context will be bound
+        (as a child) to the context of the currently executing task. Cancellation
+        requests passed to the parent context are propagated to all the contexts
+        bound to it. Similarly, a priority change is propagated from the parent context
+        to its children.
+
+        If task_group_context::isolated is used as the argument, then the tasks associated
+        with this context will never be affected by events in any other context.
+
+        Creating isolated contexts involves much less overhead, but they have limited
+        utility. Normally, when an exception occurs in an algorithm that has nested
+        ones running, it is desirable to have all the nested algorithms cancelled
+        as well. Such a behavior requires nested algorithms to use bound contexts.
+
+        There is one good place where using isolated contexts is beneficial: an
+        external thread. That is, if a particular algorithm is invoked directly from
+        an external thread (not from a TBB task), supplying it with an explicitly
+        created isolated context will result in a faster algorithm startup.
+
+ VERSIONING NOTE:
+ Implementation(s) of task_group_context constructor(s) cannot be made
+ entirely out-of-line because the run-time version must be set by the user
+ code. This will become critically important for binary compatibility, if
+ we ever have to change the size of the context object. **/
+
+ task_group_context(kind_type relation_with_parent = bound,
+ std::uintptr_t t = default_traits)
+ : task_group_context(make_traits(relation_with_parent, t), CUSTOM_CTX) {}
+
+ // Custom constructor for instrumentation of oneTBB algorithm
+    task_group_context(string_resource_index name)
+ : task_group_context(make_traits(bound, default_traits), name) {}
+
+ // Do not introduce any logic on user side since it might break state propagation assumptions
+ ~task_group_context() {
+ r1::destroy(*this);
+ }
+
+ //! Forcefully reinitializes the context after the task tree it was associated with is completed.
+ /** Because the method assumes that all the tasks that used to be associated with
+ this context have already finished, calling it while the context is still
+ in use somewhere in the task hierarchy leads to undefined behavior.
+
+ IMPORTANT: This method is not thread safe!
+
+ The method does not change the context's parent if it is set. **/
+ void reset() {
+ r1::reset(*this);
+ }
+
+ //! Initiates cancellation of all tasks in this cancellation group and its subordinate groups.
+ /** \return false if cancellation has already been requested, true otherwise.
+
+        Note that canceling never fails. When false is returned, it just means that
+        another thread (or this one) has already sent a cancellation request to this
+        context or to one of its ancestors (if this context is bound). It is guaranteed
+        that when this method is concurrently called on the same not-yet-cancelled
+        context, true will be returned by one and only one invocation. **/
+ bool cancel_group_execution() {
+ return r1::cancel_group_execution(*this);
+ }
+
+ //! Returns true if the context received cancellation request.
+ bool is_group_execution_cancelled() {
+ return r1::is_group_execution_cancelled(*this);
+ }
+
+#if __TBB_FP_CONTEXT
+ //! Captures the current FPU control settings to the context.
+ /** Because the method assumes that all the tasks that used to be associated with
+ this context have already finished, calling it while the context is still
+ in use somewhere in the task hierarchy leads to undefined behavior.
+
+ IMPORTANT: This method is not thread safe!
+
+ The method does not change the FPU control settings of the context's parent. **/
+ void capture_fp_settings() {
+ r1::capture_fp_settings(*this);
+ }
+#endif
+
+ //! Returns the user visible context trait
+ std::uintptr_t traits() const {
+ std::uintptr_t t{};
+ t |= my_traits.fp_settings ? fp_settings : 0;
+ t |= my_traits.concurrent_wait ? concurrent_wait : 0;
+ return t;
+ }
+private:
+ //// TODO: cleanup friends
+ friend class r1::market;
+ friend class r1::thread_data;
+ friend class r1::task_dispatcher;
+ template <bool>
+ friend class r1::context_guard_helper;
+ friend struct r1::task_arena_impl;
+ friend struct r1::task_group_context_impl;
+}; // class task_group_context
+
+static_assert(sizeof(task_group_context) == 128, "Wrong size of task_group_context");
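+
+// --- Editor's illustrative sketch (not part of the header above) ---------------------
+// Explicit cancellation with a task_group_context, as described in the class comment
+// above. The sketch assumes oneapi/tbb/parallel_for.h is available and uses its
+// context-accepting overload; `example_cancellable_loop` is a hypothetical name.
+#if 0 // usage sketch only, not compiled
+#include "oneapi/tbb/task_group.h"
+#include "oneapi/tbb/parallel_for.h"
+
+void example_cancellable_loop() {
+    tbb::task_group_context ctx; // bound to the parent context by default
+
+    tbb::parallel_for(0, 1000000, [&](int i) {
+        if (i == 12345) {
+            // Cancels this loop and everything bound to ctx as a child.
+            ctx.cancel_group_execution();
+        }
+    }, ctx);
+
+    bool was_cancelled = ctx.is_group_execution_cancelled();
+    (void)was_cancelled;
+}
+#endif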
+
+enum task_group_status {
+ not_complete,
+ complete,
+ canceled
+};
+
+class task_group;
+class structured_task_group;
+#if TBB_PREVIEW_ISOLATED_TASK_GROUP
+class isolated_task_group;
+#endif
+
+template<typename F>
+class function_task : public task {
+ const F m_func;
+ wait_context& m_wait_ctx;
+ small_object_allocator m_allocator;
+
+ void finalize(const execution_data& ed) {
+        // Make a local reference so that `this` is not accessed after destruction.
+        wait_context& wo = m_wait_ctx;
+        // Copy the allocator to the stack
+        auto allocator = m_allocator;
+        // Destroy the user functor before releasing the wait context.
+ this->~function_task();
+ wo.release();
+
+ allocator.deallocate(this, ed);
+ }
+ task* execute(execution_data& ed) override {
+ m_func();
+ finalize(ed);
+ return nullptr;
+ }
+ task* cancel(execution_data& ed) override {
+ finalize(ed);
+ return nullptr;
+ }
+public:
+ function_task(const F& f, wait_context& wo, small_object_allocator& alloc)
+ : m_func(f)
+ , m_wait_ctx(wo)
+ , m_allocator(alloc) {}
+
+ function_task(F&& f, wait_context& wo, small_object_allocator& alloc)
+ : m_func(std::move(f))
+ , m_wait_ctx(wo)
+ , m_allocator(alloc) {}
+};
+
+template <typename F>
+class function_stack_task : public task {
+ const F& m_func;
+ wait_context& m_wait_ctx;
+
+ void finalize() {
+ m_wait_ctx.release();
+ }
+ task* execute(execution_data&) override {
+ m_func();
+ finalize();
+ return nullptr;
+ }
+ task* cancel(execution_data&) override {
+ finalize();
+ return nullptr;
+ }
+public:
+ function_stack_task(const F& f, wait_context& wo) : m_func(f), m_wait_ctx(wo) {}
+};
+
+class task_group_base : no_copy {
+protected:
+ wait_context m_wait_ctx;
+ task_group_context m_context;
+
+ template<typename F>
+ task_group_status internal_run_and_wait(const F& f) {
+ function_stack_task<F> t{ f, m_wait_ctx };
+ m_wait_ctx.reserve();
+ bool cancellation_status = false;
+ try_call([&] {
+ execute_and_wait(t, m_context, m_wait_ctx, m_context);
+ }).on_completion([&] {
+ // TODO: the reset method is not thread-safe. Ensure the correct behavior.
+ cancellation_status = m_context.is_group_execution_cancelled();
+ m_context.reset();
+ });
+ return cancellation_status ? canceled : complete;
+ }
+
+ template<typename F>
+ task* prepare_task(F&& f) {
+ m_wait_ctx.reserve();
+ small_object_allocator alloc{};
+ return alloc.new_object<function_task<typename std::decay<F>::type>>(std::forward<F>(f), m_wait_ctx, alloc);
+ }
+
+public:
+ task_group_base(uintptr_t traits = 0)
+ : m_wait_ctx(0)
+ , m_context(task_group_context::bound, task_group_context::default_traits | traits)
+ {
+ }
+
+ ~task_group_base() noexcept(false) {
+ if (m_wait_ctx.continue_execution()) {
+#if __TBB_CPP17_UNCAUGHT_EXCEPTIONS_PRESENT
+ bool stack_unwinding_in_progress = std::uncaught_exceptions() > 0;
+#else
+ bool stack_unwinding_in_progress = std::uncaught_exception();
+#endif
+ // Always attempt proper cleanup here: a missing wait() would otherwise inevitably
+ // lead to memory corruption (done for the sake of better testability & debuggability).
+ if (!m_context.is_group_execution_cancelled())
+ cancel();
+ d1::wait(m_wait_ctx, m_context);
+ if (!stack_unwinding_in_progress)
+ throw_exception(exception_id::missing_wait);
+ }
+ }
+
+ task_group_status wait() {
+ bool cancellation_status = false;
+ try_call([&] {
+ d1::wait(m_wait_ctx, m_context);
+ }).on_completion([&] {
+ // TODO: the reset method is not thread-safe. Ensure the correct behavior.
+ cancellation_status = m_context.is_group_execution_cancelled();
+ m_context.reset();
+ });
+ return cancellation_status ? canceled : complete;
+ }
+
+ void cancel() {
+ m_context.cancel_group_execution();
+ }
+}; // class task_group_base
+
+class task_group : public task_group_base {
+public:
+ task_group() : task_group_base(task_group_context::concurrent_wait) {}
+
+ template<typename F>
+ void run(F&& f) {
+ spawn(*prepare_task(std::forward<F>(f)), m_context);
+ }
+
+ template<typename F>
+ task_group_status run_and_wait(const F& f) {
+ return internal_run_and_wait(f);
+ }
+}; // class task_group
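+
+// A minimal usage sketch (illustrative only; fib is a made-up example function):
+// subtasks are spawned with run() and joined with wait(), which reports whether
+// the group completed or was cancelled.
+//
+//     #include "oneapi/tbb/task_group.h"
+//
+//     int fib(int n) {
+//         if (n < 2) return n;
+//         int x = 0, y = 0;
+//         tbb::task_group g;
+//         g.run([&] { x = fib(n - 1); });   // spawn a subtask
+//         g.run([&] { y = fib(n - 2); });   // spawn another subtask
+//         g.wait();                         // returns complete or canceled
+//         return x + y;
+//     }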
+
+#if TBB_PREVIEW_ISOLATED_TASK_GROUP
+class spawn_delegate : public delegate_base {
+ task* task_to_spawn;
+ task_group_context& context;
+ bool operator()() const override {
+ spawn(*task_to_spawn, context);
+ return true;
+ }
+public:
+ spawn_delegate(task* a_task, task_group_context& ctx)
+ : task_to_spawn(a_task), context(ctx)
+ {}
+};
+
+class wait_delegate : public delegate_base {
+ bool operator()() const override {
+ status = tg.wait();
+ return true;
+ }
+protected:
+ task_group& tg;
+ task_group_status& status;
+public:
+ wait_delegate(task_group& a_group, task_group_status& tgs)
+ : tg(a_group), status(tgs) {}
+};
+
+template<typename F>
+class run_wait_delegate : public wait_delegate {
+ F& func;
+ bool operator()() const override {
+ status = tg.run_and_wait(func);
+ return true;
+ }
+public:
+ run_wait_delegate(task_group& a_group, F& a_func, task_group_status& tgs)
+ : wait_delegate(a_group, tgs), func(a_func) {}
+};
+
+class isolated_task_group : public task_group {
+ intptr_t this_isolation() {
+ return reinterpret_cast<intptr_t>(this);
+ }
+public:
+ isolated_task_group () : task_group() {}
+
+ template<typename F>
+ void run(F&& f) {
+ spawn_delegate sd(prepare_task(std::forward<F>(f)), m_context);
+ r1::isolate_within_arena(sd, this_isolation());
+ }
+
+ template<typename F>
+ task_group_status run_and_wait( const F& f ) {
+ task_group_status result = not_complete;
+ run_wait_delegate<const F> rwd(*this, f, result);
+ r1::isolate_within_arena(rwd, this_isolation());
+ __TBB_ASSERT(result != not_complete, "premature exit from wait?");
+ return result;
+ }
+
+ task_group_status wait() {
+ task_group_status result = not_complete;
+ wait_delegate wd(*this, result);
+ r1::isolate_within_arena(wd, this_isolation());
+ __TBB_ASSERT(result != not_complete, "premature exit from wait?");
+ return result;
+ }
+}; // class isolated_task_group
+#endif // TBB_PREVIEW_ISOLATED_TASK_GROUP
+
+inline bool is_current_task_group_canceling() {
+ task_group_context* ctx = current_context();
+ return ctx ? ctx->is_group_execution_cancelled() : false;
+}
+
+} // namespace d1
+} // namespace detail
+
+inline namespace v1 {
+using detail::d1::task_group_context;
+using detail::d1::task_group;
+#if TBB_PREVIEW_ISOLATED_TASK_GROUP
+using detail::d1::isolated_task_group;
+#endif
+
+using detail::d1::task_group_status;
+using detail::d1::not_complete;
+using detail::d1::complete;
+using detail::d1::canceled;
+
+using detail::d1::is_current_task_group_canceling;
+using detail::r1::missing_wait;
+}
+
+} // namespace tbb
+
+#if _MSC_VER && !defined(__INTEL_COMPILER)
+ #pragma warning(pop) // 4324 warning
+#endif
+
+#endif // __TBB_task_group_H
diff --git a/contrib/libs/tbb/include/oneapi/tbb/task_scheduler_observer.h b/contrib/libs/tbb/include/oneapi/tbb/task_scheduler_observer.h
new file mode 100644
index 0000000000..276ca70707
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/task_scheduler_observer.h
@@ -0,0 +1,116 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_task_scheduler_observer_H
+#define __TBB_task_scheduler_observer_H
+
+#include "detail/_namespace_injection.h"
+#include "task_arena.h"
+#include <atomic>
+
+namespace tbb {
+namespace detail {
+
+namespace d1 {
+class task_scheduler_observer;
+}
+
+namespace r1 {
+class observer_proxy;
+class observer_list;
+
+//! Enable or disable observation
+/** For local observers the method can be used only when the current thread
+has the task scheduler initialized or is attached to an arena.
+Repeated calls with the same state are no-ops. **/
+void __TBB_EXPORTED_FUNC observe(d1::task_scheduler_observer&, bool state = true);
+}
+
+namespace d1 {
+class task_scheduler_observer {
+ friend class r1::observer_proxy;
+ friend class r1::observer_list;
+ friend void r1::observe(d1::task_scheduler_observer&, bool);
+
+ //! Pointer to the proxy holding this observer.
+ /** Observers are proxied by the scheduler to maintain persistent lists of them. **/
+ std::atomic<r1::observer_proxy*> my_proxy{ nullptr };
+
+ //! Counter preventing the observer from being destroyed while in use by the scheduler.
+ /** Valid only when observation is on. **/
+ std::atomic<intptr_t> my_busy_count{ 0 };
+
+ //! Pointer to the associated task_arena (nullptr if none was given)
+ task_arena* my_task_arena{ nullptr };
+public:
+ //! Returns true if observation is enabled, false otherwise.
+ bool is_observing() const { return my_proxy.load(std::memory_order_relaxed) != nullptr; }
+
+ //! Entry notification
+ /** Invoked from inside an observe(true) call and whenever a worker enters the arena
+ this observer is associated with. If a thread is already in the arena when
+ the observer is activated, the entry notification is called before it
+ executes the first stolen task. **/
+ virtual void on_scheduler_entry( bool /*is_worker*/ ) {}
+
+ //! Exit notification
+ /** Invoked from inside an observe(false) call and whenever a worker leaves the
+ arena this observer is associated with. **/
+ virtual void on_scheduler_exit( bool /*is_worker*/ ) {}
+
+ //! Construct a local or global observer in the inactive state (observation disabled).
+ /** For a local observer, entry/exit notifications are invoked whenever a worker
+ thread joins/leaves the arena of the observer's owner thread. If a thread is
+ already in the arena when the observer is activated, the entry notification is
+ called before it executes the first stolen task. **/
+ explicit task_scheduler_observer() = default;
+
+ //! Construct a local observer for a given arena in the inactive state (observation disabled).
+ /** Entry/exit notifications are invoked whenever a thread joins/leaves the arena.
+ If a thread is already in the arena when the observer is activated, the entry notification
+ is called before it executes the first stolen task. **/
+ explicit task_scheduler_observer(task_arena& a) : my_task_arena(&a) {}
+
+ /** The destructor protects the observer instance from concurrent notification.
+ It is recommended to disable observation before the destructor of a derived class
+ starts; otherwise a notification callback may run on a partly destroyed object. **/
+ virtual ~task_scheduler_observer() {
+ if (my_proxy.load(std::memory_order_relaxed)) {
+ observe(false);
+ }
+ }
+
+ //! Enable or disable observation
+ /** Warning: concurrent invocations of this method are not safe.
+ Repeated calls with the same state are no-ops. **/
+ void observe(bool state = true) {
+ if( state && !my_proxy.load(std::memory_order_relaxed) ) {
+ __TBB_ASSERT( my_busy_count.load(std::memory_order_relaxed) == 0, "Inconsistent state of task_scheduler_observer instance");
+ }
+ r1::observe(*this, state);
+ }
+};
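+
+// A minimal sketch of a derived observer (thread_counter is an illustrative name):
+// it counts the threads currently active in the observed arena. Observation is
+// enabled explicitly and disabled before destruction, as recommended above.
+//
+//     class thread_counter : public tbb::task_scheduler_observer {
+//         std::atomic<int> active{0};
+//     public:
+//         explicit thread_counter(tbb::task_arena& a)
+//             : tbb::task_scheduler_observer(a) {
+//             observe(true);                          // start receiving notifications
+//         }
+//         ~thread_counter() override { observe(false); }
+//         void on_scheduler_entry(bool /*is_worker*/) override { ++active; }
+//         void on_scheduler_exit(bool /*is_worker*/) override { --active; }
+//     };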
+
+} // namespace d1
+} // namespace detail
+
+inline namespace v1 {
+ using detail::d1::task_scheduler_observer;
+}
+} // namespace tbb
+
+
+#endif /* __TBB_task_scheduler_observer_H */
diff --git a/contrib/libs/tbb/include/oneapi/tbb/tbb_allocator.h b/contrib/libs/tbb/include/oneapi/tbb/tbb_allocator.h
new file mode 100644
index 0000000000..3da61a009d
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/tbb_allocator.h
@@ -0,0 +1,126 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_tbb_allocator_H
+#define __TBB_tbb_allocator_H
+
+#include "oneapi/tbb/detail/_utils.h"
+#include "detail/_namespace_injection.h"
+#include <cstdlib>
+#include <utility>
+
+#if __TBB_CPP17_MEMORY_RESOURCE_PRESENT
+#error #include <memory_resource>
+#endif
+
+namespace tbb {
+namespace detail {
+
+namespace r1 {
+void* __TBB_EXPORTED_FUNC allocate_memory(std::size_t size);
+void __TBB_EXPORTED_FUNC deallocate_memory(void* p);
+bool __TBB_EXPORTED_FUNC is_tbbmalloc_used();
+}
+
+namespace d1 {
+
+template<typename T>
+class tbb_allocator {
+public:
+ using value_type = T;
+ using propagate_on_container_move_assignment = std::true_type;
+
+ //! Always defined for TBB containers (supported since C++17 for std containers)
+ using is_always_equal = std::true_type;
+
+ //! Specifies current allocator
+ enum malloc_type {
+ scalable,
+ standard
+ };
+
+ tbb_allocator() = default;
+ template<typename U> tbb_allocator(const tbb_allocator<U>&) noexcept {}
+
+ //! Allocate space for n objects.
+ __TBB_nodiscard T* allocate(std::size_t n) {
+ return static_cast<T*>(r1::allocate_memory(n * sizeof(value_type)));
+ }
+
+ //! Free a previously allocated block of memory.
+ void deallocate(T* p, std::size_t) {
+ r1::deallocate_memory(p);
+ }
+
+ //! Returns the allocator type currently in use
+ static malloc_type allocator_type() {
+ return r1::is_tbbmalloc_used() ? standard : scalable;
+ }
+
+#if TBB_ALLOCATOR_TRAITS_BROKEN
+ using pointer = value_type*;
+ using const_pointer = const value_type*;
+ using reference = value_type&;
+ using const_reference = const value_type&;
+ using difference_type = std::ptrdiff_t;
+ using size_type = std::size_t;
+ template<typename U> struct rebind {
+ using other = tbb_allocator<U>;
+ };
+ //! Largest value of n for which allocate(n) might succeed.
+ size_type max_size() const noexcept {
+ size_type max = ~(std::size_t(0)) / sizeof(value_type);
+ return (max > 0 ? max : 1);
+ }
+ template<typename U, typename... Args>
+ void construct(U *p, Args&&... args)
+ { ::new (p) U(std::forward<Args>(args)...); }
+ void destroy( pointer p ) { p->~value_type(); }
+ pointer address(reference x) const { return &x; }
+ const_pointer address(const_reference x) const { return &x; }
+#endif // TBB_ALLOCATOR_TRAITS_BROKEN
+};
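+
+// A minimal usage sketch (illustrative): tbb_allocator can serve as the allocator
+// of any standard container; it forwards to the scalable TBB allocator when the
+// tbbmalloc library is available and to standard malloc/free otherwise.
+//
+//     #include "oneapi/tbb/tbb_allocator.h"
+//     #include <vector>
+//
+//     std::vector<int, tbb::tbb_allocator<int>> v;
+//     v.reserve(1024);                                // memory obtained via allocate()
+//     bool scalable_in_use =
+//         tbb::tbb_allocator<int>::allocator_type() == tbb::tbb_allocator<int>::scalable;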
+
+#if TBB_ALLOCATOR_TRAITS_BROKEN
+ template<>
+ class tbb_allocator<void> {
+ public:
+ using pointer = void*;
+ using const_pointer = const void*;
+ using value_type = void;
+ template<typename U> struct rebind {
+ using other = tbb_allocator<U>;
+ };
+ };
+#endif
+
+template<typename T, typename U>
+inline bool operator==(const tbb_allocator<T>&, const tbb_allocator<U>&) noexcept { return true; }
+
+#if !__TBB_CPP20_COMPARISONS_PRESENT
+template<typename T, typename U>
+inline bool operator!=(const tbb_allocator<T>&, const tbb_allocator<U>&) noexcept { return false; }
+#endif
+
+} // namespace d1
+} // namespace detail
+
+inline namespace v1 {
+using detail::d1::tbb_allocator;
+} // namespace v1
+} // namespace tbb
+
+#endif /* __TBB_tbb_allocator_H */
diff --git a/contrib/libs/tbb/include/oneapi/tbb/tbbmalloc_proxy.h b/contrib/libs/tbb/include/oneapi/tbb/tbbmalloc_proxy.h
new file mode 100644
index 0000000000..0ba38f215e
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/tbbmalloc_proxy.h
@@ -0,0 +1,65 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+/*
+Replacing the standard memory allocation routines in Microsoft* C/C++ RTL
+(malloc/free, global new/delete, etc.) with the TBB memory allocator.
+
+Include the following header in a source file of any binary that is loaded during
+application startup:
+
+#include "oneapi/tbb/tbbmalloc_proxy.h"
+
+or add the following parameters to the linker options of the binary that is loaded
+during application startup. It can be either an executable or a DLL.
+
+For Win32:
+tbbmalloc_proxy.lib /INCLUDE:"___TBB_malloc_proxy"
+For Win64:
+tbbmalloc_proxy.lib /INCLUDE:"__TBB_malloc_proxy"
+*/
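+
+/*
+A minimal sketch of the include-based variant on Windows (main.cpp is an
+illustrative file name): including the header in any translation unit of a
+binary loaded at startup pulls in the proxy, after which ordinary malloc/free
+and new/delete calls are served by the TBB memory allocator at run time.
+
+    // main.cpp
+    #include "oneapi/tbb/tbbmalloc_proxy.h"
+
+    int main() {
+        int* p = new int[1000];   // allocation goes through the TBB allocator
+        delete[] p;
+        return 0;
+    }
+*/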
+
+#ifndef __TBB_tbbmalloc_proxy_H
+#define __TBB_tbbmalloc_proxy_H
+
+#if _MSC_VER
+
+#ifdef _DEBUG
+ #pragma comment(lib, "tbbmalloc_proxy_debug.lib")
+#else
+ #pragma comment(lib, "tbbmalloc_proxy.lib")
+#endif
+
+#if defined(_WIN64)
+ #pragma comment(linker, "/include:__TBB_malloc_proxy")
+#else
+ #pragma comment(linker, "/include:___TBB_malloc_proxy")
+#endif
+
+#else
+/* Primarily to support MinGW */
+
+extern "C" void __TBB_malloc_proxy();
+struct __TBB_malloc_proxy_caller {
+ __TBB_malloc_proxy_caller() { __TBB_malloc_proxy(); }
+} volatile __TBB_malloc_proxy_helper_object;
+
+#endif // _MSC_VER
+
+/* Public Windows API */
+extern "C" int TBB_malloc_replacement_log(char *** function_replacement_log_ptr);
+
+#endif //__TBB_tbbmalloc_proxy_H
diff --git a/contrib/libs/tbb/include/oneapi/tbb/tick_count.h b/contrib/libs/tbb/include/oneapi/tbb/tick_count.h
new file mode 100644
index 0000000000..2caa56ba18
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/tick_count.h
@@ -0,0 +1,99 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_tick_count_H
+#define __TBB_tick_count_H
+
+#include <chrono>
+
+#include "detail/_namespace_injection.h"
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+
+//! Absolute timestamp
+/** @ingroup timing */
+class tick_count {
+public:
+ using clock_type = typename std::conditional<std::chrono::high_resolution_clock::is_steady,
+ std::chrono::high_resolution_clock, std::chrono::steady_clock>::type;
+
+ //! Relative time interval.
+ class interval_t : public clock_type::duration {
+ public:
+ //! Construct a time interval representing zero time duration
+ interval_t() : clock_type::duration(clock_type::duration::zero()) {}
+
+ //! Construct a time interval representing a duration of sec seconds
+ explicit interval_t( double sec )
+ : clock_type::duration(std::chrono::duration_cast<clock_type::duration>(std::chrono::duration<double>(sec))) {}
+
+ //! Return the length of a time interval in seconds
+ double seconds() const {
+ return std::chrono::duration_cast<std::chrono::duration<double>>(*this).count();
+ }
+
+ //! Extract the intervals from the tick_counts and subtract them.
+ friend interval_t operator-( const tick_count& t1, const tick_count& t0 );
+
+ //! Add two intervals.
+ friend interval_t operator+( const interval_t& i, const interval_t& j ) {
+ return interval_t(std::chrono::operator+(i, j));
+ }
+
+ //! Subtract two intervals.
+ friend interval_t operator-( const interval_t& i, const interval_t& j ) {
+ return interval_t(std::chrono::operator-(i, j));
+ }
+
+ private:
+ explicit interval_t( clock_type::duration value_ ) : clock_type::duration(value_) {}
+ };
+
+ tick_count() = default;
+
+ //! Return current time.
+ static tick_count now() {
+ return clock_type::now();
+ }
+
+ //! Subtract two timestamps to get the time interval between them
+ friend interval_t operator-( const tick_count& t1, const tick_count& t0 ) {
+ return tick_count::interval_t(t1.my_time_point - t0.my_time_point);
+ }
+
+ //! Return the resolution of the clock in seconds per tick.
+ static double resolution() {
+ return static_cast<double>(interval_t::period::num) / interval_t::period::den;
+ }
+
+private:
+ clock_type::time_point my_time_point;
+ tick_count( clock_type::time_point tp ) : my_time_point(tp) {}
+};
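+
+// A minimal usage sketch (illustrative; do_work stands for an arbitrary workload):
+// the difference of two tick_count values is an interval_t convertible to seconds.
+//
+//     tbb::tick_count t0 = tbb::tick_count::now();
+//     do_work();
+//     tbb::tick_count t1 = tbb::tick_count::now();
+//     double elapsed_sec = (t1 - t0).seconds();   // elapsed wall-clock time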
+
+} // namespace d1
+} // namespace detail
+
+inline namespace v1 {
+ using detail::d1::tick_count;
+} // namespace v1
+
+} // namespace tbb
+
+#endif /* __TBB_tick_count_H */
diff --git a/contrib/libs/tbb/include/oneapi/tbb/version.h b/contrib/libs/tbb/include/oneapi/tbb/version.h
new file mode 100644
index 0000000000..1e3507cd9b
--- /dev/null
+++ b/contrib/libs/tbb/include/oneapi/tbb/version.h
@@ -0,0 +1,108 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_version_H
+#define __TBB_version_H
+
+#include "detail/_config.h"
+#include "detail/_namespace_injection.h"
+
+// Product version
+#define TBB_VERSION_MAJOR 2021
+// Update version
+#define TBB_VERSION_MINOR 2
+// "Patch" version for custom releases
+#define TBB_VERSION_PATCH 0
+// Suffix string
+#define __TBB_VERSION_SUFFIX ""
+// Full official version string
+#define TBB_VERSION_STRING __TBB_STRING(TBB_VERSION_MAJOR) "." __TBB_STRING(TBB_VERSION_MINOR) __TBB_VERSION_SUFFIX
+
+// OneAPI oneTBB specification version
+#define ONETBB_SPEC_VERSION "1.0"
+// Full interface version
+#define TBB_INTERFACE_VERSION 12020
+// Major interface version
+#define TBB_INTERFACE_VERSION_MAJOR (TBB_INTERFACE_VERSION/1000)
+// Minor interface version
+#define TBB_INTERFACE_VERSION_MINOR (TBB_INTERFACE_VERSION%1000/10)
+
+// The binary compatibility version
+// To be used in SONAME, manifests, etc.
+#define __TBB_BINARY_VERSION 12
+
+//! TBB_VERSION support
+#ifndef ENDL
+#define ENDL "\n"
+#endif
+
+//TBB_REVAMP_TODO: consider enabling version_string.ver generation
+//TBB_REVAMP_TODO: #include "version_string.ver"
+
+#define __TBB_ONETBB_SPEC_VERSION(N) #N ": SPECIFICATION VERSION\t" ONETBB_SPEC_VERSION ENDL
+#define __TBB_VERSION_NUMBER(N) #N ": VERSION\t\t" TBB_VERSION_STRING ENDL
+#define __TBB_INTERFACE_VERSION_NUMBER(N) #N ": INTERFACE VERSION\t" __TBB_STRING(TBB_INTERFACE_VERSION) ENDL
+
+#ifndef TBB_USE_DEBUG
+ #define __TBB_VERSION_USE_DEBUG(N) #N ": TBB_USE_DEBUG\tundefined" ENDL
+#elif TBB_USE_DEBUG==0
+ #define __TBB_VERSION_USE_DEBUG(N) #N ": TBB_USE_DEBUG\t0" ENDL
+#elif TBB_USE_DEBUG==1
+ #define __TBB_VERSION_USE_DEBUG(N) #N ": TBB_USE_DEBUG\t1" ENDL
+#elif TBB_USE_DEBUG==2
+ #define __TBB_VERSION_USE_DEBUG(N) #N ": TBB_USE_DEBUG\t2" ENDL
+#else
+ #error Unexpected value for TBB_USE_DEBUG
+#endif
+
+#ifndef TBB_USE_ASSERT
+ #define __TBB_VERSION_USE_ASSERT(N) #N ": TBB_USE_ASSERT\tundefined" ENDL
+#elif TBB_USE_ASSERT==0
+ #define __TBB_VERSION_USE_ASSERT(N) #N ": TBB_USE_ASSERT\t0" ENDL
+#elif TBB_USE_ASSERT==1
+ #define __TBB_VERSION_USE_ASSERT(N) #N ": TBB_USE_ASSERT\t1" ENDL
+#elif TBB_USE_ASSERT==2
+ #define __TBB_VERSION_USE_ASSERT(N) #N ": TBB_USE_ASSERT\t2" ENDL
+#else
+ #error Unexpected value for TBB_USE_ASSERT
+#endif
+
+#define TBB_VERSION_STRINGS_P(N) \
+ __TBB_ONETBB_SPEC_VERSION(N) \
+ __TBB_VERSION_NUMBER(N) \
+ __TBB_INTERFACE_VERSION_NUMBER(N) \
+ __TBB_VERSION_USE_DEBUG(N) \
+ __TBB_VERSION_USE_ASSERT(N)
+
+#define TBB_VERSION_STRINGS TBB_VERSION_STRINGS_P(oneTBB)
+#define TBBMALLOC_VERSION_STRINGS TBB_VERSION_STRINGS_P(TBBmalloc)
+
+//! The function returns the version string for the Intel(R) oneAPI Threading Building Blocks (oneTBB)
+//! shared library being used.
+/**
+ * The returned pointer is the address of a string inside the shared library.
+ * It can differ from the TBB_VERSION_STRING obtained at compile time.
+ */
+extern "C" const char* __TBB_EXPORTED_FUNC TBB_runtime_version();
+
+//! The function returns the interface version of the oneTBB shared library being used.
+/**
+ * The returned version is determined at runtime, not at compile/link time.
+ * It can differ from the value of TBB_INTERFACE_VERSION obtained at compile time.
+ */
+extern "C" int __TBB_EXPORTED_FUNC TBB_runtime_interface_version();
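+
+// A minimal sketch of comparing the headers against the loaded library (illustrative):
+// the compile-time macros may differ from the run-time values when the binary runs
+// against a different oneTBB shared library than it was built with.
+//
+//     #include <cstdio>
+//
+//     std::printf("compiled: %s (interface %d), running: %s (interface %d)\n",
+//                 TBB_VERSION_STRING, TBB_INTERFACE_VERSION,
+//                 TBB_runtime_version(), TBB_runtime_interface_version());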
+
+#endif // __TBB_version_H
diff --git a/contrib/libs/tbb/include/tbb/blocked_range.h b/contrib/libs/tbb/include/tbb/blocked_range.h
new file mode 100644
index 0000000000..316ec01ba9
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/blocked_range.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/blocked_range.h"
diff --git a/contrib/libs/tbb/include/tbb/blocked_range2d.h b/contrib/libs/tbb/include/tbb/blocked_range2d.h
new file mode 100644
index 0000000000..1e13240787
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/blocked_range2d.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/blocked_range2d.h"
diff --git a/contrib/libs/tbb/include/tbb/blocked_range3d.h b/contrib/libs/tbb/include/tbb/blocked_range3d.h
new file mode 100644
index 0000000000..3321979660
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/blocked_range3d.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/blocked_range3d.h"
diff --git a/contrib/libs/tbb/include/tbb/blocked_rangeNd.h b/contrib/libs/tbb/include/tbb/blocked_rangeNd.h
new file mode 100644
index 0000000000..0c0fb7303a
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/blocked_rangeNd.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2017-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/blocked_rangeNd.h"
diff --git a/contrib/libs/tbb/include/tbb/cache_aligned_allocator.h b/contrib/libs/tbb/include/tbb/cache_aligned_allocator.h
new file mode 100644
index 0000000000..2d3c66a74a
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/cache_aligned_allocator.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/cache_aligned_allocator.h"
diff --git a/contrib/libs/tbb/include/tbb/combinable.h b/contrib/libs/tbb/include/tbb/combinable.h
new file mode 100644
index 0000000000..50295ec72a
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/combinable.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/combinable.h"
diff --git a/contrib/libs/tbb/include/tbb/concurrent_hash_map.h b/contrib/libs/tbb/include/tbb/concurrent_hash_map.h
new file mode 100644
index 0000000000..68652c5961
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/concurrent_hash_map.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/concurrent_hash_map.h"
diff --git a/contrib/libs/tbb/include/tbb/concurrent_lru_cache.h b/contrib/libs/tbb/include/tbb/concurrent_lru_cache.h
new file mode 100644
index 0000000000..2757a234be
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/concurrent_lru_cache.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/concurrent_lru_cache.h"
diff --git a/contrib/libs/tbb/include/tbb/concurrent_map.h b/contrib/libs/tbb/include/tbb/concurrent_map.h
new file mode 100644
index 0000000000..84f59d7e66
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/concurrent_map.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2019-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/concurrent_map.h"
diff --git a/contrib/libs/tbb/include/tbb/concurrent_priority_queue.h b/contrib/libs/tbb/include/tbb/concurrent_priority_queue.h
new file mode 100644
index 0000000000..3b27130b1e
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/concurrent_priority_queue.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/concurrent_priority_queue.h"
diff --git a/contrib/libs/tbb/include/tbb/concurrent_queue.h b/contrib/libs/tbb/include/tbb/concurrent_queue.h
new file mode 100644
index 0000000000..d81a58b887
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/concurrent_queue.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/concurrent_queue.h"
diff --git a/contrib/libs/tbb/include/tbb/concurrent_set.h b/contrib/libs/tbb/include/tbb/concurrent_set.h
new file mode 100644
index 0000000000..cf4652f597
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/concurrent_set.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2019-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/concurrent_set.h"
diff --git a/contrib/libs/tbb/include/tbb/concurrent_unordered_map.h b/contrib/libs/tbb/include/tbb/concurrent_unordered_map.h
new file mode 100644
index 0000000000..9475c06cf3
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/concurrent_unordered_map.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/concurrent_unordered_map.h"
diff --git a/contrib/libs/tbb/include/tbb/concurrent_unordered_set.h b/contrib/libs/tbb/include/tbb/concurrent_unordered_set.h
new file mode 100644
index 0000000000..81a8f9c37d
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/concurrent_unordered_set.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/concurrent_unordered_set.h"
diff --git a/contrib/libs/tbb/include/tbb/concurrent_vector.h b/contrib/libs/tbb/include/tbb/concurrent_vector.h
new file mode 100644
index 0000000000..c1fc97c623
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/concurrent_vector.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/concurrent_vector.h"
diff --git a/contrib/libs/tbb/include/tbb/enumerable_thread_specific.h b/contrib/libs/tbb/include/tbb/enumerable_thread_specific.h
new file mode 100644
index 0000000000..9d6050d64f
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/enumerable_thread_specific.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/enumerable_thread_specific.h"
diff --git a/contrib/libs/tbb/include/tbb/flow_graph.h b/contrib/libs/tbb/include/tbb/flow_graph.h
new file mode 100644
index 0000000000..40da468fe0
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/flow_graph.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/flow_graph.h"
diff --git a/contrib/libs/tbb/include/tbb/flow_graph_abstractions.h b/contrib/libs/tbb/include/tbb/flow_graph_abstractions.h
new file mode 100644
index 0000000000..cd9dc2967e
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/flow_graph_abstractions.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/flow_graph_abstractions.h"
diff --git a/contrib/libs/tbb/include/tbb/global_control.h b/contrib/libs/tbb/include/tbb/global_control.h
new file mode 100644
index 0000000000..2688996ecb
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/global_control.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/global_control.h"
diff --git a/contrib/libs/tbb/include/tbb/info.h b/contrib/libs/tbb/include/tbb/info.h
new file mode 100644
index 0000000000..02d331650e
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/info.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2019-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/info.h"
diff --git a/contrib/libs/tbb/include/tbb/memory_pool.h b/contrib/libs/tbb/include/tbb/memory_pool.h
new file mode 100644
index 0000000000..cefe96e36d
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/memory_pool.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/memory_pool.h"
diff --git a/contrib/libs/tbb/include/tbb/null_mutex.h b/contrib/libs/tbb/include/tbb/null_mutex.h
new file mode 100644
index 0000000000..63218bf061
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/null_mutex.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/null_mutex.h"
diff --git a/contrib/libs/tbb/include/tbb/null_rw_mutex.h b/contrib/libs/tbb/include/tbb/null_rw_mutex.h
new file mode 100644
index 0000000000..71c42fe26a
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/null_rw_mutex.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/null_rw_mutex.h"
diff --git a/contrib/libs/tbb/include/tbb/parallel_for.h b/contrib/libs/tbb/include/tbb/parallel_for.h
new file mode 100644
index 0000000000..fea1d1b9f5
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/parallel_for.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/parallel_for.h"
diff --git a/contrib/libs/tbb/include/tbb/parallel_for_each.h b/contrib/libs/tbb/include/tbb/parallel_for_each.h
new file mode 100644
index 0000000000..27c2ab1727
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/parallel_for_each.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/parallel_for_each.h"
diff --git a/contrib/libs/tbb/include/tbb/parallel_invoke.h b/contrib/libs/tbb/include/tbb/parallel_invoke.h
new file mode 100644
index 0000000000..6c21100e70
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/parallel_invoke.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/parallel_invoke.h"
diff --git a/contrib/libs/tbb/include/tbb/parallel_pipeline.h b/contrib/libs/tbb/include/tbb/parallel_pipeline.h
new file mode 100644
index 0000000000..aceee49f8a
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/parallel_pipeline.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/parallel_pipeline.h"
diff --git a/contrib/libs/tbb/include/tbb/parallel_reduce.h b/contrib/libs/tbb/include/tbb/parallel_reduce.h
new file mode 100644
index 0000000000..83658755a8
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/parallel_reduce.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/parallel_reduce.h"
diff --git a/contrib/libs/tbb/include/tbb/parallel_scan.h b/contrib/libs/tbb/include/tbb/parallel_scan.h
new file mode 100644
index 0000000000..682032a561
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/parallel_scan.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/parallel_scan.h"
diff --git a/contrib/libs/tbb/include/tbb/parallel_sort.h b/contrib/libs/tbb/include/tbb/parallel_sort.h
new file mode 100644
index 0000000000..b238e6caa4
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/parallel_sort.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/parallel_sort.h"
diff --git a/contrib/libs/tbb/include/tbb/partitioner.h b/contrib/libs/tbb/include/tbb/partitioner.h
new file mode 100644
index 0000000000..b959e35a2f
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/partitioner.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/partitioner.h"
diff --git a/contrib/libs/tbb/include/tbb/profiling.h b/contrib/libs/tbb/include/tbb/profiling.h
new file mode 100644
index 0000000000..c7cea9c590
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/profiling.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/profiling.h"
diff --git a/contrib/libs/tbb/include/tbb/queuing_mutex.h b/contrib/libs/tbb/include/tbb/queuing_mutex.h
new file mode 100644
index 0000000000..ad031e4eb7
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/queuing_mutex.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/queuing_mutex.h"
diff --git a/contrib/libs/tbb/include/tbb/queuing_rw_mutex.h b/contrib/libs/tbb/include/tbb/queuing_rw_mutex.h
new file mode 100644
index 0000000000..203727ccc5
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/queuing_rw_mutex.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/queuing_rw_mutex.h"
diff --git a/contrib/libs/tbb/include/tbb/scalable_allocator.h b/contrib/libs/tbb/include/tbb/scalable_allocator.h
new file mode 100644
index 0000000000..5c654ebd68
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/scalable_allocator.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/scalable_allocator.h"
diff --git a/contrib/libs/tbb/include/tbb/spin_mutex.h b/contrib/libs/tbb/include/tbb/spin_mutex.h
new file mode 100644
index 0000000000..1a6f7f077f
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/spin_mutex.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/spin_mutex.h"
diff --git a/contrib/libs/tbb/include/tbb/spin_rw_mutex.h b/contrib/libs/tbb/include/tbb/spin_rw_mutex.h
new file mode 100644
index 0000000000..d36282b486
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/spin_rw_mutex.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/spin_rw_mutex.h"
diff --git a/contrib/libs/tbb/include/tbb/task.h b/contrib/libs/tbb/include/tbb/task.h
new file mode 100644
index 0000000000..9be95b0d69
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/task.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/task.h"
diff --git a/contrib/libs/tbb/include/tbb/task_arena.h b/contrib/libs/tbb/include/tbb/task_arena.h
new file mode 100644
index 0000000000..f6e34b3e6d
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/task_arena.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/task_arena.h"
diff --git a/contrib/libs/tbb/include/tbb/task_group.h b/contrib/libs/tbb/include/tbb/task_group.h
new file mode 100644
index 0000000000..2f02503971
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/task_group.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/task_group.h"
diff --git a/contrib/libs/tbb/include/tbb/task_scheduler_observer.h b/contrib/libs/tbb/include/tbb/task_scheduler_observer.h
new file mode 100644
index 0000000000..9236f4cdf4
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/task_scheduler_observer.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/task_scheduler_observer.h"
diff --git a/contrib/libs/tbb/include/tbb/tbb.h b/contrib/libs/tbb/include/tbb/tbb.h
new file mode 100644
index 0000000000..e443b8f1ca
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/tbb.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb.h"
diff --git a/contrib/libs/tbb/include/tbb/tbb_allocator.h b/contrib/libs/tbb/include/tbb/tbb_allocator.h
new file mode 100644
index 0000000000..81ab9d33b5
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/tbb_allocator.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/tbb_allocator.h"
diff --git a/contrib/libs/tbb/include/tbb/tbbmalloc_proxy.h b/contrib/libs/tbb/include/tbb/tbbmalloc_proxy.h
new file mode 100644
index 0000000000..93eaa18e80
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/tbbmalloc_proxy.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/tbbmalloc_proxy.h"
diff --git a/contrib/libs/tbb/include/tbb/tick_count.h b/contrib/libs/tbb/include/tbb/tick_count.h
new file mode 100644
index 0000000000..170074aebb
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/tick_count.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/tick_count.h"
diff --git a/contrib/libs/tbb/include/tbb/version.h b/contrib/libs/tbb/include/tbb/version.h
new file mode 100644
index 0000000000..cd13a83a15
--- /dev/null
+++ b/contrib/libs/tbb/include/tbb/version.h
@@ -0,0 +1,17 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "../oneapi/tbb/version.h"
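Note on the hunks above: each new tbb/*.h file is a one-line forwarding shim, carrying only the Apache-2.0 banner and an #include of the matching oneapi/tbb/ header, so code written against the classic tbb/ include paths keeps compiling against the bundled oneTBB. Below is a minimal usage sketch, not part of the diff; the build setup (compiling with contrib/libs/tbb/include on the include path and linking against the bundled TBB runtime) is an assumption.

// Minimal sketch, assuming -Icontrib/libs/tbb/include and linkage against the bundled TBB runtime.
// The legacy include path resolves to the forwarding header added in this diff,
// which in turn pulls in ../oneapi/tbb/task_group.h.
#include "tbb/task_group.h"
#include <iostream>

int main() {
    tbb::task_group tg;               // tbb:: names come from the oneTBB implementation headers
    int answer = 0;
    tg.run([&] { answer = 6 * 7; });  // schedule a task to run asynchronously
    tg.wait();                        // block until all tasks in the group have finished
    std::cout << answer << '\n';      // prints 42
    return 0;
}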