author     Devtools Arcadia <arcadia-devtools@yandex-team.ru>  2022-02-07 18:08:42 +0300
committer  Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net>  2022-02-07 18:08:42 +0300
commit     1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
tree       e26c9fed0de5d9873cce7e00bc214573dc2195b7  /contrib/libs/tbb/src
download   ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref: cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'contrib/libs/tbb/src')
-rw-r--r--  contrib/libs/tbb/src/tbb/allocator.cpp  234
-rw-r--r--  contrib/libs/tbb/src/tbb/arena.cpp  757
-rw-r--r--  contrib/libs/tbb/src/tbb/arena.h  616
-rw-r--r--  contrib/libs/tbb/src/tbb/arena_slot.cpp  219
-rw-r--r--  contrib/libs/tbb/src/tbb/arena_slot.h  409
-rw-r--r--  contrib/libs/tbb/src/tbb/assert_impl.h  71
-rw-r--r--  contrib/libs/tbb/src/tbb/co_context.h  222
-rw-r--r--  contrib/libs/tbb/src/tbb/concurrent_bounded_queue.cpp  84
-rw-r--r--  contrib/libs/tbb/src/tbb/concurrent_monitor.h  529
-rw-r--r--  contrib/libs/tbb/src/tbb/def/lin64-tbb.def  153
-rw-r--r--  contrib/libs/tbb/src/tbb/dynamic_link.cpp  477
-rw-r--r--  contrib/libs/tbb/src/tbb/dynamic_link.h  115
-rw-r--r--  contrib/libs/tbb/src/tbb/environment.h  81
-rw-r--r--  contrib/libs/tbb/src/tbb/exception.cpp  162
-rw-r--r--  contrib/libs/tbb/src/tbb/global_control.cpp  275
-rw-r--r--  contrib/libs/tbb/src/tbb/governor.cpp  526
-rw-r--r--  contrib/libs/tbb/src/tbb/governor.h  158
-rw-r--r--  contrib/libs/tbb/src/tbb/intrusive_list.h  242
-rw-r--r--  contrib/libs/tbb/src/tbb/itt_notify.cpp  69
-rw-r--r--  contrib/libs/tbb/src/tbb/itt_notify.h  114
-rw-r--r--  contrib/libs/tbb/src/tbb/mailbox.h  249
-rw-r--r--  contrib/libs/tbb/src/tbb/main.cpp  171
-rw-r--r--  contrib/libs/tbb/src/tbb/main.h  99
-rw-r--r--  contrib/libs/tbb/src/tbb/market.cpp  640
-rw-r--r--  contrib/libs/tbb/src/tbb/market.h  317
-rw-r--r--  contrib/libs/tbb/src/tbb/misc.cpp  137
-rw-r--r--  contrib/libs/tbb/src/tbb/misc.h  289
-rw-r--r--  contrib/libs/tbb/src/tbb/misc_ex.cpp  398
-rw-r--r--  contrib/libs/tbb/src/tbb/observer_proxy.cpp  322
-rw-r--r--  contrib/libs/tbb/src/tbb/observer_proxy.h  154
-rw-r--r--  contrib/libs/tbb/src/tbb/parallel_pipeline.cpp  471
-rw-r--r--  contrib/libs/tbb/src/tbb/private_server.cpp  420
-rw-r--r--  contrib/libs/tbb/src/tbb/profiling.cpp  265
-rw-r--r--  contrib/libs/tbb/src/tbb/queuing_rw_mutex.cpp  558
-rw-r--r--  contrib/libs/tbb/src/tbb/rml_base.h  163
-rw-r--r--  contrib/libs/tbb/src/tbb/rml_tbb.cpp  113
-rw-r--r--  contrib/libs/tbb/src/tbb/rml_tbb.h  94
-rw-r--r--  contrib/libs/tbb/src/tbb/rml_thread_monitor.h  258
-rw-r--r--  contrib/libs/tbb/src/tbb/rtm_mutex.cpp  120
-rw-r--r--  contrib/libs/tbb/src/tbb/rtm_rw_mutex.cpp  271
-rw-r--r--  contrib/libs/tbb/src/tbb/scheduler_common.h  505
-rw-r--r--  contrib/libs/tbb/src/tbb/semaphore.cpp  92
-rw-r--r--  contrib/libs/tbb/src/tbb/semaphore.h  335
-rw-r--r--  contrib/libs/tbb/src/tbb/small_object_pool.cpp  154
-rw-r--r--  contrib/libs/tbb/src/tbb/small_object_pool_impl.h  59
-rw-r--r--  contrib/libs/tbb/src/tbb/task.cpp  225
-rw-r--r--  contrib/libs/tbb/src/tbb/task_dispatcher.cpp  240
-rw-r--r--  contrib/libs/tbb/src/tbb/task_dispatcher.h  465
-rw-r--r--  contrib/libs/tbb/src/tbb/task_group_context.cpp  493
-rw-r--r--  contrib/libs/tbb/src/tbb/task_stream.h  288
-rw-r--r--  contrib/libs/tbb/src/tbb/thread_data.h  273
-rw-r--r--  contrib/libs/tbb/src/tbb/tls.h  93
-rw-r--r--  contrib/libs/tbb/src/tbb/tools_api/disable_warnings.h  35
-rw-r--r--  contrib/libs/tbb/src/tbb/tools_api/ittnotify.h  4165
-rw-r--r--  contrib/libs/tbb/src/tbb/tools_api/ittnotify_config.h  585
-rw-r--r--  contrib/libs/tbb/src/tbb/tools_api/ittnotify_static.c  1244
-rw-r--r--  contrib/libs/tbb/src/tbb/tools_api/ittnotify_static.h  354
-rw-r--r--  contrib/libs/tbb/src/tbb/tools_api/ittnotify_types.h  73
-rw-r--r--  contrib/libs/tbb/src/tbb/tools_api/legacy/ittnotify.h  998
-rw-r--r--  contrib/libs/tbb/src/tbb/version.cpp  26
-rw-r--r--  contrib/libs/tbb/src/tbb/waiters.h  204
61 files changed, 21928 insertions, 0 deletions
diff --git a/contrib/libs/tbb/src/tbb/allocator.cpp b/contrib/libs/tbb/src/tbb/allocator.cpp
new file mode 100644
index 0000000000..6bf5a0be01
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/allocator.cpp
@@ -0,0 +1,234 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "oneapi/tbb/version.h"
+
+#include "oneapi/tbb/detail/_exception.h"
+#include "oneapi/tbb/detail/_assert.h"
+#include "oneapi/tbb/detail/_utils.h"
+
+#include "dynamic_link.h"
+#include "misc.h"
+
+#include <cstdlib>
+
+#if _WIN32 || _WIN64
+#include <Windows.h>
+#else
+#include <dlfcn.h>
+#endif /* _WIN32||_WIN64 */
+
+#if __TBB_WEAK_SYMBOLS_PRESENT
+
+#pragma weak scalable_malloc
+#pragma weak scalable_free
+#pragma weak scalable_aligned_malloc
+#pragma weak scalable_aligned_free
+
+extern "C" {
+ void* scalable_malloc(std::size_t);
+ void scalable_free(void*);
+ void* scalable_aligned_malloc(std::size_t, std::size_t);
+ void scalable_aligned_free(void*);
+}
+
+#endif /* __TBB_WEAK_SYMBOLS_PRESENT */
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+//! Initialization routine used for first indirect call via allocate_handler.
+static void* initialize_allocate_handler(std::size_t size);
+
+//! Handler for memory allocation
+static void* (*allocate_handler)(std::size_t size) = &initialize_allocate_handler;
+
+//! Handler for memory deallocation
+static void (*deallocate_handler)(void* pointer) = nullptr;
+
+//! Initialization routine used for first indirect call via cache_aligned_allocate_handler.
+static void* initialize_cache_aligned_allocate_handler(std::size_t n, std::size_t alignment);
+
+//! Allocates memory using standard malloc. It is used when scalable_allocator is not available
+static void* std_cache_aligned_allocate(std::size_t n, std::size_t alignment);
+
+//! Deallocates memory using standard free. It is used when scalable_allocator is not available
+static void std_cache_aligned_deallocate(void* p);
+
+//! Handler for padded memory allocation
+static void* (*cache_aligned_allocate_handler)(std::size_t n, std::size_t alignment) = &initialize_cache_aligned_allocate_handler;
+
+//! Handler for padded memory deallocation
+static void (*cache_aligned_deallocate_handler)(void* p) = nullptr;
+
+//! Table describing how to link the handlers.
+static const dynamic_link_descriptor MallocLinkTable[] = {
+ DLD(scalable_malloc, allocate_handler),
+ DLD(scalable_free, deallocate_handler),
+ DLD(scalable_aligned_malloc, cache_aligned_allocate_handler),
+ DLD(scalable_aligned_free, cache_aligned_deallocate_handler),
+};
+
+
+#if TBB_USE_DEBUG
+#define DEBUG_SUFFIX "_debug"
+#else
+#define DEBUG_SUFFIX
+#endif /* TBB_USE_DEBUG */
+
+// MALLOCLIB_NAME is the name of the oneTBB memory allocator library.
+#if _WIN32||_WIN64
+#define MALLOCLIB_NAME "tbbmalloc" DEBUG_SUFFIX ".dll"
+#elif __APPLE__
+#define MALLOCLIB_NAME "libtbbmalloc" DEBUG_SUFFIX ".dylib"
+#elif __FreeBSD__ || __NetBSD__ || __OpenBSD__ || __sun || _AIX || __ANDROID__
+#define MALLOCLIB_NAME "libtbbmalloc" DEBUG_SUFFIX ".so"
+#elif __linux__ // Note that order of these #elif's is important!
+#define MALLOCLIB_NAME "libtbbmalloc" DEBUG_SUFFIX ".so.2"
+#else
+#error Unknown OS
+#endif
+
+//! Initialize the allocation/free handler pointers.
+/** Caller is responsible for ensuring this routine is called exactly once.
+ The routine attempts to dynamically link with the TBB memory allocator.
+ If that allocator is not found, it links to malloc and free. */
+void initialize_handler_pointers() {
+ __TBB_ASSERT(allocate_handler == &initialize_allocate_handler, NULL);
+ bool success = dynamic_link(MALLOCLIB_NAME, MallocLinkTable, 4);
+ if(!success) {
+ // If unsuccessful, set the handlers to the default routines.
+ // This must be done now, and not before FillDynamicLinks runs, because if other
+ // threads call the handlers, we want them to go through the DoOneTimeInitializations logic,
+ // which forces them to wait.
+ allocate_handler = &std::malloc;
+ deallocate_handler = &std::free;
+ cache_aligned_allocate_handler = &std_cache_aligned_allocate;
+ cache_aligned_deallocate_handler = &std_cache_aligned_deallocate;
+ }
+
+ PrintExtraVersionInfo( "ALLOCATOR", success?"scalable_malloc":"malloc" );
+}
+
+static std::once_flag initialization_state;
+void initialize_cache_aligned_allocator() {
+ std::call_once(initialization_state, &initialize_handler_pointers);
+}
+
+//! Executed on very first call through allocate_handler
+static void* initialize_allocate_handler(std::size_t size) {
+ initialize_cache_aligned_allocator();
+ __TBB_ASSERT(allocate_handler != &initialize_allocate_handler, NULL);
+ return (*allocate_handler)(size);
+}
+
+//! Executed on very first call through cache_aligned_allocate_handler
+static void* initialize_cache_aligned_allocate_handler(std::size_t bytes, std::size_t alignment) {
+ initialize_cache_aligned_allocator();
+ __TBB_ASSERT(cache_aligned_allocate_handler != &initialize_cache_aligned_allocate_handler, NULL);
+ return (*cache_aligned_allocate_handler)(bytes, alignment);
+}
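The allocation entry points above rely on a self-replacing function pointer: allocate_handler initially points at initialize_allocate_handler, which performs the one-time setup under std::call_once and then forwards the call to whichever handler the setup installed (scalable_malloc on successful dynamic linking, std::malloc otherwise). Below is a minimal standalone sketch of that bootstrap pattern; the names (bootstrap_alloc, install_handlers, alloc_handler) are hypothetical, and std::malloc stands in for the dynamically linked allocator.

#include <cstdlib>
#include <mutex>

// Hypothetical sketch of the lazy handler bootstrap used above, not TBB code.
static void* bootstrap_alloc(std::size_t n);                   // forward declaration
static void* (*alloc_handler)(std::size_t) = &bootstrap_alloc; // first call bootstraps

static std::once_flag init_flag;

static void install_handlers() {
    // The real code first tries dynamic_link(MALLOCLIB_NAME, ...);
    // this sketch simply installs the malloc fallback.
    alloc_handler = &std::malloc;
}

static void* bootstrap_alloc(std::size_t n) {
    // The first call through alloc_handler lands here; call_once makes sure the
    // handlers are installed exactly once, then the call is forwarded.
    std::call_once(init_flag, install_handlers);
    return (*alloc_handler)(n);
}

int main() {
    void* p = alloc_handler(64);   // triggers initialization on first use
    std::free(p);
    return 0;
}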
+
+// TODO: use CPUID to find actual line size, though consider backward compatibility
+// nfs - no false sharing
+static constexpr std::size_t nfs_size = 128;
+
+std::size_t __TBB_EXPORTED_FUNC cache_line_size() {
+ return nfs_size;
+}
+
+void* __TBB_EXPORTED_FUNC cache_aligned_allocate(std::size_t size) {
+ const std::size_t cache_line_size = nfs_size;
+ __TBB_ASSERT(is_power_of_two(cache_line_size), "must be power of two");
+
+ // Check for overflow
+ if (size + cache_line_size < size) {
+ throw_exception(exception_id::bad_alloc);
+ }
+ // scalable_aligned_malloc considers zero size request an error, and returns NULL
+ if (size == 0) size = 1;
+
+ void* result = cache_aligned_allocate_handler(size, cache_line_size);
+ if (!result) {
+ throw_exception(exception_id::bad_alloc);
+ }
+ __TBB_ASSERT(is_aligned(result, cache_line_size), "The returned address isn't aligned");
+ return result;
+}
+
+void __TBB_EXPORTED_FUNC cache_aligned_deallocate(void* p) {
+ __TBB_ASSERT(cache_aligned_deallocate_handler, "Initialization has not been done yet.");
+ (*cache_aligned_deallocate_handler)(p);
+}
+
+static void* std_cache_aligned_allocate(std::size_t bytes, std::size_t alignment) {
+ // TODO: make it common with cache_aligned_resource
+ std::size_t space = alignment + bytes;
+ std::uintptr_t base = reinterpret_cast<std::uintptr_t>(std::malloc(space));
+ if (!base) {
+ return nullptr;
+ }
+ std::uintptr_t result = (base + nfs_size) & ~(nfs_size - 1);
+ // Round up to the next cache line (align the base address)
+ __TBB_ASSERT((result - base) >= sizeof(std::uintptr_t), "Cannot store a base pointer to the header");
+ __TBB_ASSERT(space - (result - base) >= bytes, "Not enough space for the storage");
+
+ // Record where block actually starts.
+ (reinterpret_cast<std::uintptr_t*>(result))[-1] = base;
+ return reinterpret_cast<void*>(result);
+}
+
+static void std_cache_aligned_deallocate(void* p) {
+ if (p) {
+ __TBB_ASSERT(reinterpret_cast<std::uintptr_t>(p) >= 0x4096, "attempt to free block not obtained from cache_aligned_allocator");
+ // Recover where block actually starts
+ std::uintptr_t base = (reinterpret_cast<std::uintptr_t*>(p))[-1];
+ __TBB_ASSERT(((base + nfs_size) & ~(nfs_size - 1)) == reinterpret_cast<std::uintptr_t>(p), "Incorrect alignment or not allocated by std_cache_aligned_deallocate?");
+ std::free(reinterpret_cast<void*>(base));
+ }
+}
+
+void* __TBB_EXPORTED_FUNC allocate_memory(std::size_t size) {
+ void* result = (*allocate_handler)(size);
+ if (!result) {
+ throw_exception(exception_id::bad_alloc);
+ }
+ return result;
+}
+
+void __TBB_EXPORTED_FUNC deallocate_memory(void* p) {
+ if (p) {
+ __TBB_ASSERT(deallocate_handler, "Initialization has not been done yet.");
+ (*deallocate_handler)(p);
+ }
+}
+
+bool __TBB_EXPORTED_FUNC is_tbbmalloc_used() {
+ if (allocate_handler == &initialize_allocate_handler) {
+ void* void_ptr = allocate_handler(1);
+ deallocate_handler(void_ptr);
+ }
+ __TBB_ASSERT(allocate_handler != &initialize_allocate_handler && deallocate_handler != nullptr, NULL);
+ // Cast to void avoids type mismatch errors on some compilers (e.g. __IBMCPP__)
+ __TBB_ASSERT((reinterpret_cast<void*>(allocate_handler) == reinterpret_cast<void*>(&std::malloc)) == (reinterpret_cast<void*>(deallocate_handler) == reinterpret_cast<void*>(&std::free)),
+ "Both shim pointers must refer to routines from the same package (either TBB or CRT)");
+ return reinterpret_cast<void*>(allocate_handler) == reinterpret_cast<void*>(&std::malloc);
+}
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
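std_cache_aligned_allocate and std_cache_aligned_deallocate above use the classic fallback scheme of over-allocating with std::malloc, rounding the address up to the requested alignment, and stashing the original base pointer one word below the returned address so it can be recovered on free. A self-contained sketch of the same idea follows; demo_aligned_alloc/demo_aligned_free are hypothetical names, and it assumes the alignment is a power of two at least as large as what std::malloc already guarantees, so the hidden header always has room for one pointer.

#include <cassert>
#include <cstdint>
#include <cstdlib>

// Hypothetical demo: over-allocate, align up, and remember the malloc'ed
// base address one word below the pointer handed back to the caller.
static void* demo_aligned_alloc(std::size_t bytes, std::size_t alignment) {
    assert((alignment & (alignment - 1)) == 0 && "alignment must be a power of two");
    std::uintptr_t base = reinterpret_cast<std::uintptr_t>(std::malloc(alignment + bytes));
    if (!base) return nullptr;
    // Round up past at least one header word to the next alignment boundary.
    std::uintptr_t result = (base + alignment) & ~(alignment - 1);
    reinterpret_cast<std::uintptr_t*>(result)[-1] = base;   // hidden header
    return reinterpret_cast<void*>(result);
}

static void demo_aligned_free(void* p) {
    if (!p) return;
    // Recover the real start of the block from the hidden header and free it.
    std::free(reinterpret_cast<void*>(reinterpret_cast<std::uintptr_t*>(p)[-1]));
}

int main() {
    void* p = demo_aligned_alloc(100, 128);
    assert(reinterpret_cast<std::uintptr_t>(p) % 128 == 0);
    demo_aligned_free(p);
    return 0;
}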
diff --git a/contrib/libs/tbb/src/tbb/arena.cpp b/contrib/libs/tbb/src/tbb/arena.cpp
new file mode 100644
index 0000000000..1ddab36ff5
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/arena.cpp
@@ -0,0 +1,757 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "task_dispatcher.h"
+#include "governor.h"
+#include "arena.h"
+#include "itt_notify.h"
+#include "semaphore.h"
+#include "waiters.h"
+#include "oneapi/tbb/detail/_task.h"
+#include "oneapi/tbb/info.h"
+#include "oneapi/tbb/tbb_allocator.h"
+
+#include <atomic>
+#include <cstring>
+#include <functional>
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+#if __TBB_ARENA_BINDING
+class numa_binding_observer : public tbb::task_scheduler_observer {
+ binding_handler* my_binding_handler;
+public:
+ numa_binding_observer( d1::task_arena* ta, int num_slots, int numa_id, core_type_id core_type, int max_threads_per_core )
+ : task_scheduler_observer(*ta)
+ , my_binding_handler(construct_binding_handler(num_slots, numa_id, core_type, max_threads_per_core))
+ {}
+
+ void on_scheduler_entry( bool ) override {
+ apply_affinity_mask(my_binding_handler, this_task_arena::current_thread_index());
+ }
+
+ void on_scheduler_exit( bool ) override {
+ restore_affinity_mask(my_binding_handler, this_task_arena::current_thread_index());
+ }
+
+ ~numa_binding_observer(){
+ destroy_binding_handler(my_binding_handler);
+ }
+};
+
+numa_binding_observer* construct_binding_observer( d1::task_arena* ta, int num_slots, int numa_id, core_type_id core_type, int max_threads_per_core ) {
+ numa_binding_observer* binding_observer = nullptr;
+ if ((core_type >= 0 && core_type_count() > 1) || (numa_id >= 0 && numa_node_count() > 1) || max_threads_per_core > 0) {
+ binding_observer = new(allocate_memory(sizeof(numa_binding_observer))) numa_binding_observer(ta, num_slots, numa_id, core_type, max_threads_per_core);
+ __TBB_ASSERT(binding_observer, "Failure during NUMA binding observer allocation and construction");
+ binding_observer->observe(true);
+ }
+ return binding_observer;
+}
+
+void destroy_binding_observer( numa_binding_observer* binding_observer ) {
+ __TBB_ASSERT(binding_observer, "Trying to deallocate NULL pointer");
+ binding_observer->observe(false);
+ binding_observer->~numa_binding_observer();
+ deallocate_memory(binding_observer);
+}
+#endif /*__TBB_ARENA_BINDING*/
+
+std::size_t arena::occupy_free_slot_in_range( thread_data& tls, std::size_t lower, std::size_t upper ) {
+ if ( lower >= upper ) return out_of_arena;
+ // Start search for an empty slot from the one we occupied the last time
+ std::size_t index = tls.my_arena_index;
+ if ( index < lower || index >= upper ) index = tls.my_random.get() % (upper - lower) + lower;
+ __TBB_ASSERT( index >= lower && index < upper, NULL );
+ // Find a free slot
+ for ( std::size_t i = index; i < upper; ++i )
+ if (my_slots[i].try_occupy()) return i;
+ for ( std::size_t i = lower; i < index; ++i )
+ if (my_slots[i].try_occupy()) return i;
+ return out_of_arena;
+}
+
+template <bool as_worker>
+std::size_t arena::occupy_free_slot(thread_data& tls) {
+ // Firstly, external threads try to occupy reserved slots
+ std::size_t index = as_worker ? out_of_arena : occupy_free_slot_in_range( tls, 0, my_num_reserved_slots );
+ if ( index == out_of_arena ) {
+ // Secondly, all threads try to occupy all non-reserved slots
+ index = occupy_free_slot_in_range(tls, my_num_reserved_slots, my_num_slots );
+ // Likely this arena is already saturated
+ if ( index == out_of_arena )
+ return out_of_arena;
+ }
+
+ atomic_update( my_limit, (unsigned)(index + 1), std::less<unsigned>() );
+ return index;
+}
+
+std::uintptr_t arena::calculate_stealing_threshold() {
+ stack_anchor_type anchor;
+ return r1::calculate_stealing_threshold(reinterpret_cast<std::uintptr_t>(&anchor), my_market->worker_stack_size());
+}
+
+void arena::process(thread_data& tls) {
+ governor::set_thread_data(tls); // TODO: consider moving to create_one_job.
+ __TBB_ASSERT( is_alive(my_guard), nullptr);
+ __TBB_ASSERT( my_num_slots > 1, nullptr);
+
+ std::size_t index = occupy_free_slot</*as_worker*/true>(tls);
+ if (index == out_of_arena) {
+ on_thread_leaving<ref_worker>();
+ return;
+ }
+ __TBB_ASSERT( index >= my_num_reserved_slots, "Workers cannot occupy reserved slots" );
+ tls.attach_arena(*this, index);
+
+ task_dispatcher& task_disp = tls.my_arena_slot->default_task_dispatcher();
+ task_disp.set_stealing_threshold(calculate_stealing_threshold());
+ __TBB_ASSERT(task_disp.can_steal(), nullptr);
+ tls.attach_task_dispatcher(task_disp);
+
+ __TBB_ASSERT( !tls.my_last_observer, "There cannot be notified local observers when entering arena" );
+ my_observers.notify_entry_observers(tls.my_last_observer, tls.my_is_worker);
+
+ // Waiting on special object tied to this arena
+ outermost_worker_waiter waiter(*this);
+ d1::task* t = tls.my_task_dispatcher->local_wait_for_all(nullptr, waiter);
+ __TBB_ASSERT_EX(t == nullptr, "Outermost worker must not leave dispatch loop with a task");
+ __TBB_ASSERT(governor::is_thread_data_set(&tls), nullptr);
+ __TBB_ASSERT(tls.my_task_dispatcher == &task_disp, nullptr);
+
+ my_observers.notify_exit_observers(tls.my_last_observer, tls.my_is_worker);
+ tls.my_last_observer = nullptr;
+
+ task_disp.set_stealing_threshold(0);
+ tls.detach_task_dispatcher();
+
+ // Arena slot detach (arena may be used in market::process)
+ // TODO: Consider moving several calls below into a new method(e.g.detach_arena).
+ tls.my_arena_slot->release();
+ tls.my_arena_slot = nullptr;
+ tls.my_inbox.detach();
+ __TBB_ASSERT(tls.my_inbox.is_idle_state(true), nullptr);
+ __TBB_ASSERT(is_alive(my_guard), nullptr);
+
+ // In contrast to earlier versions of TBB (before 3.0 U5), it is now possible
+ // that the arena may be temporarily left unpopulated by threads. See comments in
+ // arena::on_thread_leaving() for more details.
+ on_thread_leaving<ref_worker>();
+ __TBB_ASSERT(tls.my_arena == this, "my_arena is used as a hint when searching the arena to join");
+}
+
+arena::arena ( market& m, unsigned num_slots, unsigned num_reserved_slots, unsigned priority_level )
+{
+ __TBB_ASSERT( !my_guard, "improperly allocated arena?" );
+ __TBB_ASSERT( sizeof(my_slots[0]) % cache_line_size()==0, "arena::slot size not multiple of cache line size" );
+ __TBB_ASSERT( is_aligned(this, cache_line_size()), "arena misaligned" );
+ my_market = &m;
+ my_limit = 1;
+ // Two slots are mandatory: for the external thread, and for 1 worker (required to support starvation resistant tasks).
+ my_num_slots = num_arena_slots(num_slots);
+ my_num_reserved_slots = num_reserved_slots;
+ my_max_num_workers = num_slots-num_reserved_slots;
+ my_priority_level = priority_level;
+ my_references = ref_external; // accounts for the external thread
+ my_aba_epoch = m.my_arenas_aba_epoch.load(std::memory_order_relaxed);
+ my_observers.my_arena = this;
+ my_co_cache.init(4 * num_slots);
+ __TBB_ASSERT ( my_max_num_workers <= my_num_slots, NULL );
+ // Initialize the default context. It should be allocated before task_dispatch construction.
+ my_default_ctx = new (cache_aligned_allocate(sizeof(d1::task_group_context)))
+ d1::task_group_context{ d1::task_group_context::isolated, d1::task_group_context::fp_settings };
+ // Construct slots. Mark internal synchronization elements for the tools.
+ task_dispatcher* base_td_pointer = reinterpret_cast<task_dispatcher*>(my_slots + my_num_slots);
+ for( unsigned i = 0; i < my_num_slots; ++i ) {
+ // __TBB_ASSERT( !my_slots[i].my_scheduler && !my_slots[i].task_pool, NULL );
+ __TBB_ASSERT( !my_slots[i].task_pool_ptr, NULL );
+ __TBB_ASSERT( !my_slots[i].my_task_pool_size, NULL );
+ mailbox(i).construct();
+ my_slots[i].init_task_streams(i);
+ my_slots[i].my_default_task_dispatcher = new(base_td_pointer + i) task_dispatcher(this);
+ my_slots[i].my_is_occupied.store(false, std::memory_order_relaxed);
+ }
+ my_fifo_task_stream.initialize(my_num_slots);
+ my_resume_task_stream.initialize(my_num_slots);
+#if __TBB_PREVIEW_CRITICAL_TASKS
+ my_critical_task_stream.initialize(my_num_slots);
+#endif
+#if __TBB_ENQUEUE_ENFORCED_CONCURRENCY
+ my_local_concurrency_requests = 0;
+ my_local_concurrency_flag.clear();
+ my_global_concurrency_mode.store(false, std::memory_order_relaxed);
+#endif
+}
+
+arena& arena::allocate_arena( market& m, unsigned num_slots, unsigned num_reserved_slots,
+ unsigned priority_level )
+{
+ __TBB_ASSERT( sizeof(base_type) + sizeof(arena_slot) == sizeof(arena), "All arena data fields must go to arena_base" );
+ __TBB_ASSERT( sizeof(base_type) % cache_line_size() == 0, "arena slots area misaligned: wrong padding" );
+ __TBB_ASSERT( sizeof(mail_outbox) == max_nfs_size, "Mailbox padding is wrong" );
+ std::size_t n = allocation_size(num_arena_slots(num_slots));
+ unsigned char* storage = (unsigned char*)cache_aligned_allocate(n);
+ // Zero all slots to indicate that they are empty
+ std::memset( storage, 0, n );
+ return *new( storage + num_arena_slots(num_slots) * sizeof(mail_outbox) )
+ arena(m, num_slots, num_reserved_slots, priority_level);
+}
+
+void arena::free_arena () {
+ __TBB_ASSERT( is_alive(my_guard), NULL );
+ __TBB_ASSERT( !my_references.load(std::memory_order_relaxed), "There are threads in the dying arena" );
+ __TBB_ASSERT( !my_num_workers_requested && !my_num_workers_allotted, "Dying arena requests workers" );
+ __TBB_ASSERT( my_pool_state.load(std::memory_order_relaxed) == SNAPSHOT_EMPTY || !my_max_num_workers,
+ "Inconsistent state of a dying arena" );
+#if __TBB_ENQUEUE_ENFORCED_CONCURRENCY
+ __TBB_ASSERT( !my_global_concurrency_mode, NULL );
+#endif
+ poison_value( my_guard );
+ std::intptr_t drained = 0;
+ for ( unsigned i = 0; i < my_num_slots; ++i ) {
+ // __TBB_ASSERT( !my_slots[i].my_scheduler, "arena slot is not empty" );
+ // TODO: understand the assertion and modify
+ // __TBB_ASSERT( my_slots[i].task_pool == EmptyTaskPool, NULL );
+ __TBB_ASSERT( my_slots[i].head == my_slots[i].tail, NULL ); // TODO: replace by is_quiescent_local_task_pool_empty
+ my_slots[i].free_task_pool();
+ drained += mailbox(i).drain();
+ my_slots[i].my_default_task_dispatcher->~task_dispatcher();
+ }
+ __TBB_ASSERT(my_fifo_task_stream.empty(), "Not all enqueued tasks were executed");
+ __TBB_ASSERT(my_resume_task_stream.empty(), "Not all enqueued tasks were executed");
+ // Cleanup coroutines/schedulers cache
+ my_co_cache.cleanup();
+ my_default_ctx->~task_group_context();
+ cache_aligned_deallocate(my_default_ctx);
+#if __TBB_PREVIEW_CRITICAL_TASKS
+ __TBB_ASSERT( my_critical_task_stream.empty(), "Not all critical tasks were executed");
+#endif
+ // remove an internal reference
+ my_market->release( /*is_public=*/false, /*blocking_terminate=*/false );
+ if ( !my_observers.empty() ) {
+ my_observers.clear();
+ }
+ void* storage = &mailbox(my_num_slots-1);
+ __TBB_ASSERT( my_references.load(std::memory_order_relaxed) == 0, NULL );
+ __TBB_ASSERT( my_pool_state.load(std::memory_order_relaxed) == SNAPSHOT_EMPTY || !my_max_num_workers, NULL );
+ this->~arena();
+#if TBB_USE_ASSERT > 1
+ std::memset( storage, 0, allocation_size(my_num_slots) );
+#endif /* TBB_USE_ASSERT */
+ cache_aligned_deallocate( storage );
+}
+
+bool arena::has_enqueued_tasks() {
+ return !my_fifo_task_stream.empty();
+}
+
+bool arena::is_out_of_work() {
+#if __TBB_ENQUEUE_ENFORCED_CONCURRENCY
+ if (my_local_concurrency_flag.try_clear_if([this] {
+ return !has_enqueued_tasks();
+ })) {
+ my_market->adjust_demand(*this, /* delta = */ -1, /* mandatory = */ true);
+ }
+#endif
+
+ // TODO: rework it to return at least a hint about where a task was found; better if the task itself.
+ switch (my_pool_state.load(std::memory_order_acquire)) {
+ case SNAPSHOT_EMPTY:
+ return true;
+ case SNAPSHOT_FULL: {
+ // Use unique id for "busy" in order to avoid ABA problems.
+ const pool_state_t busy = pool_state_t(&busy);
+ // Helper for CAS execution
+ pool_state_t expected_state;
+
+ // Request permission to take snapshot
+ expected_state = SNAPSHOT_FULL;
+ if (my_pool_state.compare_exchange_strong(expected_state, busy)) {
+ // Got permission. Take the snapshot.
+ // NOTE: This is not a lock, as the state can be set to FULL at
+ // any moment by a thread that spawns/enqueues new task.
+ std::size_t n = my_limit.load(std::memory_order_acquire);
+ // Make local copies of volatile parameters. Their change during
+ // snapshot taking procedure invalidates the attempt, and returns
+ // this thread into the dispatch loop.
+ std::size_t k;
+ for (k = 0; k < n; ++k) {
+ if (my_slots[k].task_pool.load(std::memory_order_relaxed) != EmptyTaskPool &&
+ my_slots[k].head.load(std::memory_order_relaxed) < my_slots[k].tail.load(std::memory_order_relaxed))
+ {
+ // k-th primary task pool is nonempty and does contain tasks.
+ break;
+ }
+ if (my_pool_state.load(std::memory_order_acquire) != busy)
+ return false; // the work was published
+ }
+ bool work_absent = k == n;
+ // Test and test-and-set.
+ if (my_pool_state.load(std::memory_order_acquire) == busy) {
+ bool no_stream_tasks = !has_enqueued_tasks() && my_resume_task_stream.empty();
+#if __TBB_PREVIEW_CRITICAL_TASKS
+ no_stream_tasks = no_stream_tasks && my_critical_task_stream.empty();
+#endif
+ work_absent = work_absent && no_stream_tasks;
+ if (work_absent) {
+ // save current demand value before setting SNAPSHOT_EMPTY,
+ // to avoid race with advertise_new_work.
+ int current_demand = (int)my_max_num_workers;
+ expected_state = busy;
+ if (my_pool_state.compare_exchange_strong(expected_state, SNAPSHOT_EMPTY)) {
+ // This thread transitioned pool to empty state, and thus is
+ // responsible for telling the market that there is no work to do.
+ my_market->adjust_demand(*this, -current_demand, /* mandatory = */ false);
+ return true;
+ }
+ return false;
+ }
+ // Undo previous transition SNAPSHOT_FULL-->busy, unless another thread undid it.
+ expected_state = busy;
+ my_pool_state.compare_exchange_strong(expected_state, SNAPSHOT_FULL);
+ }
+ }
+ return false;
+ }
+ default:
+ // Another thread is taking a snapshot.
+ return false;
+ }
+}
+
+void arena::enqueue_task(d1::task& t, d1::task_group_context& ctx, thread_data& td) {
+ task_group_context_impl::bind_to(ctx, &td);
+ task_accessor::context(t) = &ctx;
+ task_accessor::isolation(t) = no_isolation;
+ my_fifo_task_stream.push( &t, random_lane_selector(td.my_random) );
+ advertise_new_work<work_enqueued>();
+}
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
+
+// Enable task_arena.h
+#include "oneapi/tbb/task_arena.h" // task_arena_base
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+#if TBB_USE_ASSERT
+void assert_arena_priority_valid( tbb::task_arena::priority a_priority ) {
+ bool is_arena_priority_correct =
+ a_priority == tbb::task_arena::priority::high ||
+ a_priority == tbb::task_arena::priority::normal ||
+ a_priority == tbb::task_arena::priority::low;
+ __TBB_ASSERT( is_arena_priority_correct,
+ "Task arena priority should be equal to one of the predefined values." );
+}
+#else
+void assert_arena_priority_valid( tbb::task_arena::priority ) {}
+#endif
+
+unsigned arena_priority_level( tbb::task_arena::priority a_priority ) {
+ assert_arena_priority_valid( a_priority );
+ return market::num_priority_levels - unsigned(int(a_priority) / d1::priority_stride);
+}
+
+tbb::task_arena::priority arena_priority( unsigned priority_level ) {
+ auto priority = tbb::task_arena::priority(
+ (market::num_priority_levels - priority_level) * d1::priority_stride
+ );
+ assert_arena_priority_valid( priority );
+ return priority;
+}
+
+struct task_arena_impl {
+ static void initialize(d1::task_arena_base&);
+ static void terminate(d1::task_arena_base&);
+ static bool attach(d1::task_arena_base&);
+ static void execute(d1::task_arena_base&, d1::delegate_base&);
+ static void wait(d1::task_arena_base&);
+ static int max_concurrency(const d1::task_arena_base*);
+ static void enqueue(d1::task&, d1::task_arena_base*);
+};
+
+void __TBB_EXPORTED_FUNC initialize(d1::task_arena_base& ta) {
+ task_arena_impl::initialize(ta);
+}
+void __TBB_EXPORTED_FUNC terminate(d1::task_arena_base& ta) {
+ task_arena_impl::terminate(ta);
+}
+bool __TBB_EXPORTED_FUNC attach(d1::task_arena_base& ta) {
+ return task_arena_impl::attach(ta);
+}
+void __TBB_EXPORTED_FUNC execute(d1::task_arena_base& ta, d1::delegate_base& d) {
+ task_arena_impl::execute(ta, d);
+}
+void __TBB_EXPORTED_FUNC wait(d1::task_arena_base& ta) {
+ task_arena_impl::wait(ta);
+}
+
+int __TBB_EXPORTED_FUNC max_concurrency(const d1::task_arena_base* ta) {
+ return task_arena_impl::max_concurrency(ta);
+}
+
+void __TBB_EXPORTED_FUNC enqueue(d1::task& t, d1::task_arena_base* ta) {
+ task_arena_impl::enqueue(t, ta);
+}
+
+void task_arena_impl::initialize(d1::task_arena_base& ta) {
+ governor::one_time_init();
+ if (ta.my_max_concurrency < 1) {
+#if __TBB_ARENA_BINDING
+
+#if __TBB_PREVIEW_TASK_ARENA_CONSTRAINTS_EXTENSION_PRESENT
+ d1::constraints arena_constraints = d1::constraints{}
+ .set_core_type(ta.core_type())
+ .set_max_threads_per_core(ta.max_threads_per_core())
+ .set_numa_id(ta.my_numa_id);
+ ta.my_max_concurrency = (int)default_concurrency(arena_constraints);
+#else /*!__TBB_PREVIEW_TASK_ARENA_CONSTRAINTS_EXTENSION_PRESENT*/
+ ta.my_max_concurrency = (int)default_concurrency(ta.my_numa_id);
+#endif /*!__TBB_PREVIEW_TASK_ARENA_CONSTRAINTS_EXTENSION_PRESENT*/
+
+#else /*!__TBB_ARENA_BINDING*/
+ ta.my_max_concurrency = (int)governor::default_num_threads();
+#endif /*!__TBB_ARENA_BINDING*/
+ }
+
+ __TBB_ASSERT(ta.my_arena.load(std::memory_order_relaxed) == nullptr, "Arena already initialized");
+ unsigned priority_level = arena_priority_level(ta.my_priority);
+ arena* a = market::create_arena(ta.my_max_concurrency, ta.my_num_reserved_slots, priority_level, /* stack_size = */ 0);
+ ta.my_arena.store(a, std::memory_order_release);
+ // add an internal market reference; a public reference was added in create_arena
+ market::global_market( /*is_public=*/false);
+#if __TBB_ARENA_BINDING
+ a->my_numa_binding_observer = construct_binding_observer(
+ static_cast<d1::task_arena*>(&ta), a->my_num_slots, ta.my_numa_id, ta.core_type(), ta.max_threads_per_core());
+#endif /*__TBB_ARENA_BINDING*/
+}
+
+void task_arena_impl::terminate(d1::task_arena_base& ta) {
+ arena* a = ta.my_arena.load(std::memory_order_relaxed);
+ assert_pointer_valid(a);
+#if __TBB_ARENA_BINDING
+ if(a->my_numa_binding_observer != nullptr ) {
+ destroy_binding_observer(a->my_numa_binding_observer);
+ a->my_numa_binding_observer = nullptr;
+ }
+#endif /*__TBB_ARENA_BINDING*/
+ a->my_market->release( /*is_public=*/true, /*blocking_terminate=*/false );
+ a->on_thread_leaving<arena::ref_external>();
+ ta.my_arena.store(nullptr, std::memory_order_relaxed);
+}
+
+bool task_arena_impl::attach(d1::task_arena_base& ta) {
+ __TBB_ASSERT(!ta.my_arena.load(std::memory_order_relaxed), nullptr);
+ thread_data* td = governor::get_thread_data_if_initialized();
+ if( td && td->my_arena ) {
+ arena* a = td->my_arena;
+ // There is an active arena to attach to.
+ // It is still referenced by this thread, so it won't be destroyed right away.
+ __TBB_ASSERT(a->my_references > 0, NULL );
+ a->my_references += arena::ref_external;
+ ta.my_num_reserved_slots = a->my_num_reserved_slots;
+ ta.my_priority = arena_priority(a->my_priority_level);
+ ta.my_max_concurrency = ta.my_num_reserved_slots + a->my_max_num_workers;
+ __TBB_ASSERT(arena::num_arena_slots(ta.my_max_concurrency) == a->my_num_slots, NULL);
+ ta.my_arena.store(a, std::memory_order_release);
+ // increases market's ref count for task_arena
+ market::global_market( /*is_public=*/true );
+ return true;
+ }
+ return false;
+}
+
+void task_arena_impl::enqueue(d1::task& t, d1::task_arena_base* ta) {
+ thread_data* td = governor::get_thread_data(); // thread data is only needed for FastRandom instance
+ arena* a = ta->my_arena.load(std::memory_order_relaxed);
+ assert_pointers_valid(ta, a, a->my_default_ctx, td);
+ // Is there a better place for checking the state of my_default_ctx?
+ __TBB_ASSERT(!a->my_default_ctx->is_group_execution_cancelled(),
+ "The task will not be executed because default task_group_context of task_arena is cancelled. Has previously enqueued task thrown an exception?");
+ a->enqueue_task(t, *a->my_default_ctx, *td);
+}
+
+class nested_arena_context : no_copy {
+public:
+ nested_arena_context(thread_data& td, arena& nested_arena, std::size_t slot_index)
+ : m_orig_execute_data_ext(td.my_task_dispatcher->m_execute_data_ext)
+ {
+ if (td.my_arena != &nested_arena) {
+ m_orig_arena = td.my_arena;
+ m_orig_slot_index = td.my_arena_index;
+ m_orig_last_observer = td.my_last_observer;
+
+ td.detach_task_dispatcher();
+ td.attach_arena(nested_arena, slot_index);
+ task_dispatcher& task_disp = td.my_arena_slot->default_task_dispatcher();
+ task_disp.set_stealing_threshold(m_orig_execute_data_ext.task_disp->m_stealing_threshold);
+ td.attach_task_dispatcher(task_disp);
+
+ // If the calling thread occupies a slot outside the external thread reserve, we need to notify the
+ // market that this arena requires one worker less.
+ if (td.my_arena_index >= td.my_arena->my_num_reserved_slots) {
+ td.my_arena->my_market->adjust_demand(*td.my_arena, /* delta = */ -1, /* mandatory = */ false);
+ }
+
+ td.my_last_observer = nullptr;
+ // The task_arena::execute method considers each calling thread as an external thread.
+ td.my_arena->my_observers.notify_entry_observers(td.my_last_observer, /* worker*/false);
+ }
+
+ m_task_dispatcher = td.my_task_dispatcher;
+ m_orig_fifo_tasks_allowed = m_task_dispatcher->allow_fifo_task(true);
+ m_orig_critical_task_allowed = m_task_dispatcher->m_properties.critical_task_allowed;
+ m_task_dispatcher->m_properties.critical_task_allowed = true;
+
+ execution_data_ext& ed_ext = td.my_task_dispatcher->m_execute_data_ext;
+ ed_ext.context = td.my_arena->my_default_ctx;
+ ed_ext.original_slot = td.my_arena_index;
+ ed_ext.affinity_slot = d1::no_slot;
+ ed_ext.task_disp = td.my_task_dispatcher;
+ ed_ext.isolation = no_isolation;
+
+ __TBB_ASSERT(td.my_arena_slot, nullptr);
+ __TBB_ASSERT(td.my_arena_slot->is_occupied(), nullptr);
+ __TBB_ASSERT(td.my_task_dispatcher, nullptr);
+ }
+ ~nested_arena_context() {
+ thread_data& td = *m_task_dispatcher->m_thread_data;
+ __TBB_ASSERT(governor::is_thread_data_set(&td), nullptr);
+ m_task_dispatcher->allow_fifo_task(m_orig_fifo_tasks_allowed);
+ m_task_dispatcher->m_properties.critical_task_allowed = m_orig_critical_task_allowed;
+ if (m_orig_arena) {
+ td.my_arena->my_observers.notify_exit_observers(td.my_last_observer, /*worker*/ false);
+ td.my_last_observer = m_orig_last_observer;
+
+ // Notify the market that this thread is releasing one slot
+ // that can be used by a worker thread.
+ if (td.my_arena_index >= td.my_arena->my_num_reserved_slots) {
+ td.my_arena->my_market->adjust_demand(*td.my_arena, /* delta = */ 1, /* mandatory = */ false);
+ }
+
+ td.my_task_dispatcher->set_stealing_threshold(0);
+ td.detach_task_dispatcher();
+ td.my_arena_slot->release();
+ td.my_arena->my_exit_monitors.notify_one(); // do not relax!
+
+ td.attach_arena(*m_orig_arena, m_orig_slot_index);
+ td.attach_task_dispatcher(*m_orig_execute_data_ext.task_disp);
+ }
+ td.my_task_dispatcher->m_execute_data_ext = m_orig_execute_data_ext;
+ }
+
+private:
+ execution_data_ext m_orig_execute_data_ext{};
+ arena* m_orig_arena{ nullptr };
+ observer_proxy* m_orig_last_observer{ nullptr };
+ task_dispatcher* m_task_dispatcher{ nullptr };
+ unsigned m_orig_slot_index{};
+ bool m_orig_fifo_tasks_allowed{};
+ bool m_orig_critical_task_allowed{};
+};
+
+class delegated_task : public d1::task {
+ d1::delegate_base& m_delegate;
+ concurrent_monitor& m_monitor;
+ d1::wait_context& m_wait_ctx;
+ std::atomic<bool> m_completed;
+ d1::task* execute(d1::execution_data& ed) override {
+ const execution_data_ext& ed_ext = static_cast<const execution_data_ext&>(ed);
+ execution_data_ext orig_execute_data_ext = ed_ext.task_disp->m_execute_data_ext;
+ __TBB_ASSERT(&ed_ext.task_disp->m_execute_data_ext == &ed,
+ "The execute data shall point to the current task dispatcher execute data");
+ __TBB_ASSERT(ed_ext.task_disp->m_execute_data_ext.isolation == no_isolation, nullptr);
+
+ ed_ext.task_disp->m_execute_data_ext.context = ed_ext.task_disp->get_thread_data().my_arena->my_default_ctx;
+ bool fifo_task_allowed = ed_ext.task_disp->allow_fifo_task(true);
+ try_call([&] {
+ m_delegate();
+ }).on_completion([&] {
+ ed_ext.task_disp->m_execute_data_ext = orig_execute_data_ext;
+ ed_ext.task_disp->allow_fifo_task(fifo_task_allowed);
+ });
+
+ finalize();
+ return nullptr;
+ }
+ d1::task* cancel(d1::execution_data&) override {
+ finalize();
+ return nullptr;
+ }
+ void finalize() {
+ m_wait_ctx.release(); // must precede the wakeup
+ m_monitor.notify([this](std::uintptr_t ctx) {
+ return ctx == std::uintptr_t(&m_delegate);
+ }); // do not relax, it needs a fence!
+ m_completed.store(true, std::memory_order_release);
+ }
+public:
+ delegated_task(d1::delegate_base& d, concurrent_monitor& s, d1::wait_context& wo)
+ : m_delegate(d), m_monitor(s), m_wait_ctx(wo), m_completed{ false }{}
+ ~delegated_task() {
+ // The destructor can be called before m_monitor is notified
+ // because the waiting thread can be released after m_wait_ctx.release_wait.
+ // To close that race we wait for the m_completed signal.
+ spin_wait_until_eq(m_completed, true);
+ }
+};
+
+void task_arena_impl::execute(d1::task_arena_base& ta, d1::delegate_base& d) {
+ arena* a = ta.my_arena.load(std::memory_order_relaxed);
+ __TBB_ASSERT(a != nullptr, nullptr);
+ thread_data* td = governor::get_thread_data();
+
+ bool same_arena = td->my_arena == a;
+ std::size_t index1 = td->my_arena_index;
+ if (!same_arena) {
+ index1 = a->occupy_free_slot</*as_worker */false>(*td);
+ if (index1 == arena::out_of_arena) {
+ concurrent_monitor::thread_context waiter((std::uintptr_t)&d);
+ d1::wait_context wo(1);
+ d1::task_group_context exec_context(d1::task_group_context::isolated);
+ task_group_context_impl::copy_fp_settings(exec_context, *a->my_default_ctx);
+
+ delegated_task dt(d, a->my_exit_monitors, wo);
+ a->enqueue_task( dt, exec_context, *td);
+ size_t index2 = arena::out_of_arena;
+ do {
+ a->my_exit_monitors.prepare_wait(waiter);
+ if (!wo.continue_execution()) {
+ a->my_exit_monitors.cancel_wait(waiter);
+ break;
+ }
+ index2 = a->occupy_free_slot</*as_worker*/false>(*td);
+ if (index2 != arena::out_of_arena) {
+ a->my_exit_monitors.cancel_wait(waiter);
+ nested_arena_context scope(*td, *a, index2 );
+ r1::wait(wo, exec_context);
+ __TBB_ASSERT(!exec_context.my_exception, NULL); // exception can be thrown above, not deferred
+ break;
+ }
+ a->my_exit_monitors.commit_wait(waiter);
+ } while (wo.continue_execution());
+ if (index2 == arena::out_of_arena) {
+ // notify a waiting thread even if this thread did not enter arena,
+ // in case it was woken by a leaving thread but did not need to enter
+ a->my_exit_monitors.notify_one(); // do not relax!
+ }
+ // process possible exception
+ if (exec_context.my_exception) {
+ __TBB_ASSERT(exec_context.is_group_execution_cancelled(), "The task group context with an exception should be canceled.");
+ exec_context.my_exception->throw_self();
+ }
+ __TBB_ASSERT(governor::is_thread_data_set(td), nullptr);
+ return;
+ } // if (index1 == arena::out_of_arena)
+ } // if (!same_arena)
+
+ context_guard_helper</*report_tasks=*/false> context_guard;
+ context_guard.set_ctx(a->my_default_ctx);
+ nested_arena_context scope(*td, *a, index1);
+#if _WIN64
+ try {
+#endif
+ d();
+ __TBB_ASSERT(same_arena || governor::is_thread_data_set(td), nullptr);
+#if _WIN64
+ } catch (...) {
+ context_guard.restore_default();
+ throw;
+ }
+#endif
+}
+
+void task_arena_impl::wait(d1::task_arena_base& ta) {
+ arena* a = ta.my_arena.load(std::memory_order_relaxed);
+ __TBB_ASSERT(a != nullptr, nullptr);
+ thread_data* td = governor::get_thread_data();
+ __TBB_ASSERT_EX(td, "Scheduler is not initialized");
+ __TBB_ASSERT(td->my_arena != a || td->my_arena_index == 0, "internal_wait is not supported within a worker context" );
+ if (a->my_max_num_workers != 0) {
+ while (a->num_workers_active() || a->my_pool_state.load(std::memory_order_acquire) != arena::SNAPSHOT_EMPTY) {
+ yield();
+ }
+ }
+}
+
+int task_arena_impl::max_concurrency(const d1::task_arena_base *ta) {
+ arena* a = nullptr;
+ if( ta ) // for special cases of ta->max_concurrency()
+ a = ta->my_arena.load(std::memory_order_relaxed);
+ else if( thread_data* td = governor::get_thread_data_if_initialized() )
+ a = td->my_arena; // the current arena if any
+
+ if( a ) { // Get parameters from the arena
+ __TBB_ASSERT( !ta || ta->my_max_concurrency==1, NULL );
+ return a->my_num_reserved_slots + a->my_max_num_workers
+#if __TBB_ENQUEUE_ENFORCED_CONCURRENCY
+ + (a->my_local_concurrency_flag.test() ? 1 : 0)
+#endif
+ ;
+ }
+
+ if (ta && ta->my_max_concurrency == 1) {
+ return 1;
+ }
+
+#if __TBB_ARENA_BINDING
+ if (ta) {
+#if __TBB_PREVIEW_TASK_ARENA_CONSTRAINTS_EXTENSION_PRESENT
+ d1::constraints arena_constraints = d1::constraints{}
+ .set_numa_id(ta->my_numa_id)
+ .set_core_type(ta->core_type())
+ .set_max_threads_per_core(ta->max_threads_per_core());
+ return (int)default_concurrency(arena_constraints);
+#else /*!__TBB_PREVIEW_TASK_ARENA_CONSTRAINTS_EXTENSION_PRESENT*/
+ return (int)default_concurrency(ta->my_numa_id);
+#endif /*!__TBB_PREVIEW_TASK_ARENA_CONSTRAINTS_EXTENSION_PRESENT*/
+ }
+#endif /*!__TBB_ARENA_BINDING*/
+
+ __TBB_ASSERT(!ta || ta->my_max_concurrency==d1::task_arena_base::automatic, NULL );
+ return int(governor::default_num_threads());
+}
+
+void isolate_within_arena(d1::delegate_base& d, std::intptr_t isolation) {
+ // TODO: Decide what to do if the scheduler is not initialized. Is there a use case for it?
+ thread_data* tls = governor::get_thread_data();
+ assert_pointers_valid(tls, tls->my_task_dispatcher);
+ task_dispatcher* dispatcher = tls->my_task_dispatcher;
+ isolation_type previous_isolation = dispatcher->m_execute_data_ext.isolation;
+ try_call([&] {
+ // We temporarily change the isolation tag of the currently running task. It will be restored in the destructor of the guard.
+ isolation_type current_isolation = isolation ? isolation : reinterpret_cast<isolation_type>(&d);
+ // Save the current isolation value and set new one
+ previous_isolation = dispatcher->set_isolation(current_isolation);
+ // Isolation within this callable
+ d();
+ }).on_completion([&] {
+ __TBB_ASSERT(governor::get_thread_data()->my_task_dispatcher == dispatcher, NULL);
+ dispatcher->set_isolation(previous_isolation);
+ });
+}
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
+
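arena::occupy_free_slot_in_range above probes the slot array starting at the slot this thread occupied last time (or a random slot inside the range) and wraps around once, so threads joining concurrently tend to spread across the slots instead of all contending for the lowest index. The following self-contained sketch reproduces just that probing strategy; the names are hypothetical and a plain atomic flag stands in for arena_slot::try_occupy().

#include <atomic>
#include <cstddef>
#include <random>
#include <vector>

constexpr std::size_t out_of_range = ~std::size_t(0);

struct demo_slot {
    std::atomic<bool> occupied{false};
    bool try_occupy() {
        bool expected = false;                       // claim the slot atomically
        return occupied.compare_exchange_strong(expected, true);
    }
};

std::size_t occupy_in_range(std::vector<demo_slot>& slots, std::size_t preferred,
                            std::size_t lower, std::size_t upper) {
    if (lower >= upper) return out_of_range;
    std::size_t index = preferred;
    if (index < lower || index >= upper) {           // no usable hint: pick a random start
        static thread_local std::mt19937 rng{std::random_device{}()};
        index = lower + rng() % (upper - lower);
    }
    // Probe [index, upper), then wrap around to [lower, index).
    for (std::size_t i = index; i < upper; ++i)
        if (slots[i].try_occupy()) return i;
    for (std::size_t i = lower; i < index; ++i)
        if (slots[i].try_occupy()) return i;
    return out_of_range;
}

int main() {
    std::vector<demo_slot> slots(8);
    return occupy_in_range(slots, /*preferred=*/5, /*lower=*/2, /*upper=*/8) == out_of_range ? 1 : 0;
}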
diff --git a/contrib/libs/tbb/src/tbb/arena.h b/contrib/libs/tbb/src/tbb/arena.h
new file mode 100644
index 0000000000..b1b9c3dc93
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/arena.h
@@ -0,0 +1,616 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef _TBB_arena_H
+#define _TBB_arena_H
+
+#include <atomic>
+#include <cstring>
+
+#include "oneapi/tbb/detail/_task.h"
+
+#include "scheduler_common.h"
+#include "intrusive_list.h"
+#include "task_stream.h"
+#include "arena_slot.h"
+#include "rml_tbb.h"
+#include "mailbox.h"
+#include "market.h"
+#include "governor.h"
+#include "concurrent_monitor.h"
+#include "observer_proxy.h"
+#include "oneapi/tbb/spin_mutex.h"
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+class task_dispatcher;
+class task_group_context;
+class allocate_root_with_context_proxy;
+
+#if __TBB_ARENA_BINDING
+class numa_binding_observer;
+#endif /*__TBB_ARENA_BINDING*/
+
+//! Bounded LIFO ring-buffer cache of coroutine task dispatchers
+class arena_co_cache {
+ //! Ring buffer storage
+ task_dispatcher** my_co_scheduler_cache;
+ //! Current cache index
+ unsigned my_head;
+ //! Cache capacity for arena
+ unsigned my_max_index;
+ //! Accessor lock for modification operations
+ tbb::spin_mutex my_co_cache_mutex;
+
+ unsigned next_index() {
+ return ( my_head == my_max_index ) ? 0 : my_head + 1;
+ }
+
+ unsigned prev_index() {
+ return ( my_head == 0 ) ? my_max_index : my_head - 1;
+ }
+
+ bool internal_empty() {
+ return my_co_scheduler_cache[prev_index()] == nullptr;
+ }
+
+ void internal_task_dispatcher_cleanup(task_dispatcher* to_cleanup) {
+ to_cleanup->~task_dispatcher();
+ cache_aligned_deallocate(to_cleanup);
+ }
+
+public:
+ void init(unsigned cache_capacity) {
+ std::size_t alloc_size = cache_capacity * sizeof(task_dispatcher*);
+ my_co_scheduler_cache = (task_dispatcher**)cache_aligned_allocate(alloc_size);
+ std::memset( my_co_scheduler_cache, 0, alloc_size );
+ my_head = 0;
+ my_max_index = cache_capacity - 1;
+ }
+
+ void cleanup() {
+ while (task_dispatcher* to_cleanup = pop()) {
+ internal_task_dispatcher_cleanup(to_cleanup);
+ }
+ cache_aligned_deallocate(my_co_scheduler_cache);
+ }
+
+ //! Insert a task dispatcher into the currently available slot,
+ //! replacing an old value if necessary.
+ void push(task_dispatcher* s) {
+ task_dispatcher* to_cleanup = nullptr;
+ {
+ tbb::spin_mutex::scoped_lock lock(my_co_cache_mutex);
+ // Check if we are replacing an existing buffer entry
+ if (my_co_scheduler_cache[my_head] != nullptr) {
+ to_cleanup = my_co_scheduler_cache[my_head];
+ }
+ // Store the cached value
+ my_co_scheduler_cache[my_head] = s;
+ // Move head index to the next slot
+ my_head = next_index();
+ }
+ // Cleanup replaced buffer if any
+ if (to_cleanup) {
+ internal_task_dispatcher_cleanup(to_cleanup);
+ }
+ }
+
+ //! Get a cached scheduler if any
+ task_dispatcher* pop() {
+ tbb::spin_mutex::scoped_lock lock(my_co_cache_mutex);
+ // No cached coroutine
+ if (internal_empty()) {
+ return nullptr;
+ }
+ // Move head index to the currently available value
+ my_head = prev_index();
+ // Retrieve the value from the buffer
+ task_dispatcher* to_return = my_co_scheduler_cache[my_head];
+ // Clear the previous entry
+ my_co_scheduler_cache[my_head] = nullptr;
+ return to_return;
+ }
+};
+
+struct stack_anchor_type {
+ stack_anchor_type() = default;
+ stack_anchor_type(const stack_anchor_type&) = delete;
+};
+
+#if __TBB_ENQUEUE_ENFORCED_CONCURRENCY
+class atomic_flag {
+ static const std::uintptr_t SET = 1;
+ static const std::uintptr_t EMPTY = 0;
+ std::atomic<std::uintptr_t> my_state;
+public:
+ bool test_and_set() {
+ std::uintptr_t state = my_state.load(std::memory_order_acquire);
+ switch (state) {
+ case SET:
+ return false;
+ default: /* busy */
+ if (my_state.compare_exchange_strong(state, SET)) {
+ // We interrupted clear transaction
+ return false;
+ }
+ if (state != EMPTY) {
+ // We lost our epoch
+ return false;
+ }
+ // We are too late but still in the same epoch
+ __TBB_fallthrough;
+ case EMPTY:
+ return my_state.compare_exchange_strong(state, SET);
+ }
+ }
+ template <typename Pred>
+ bool try_clear_if(Pred&& pred) {
+ std::uintptr_t busy = std::uintptr_t(&busy);
+ std::uintptr_t state = my_state.load(std::memory_order_acquire);
+ if (state == SET && my_state.compare_exchange_strong(state, busy)) {
+ if (pred()) {
+ return my_state.compare_exchange_strong(busy, EMPTY);
+ }
+ // The result of the next operation is discarded, always false should be returned.
+ my_state.compare_exchange_strong(busy, SET);
+ }
+ return false;
+ }
+ void clear() {
+ my_state.store(EMPTY, std::memory_order_release);
+ }
+ bool test() {
+ return my_state.load(std::memory_order_acquire) != EMPTY;
+ }
+};
+#endif
+
+//! The structure of an arena, except the array of slots.
+/** Separated in order to simplify padding.
+ Intrusive list node base class is used by market to form a list of arenas. **/
+struct arena_base : padded<intrusive_list_node> {
+ //! The number of workers that have been marked out by the resource manager to service the arena.
+ std::atomic<unsigned> my_num_workers_allotted; // heavy use in stealing loop
+
+ //! Reference counter for the arena.
+ /** Worker and external thread references are counted separately: first several bits are for references
+ from external threads or explicit task_arenas (see arena::ref_external_bits below);
+ the rest counts the number of workers servicing the arena. */
+ std::atomic<unsigned> my_references; // heavy use in stealing loop
+
+ //! The maximal number of currently busy slots.
+ std::atomic<unsigned> my_limit; // heavy use in stealing loop
+
+ //! Task pool for the tasks scheduled via task::enqueue() method.
+ /** Such scheduling guarantees eventual execution even if
+ - new tasks are constantly coming (by extracting scheduled tasks in
+ relaxed FIFO order);
+ - the enqueuing thread does not call any of wait_for_all methods. **/
+ task_stream<front_accessor> my_fifo_task_stream; // heavy use in stealing loop
+
+ //! Task pool for the tasks scheduled via tbb::resume() function.
+ task_stream<front_accessor> my_resume_task_stream; // heavy use in stealing loop
+
+#if __TBB_PREVIEW_CRITICAL_TASKS
+ //! Task pool for the tasks with critical property set.
+ /** Critical tasks are scheduled for execution ahead of other sources (including local task pool
+ and even bypassed tasks) unless the thread already executes a critical task in an outer
+ dispatch loop **/
+ // used on the hot path of the task dispatch loop
+ task_stream<back_nonnull_accessor> my_critical_task_stream;
+#endif
+
+ //! The number of workers requested by the external thread owning the arena.
+ unsigned my_max_num_workers;
+
+ //! The total number of workers that are requested from the resource manager.
+ int my_total_num_workers_requested;
+
+ //! The number of workers that are really requested from the resource manager.
+ //! Possible values are in [0, my_max_num_workers]
+ int my_num_workers_requested;
+
+ //! The index in the array of per priority lists of arenas this object is in.
+ /*const*/ unsigned my_priority_level;
+
+ //! The max priority level of arena in market.
+ std::atomic<bool> my_is_top_priority{false};
+
+ //! Current task pool state and estimate of available tasks amount.
+ /** The estimate is either 0 (SNAPSHOT_EMPTY) or infinity (SNAPSHOT_FULL).
+ Special state is "busy" (any other unsigned value).
+ Note that the implementation of arena::is_busy_or_empty() requires
+ my_pool_state to be unsigned. */
+ using pool_state_t = std::uintptr_t ;
+ std::atomic<pool_state_t> my_pool_state;
+
+ //! The list of local observers attached to this arena.
+ observer_list my_observers;
+
+#if __TBB_ARENA_BINDING
+ //! Pointer to the internal observer that binds threads in the arena to a certain NUMA node.
+ numa_binding_observer* my_numa_binding_observer;
+#endif /*__TBB_ARENA_BINDING*/
+
+ // Below are rarely modified members
+
+ //! The market that owns this arena.
+ market* my_market;
+
+ //! ABA prevention marker.
+ std::uintptr_t my_aba_epoch;
+
+ //! Default task group context.
+ d1::task_group_context* my_default_ctx;
+
+ //! The number of slots in the arena.
+ unsigned my_num_slots;
+
+ //! The number of reserved slots (can be occupied only by external threads).
+ unsigned my_num_reserved_slots;
+
+#if __TBB_ENQUEUE_ENFORCED_CONCURRENCY
+ // arena needs an extra worker despite the arena limit
+ atomic_flag my_local_concurrency_flag;
+ // the number of local mandatory concurrency requests
+ int my_local_concurrency_requests;
+ // arena needs an extra worker despite a global limit
+ std::atomic<bool> my_global_concurrency_mode;
+#endif /* __TBB_ENQUEUE_ENFORCED_CONCURRENCY */
+
+ //! Waiting object for external threads that cannot join the arena.
+ concurrent_monitor my_exit_monitors;
+
+ //! Coroutine (task_dispatcher) cache buffer
+ arena_co_cache my_co_cache;
+
+#if TBB_USE_ASSERT
+ //! Used to trap accesses to the object after its destruction.
+ std::uintptr_t my_guard;
+#endif /* TBB_USE_ASSERT */
+}; // struct arena_base
+
+class arena: public padded<arena_base>
+{
+public:
+ using base_type = padded<arena_base>;
+
+ //! Types of work advertised by advertise_new_work()
+ enum new_work_type {
+ work_spawned,
+ wakeup,
+ work_enqueued
+ };
+
+ //! Constructor
+ arena ( market& m, unsigned max_num_workers, unsigned num_reserved_slots, unsigned priority_level);
+
+ //! Allocate an instance of arena.
+ static arena& allocate_arena( market& m, unsigned num_slots, unsigned num_reserved_slots,
+ unsigned priority_level );
+
+ static unsigned num_arena_slots ( unsigned num_slots ) {
+ return max(2u, num_slots);
+ }
+
+ static int allocation_size ( unsigned num_slots ) {
+ return sizeof(base_type) + num_slots * (sizeof(mail_outbox) + sizeof(arena_slot) + sizeof(task_dispatcher));
+ }
+
+ //! Get reference to mailbox corresponding to given slot_id
+ mail_outbox& mailbox( d1::slot_id slot ) {
+ __TBB_ASSERT( slot != d1::no_slot, "affinity should be specified" );
+
+ return reinterpret_cast<mail_outbox*>(this)[-(int)(slot+1)]; // cast to 'int' is redundant but left for readability
+ }
+
+ //! Completes arena shutdown, destructs and deallocates it.
+ void free_arena ();
+
+ //! No tasks to steal since last snapshot was taken
+ static const pool_state_t SNAPSHOT_EMPTY = 0;
+
+ //! At least one task has been offered for stealing since the last snapshot started
+ static const pool_state_t SNAPSHOT_FULL = pool_state_t(-1);
+
+ //! The number of least significant bits for external references
+ static const unsigned ref_external_bits = 12; // up to 4095 external and 1M workers
+
+ //! Reference increment values for externals and workers
+ static const unsigned ref_external = 1;
+ static const unsigned ref_worker = 1 << ref_external_bits;
+
+ //! No tasks to steal or snapshot is being taken.
+ static bool is_busy_or_empty( pool_state_t s ) { return s < SNAPSHOT_FULL; }
+
+ //! The number of workers active in the arena.
+ unsigned num_workers_active() const {
+ return my_references.load(std::memory_order_acquire) >> ref_external_bits;
+ }
+
+ //! Check if the recall is requested by the market.
+ bool is_recall_requested() const {
+ return num_workers_active() > my_num_workers_allotted.load(std::memory_order_relaxed);
+ }
+
+ //! If necessary, raise a flag that there is new job in arena.
+ template<arena::new_work_type work_type> void advertise_new_work();
+
+ //! Attempts to steal a task from a randomly chosen arena slot
+ d1::task* steal_task(unsigned arena_index, FastRandom& frnd, execution_data_ext& ed, isolation_type isolation);
+
+ //! Get a task from a global starvation resistant queue
+ template<task_stream_accessor_type accessor>
+ d1::task* get_stream_task(task_stream<accessor>& stream, unsigned& hint);
+
+#if __TBB_PREVIEW_CRITICAL_TASKS
+ //! Tries to find a critical task in global critical task stream
+ d1::task* get_critical_task(unsigned& hint, isolation_type isolation);
+#endif
+
+ //! Check if there is job anywhere in arena.
+ /** Return true if no job or if arena is being cleaned up. */
+ bool is_out_of_work();
+
+    //! enqueue a task into the starvation-resistant queue
+ void enqueue_task(d1::task&, d1::task_group_context&, thread_data&);
+
+ //! Registers the worker with the arena and enters TBB scheduler dispatch loop
+ void process(thread_data&);
+
+ //! Notification that the thread leaves its arena
+ template<unsigned ref_param>
+ inline void on_thread_leaving ( );
+
+ //! Check for the presence of enqueued tasks at all priority levels
+ bool has_enqueued_tasks();
+
+ static const std::size_t out_of_arena = ~size_t(0);
+ //! Tries to occupy a slot in the arena. On success, returns the slot index; if no slot is available, returns out_of_arena.
+ template <bool as_worker>
+ std::size_t occupy_free_slot(thread_data&);
+ //! Tries to occupy a slot in the specified range.
+ std::size_t occupy_free_slot_in_range(thread_data& tls, std::size_t lower, std::size_t upper);
+
+ std::uintptr_t calculate_stealing_threshold();
+
+ /** Must be the last data field */
+ arena_slot my_slots[1];
+}; // class arena
+
+template<unsigned ref_param>
+inline void arena::on_thread_leaving ( ) {
+ //
+ // Implementation of arena destruction synchronization logic contained various
+ // bugs/flaws at the different stages of its evolution, so below is a detailed
+ // description of the issues taken into consideration in the framework of the
+ // current design.
+ //
+ // In case of using fire-and-forget tasks (scheduled via task::enqueue())
+ // external thread is allowed to leave its arena before all its work is executed,
+ // and market may temporarily revoke all workers from this arena. Since revoked
+ // workers never attempt to reset arena state to EMPTY and cancel its request
+ // to RML for threads, the arena object is destroyed only when both the last
+ // thread is leaving it and arena's state is EMPTY (that is its external thread
+ // left and it does not contain any work).
+ // Thus resetting arena to EMPTY state (as earlier TBB versions did) should not
+    // be done here (or anywhere else in the external thread, for that matter); doing so
+ // can result either in arena's premature destruction (at least without
+ // additional costly checks in workers) or in unnecessary arena state changes
+ // (and ensuing workers migration).
+ //
+ // A worker that checks for work presence and transitions arena to the EMPTY
+ // state (in snapshot taking procedure arena::is_out_of_work()) updates
+ // arena::my_pool_state first and only then arena::my_num_workers_requested.
+ // So the check for work absence must be done against the latter field.
+ //
+    // In the time window between decrementing the active threads count and checking
+    // for an outstanding request for workers, a new worker thread may arrive, finish
+    // the remaining work, set the arena state to empty, and leave, decrementing its
+    // refcount and destroying the arena. Then the current thread would destroy the
+    // arena a second time. To preclude this, a local copy of the outstanding request
+    // value can be stored before decrementing the active threads count.
+ //
+    // But this technique may cause two other problems. When the stored request is
+    // zero, it is possible that the arena still has threads, and they can generate new
+    // tasks and thus re-establish non-zero requests. Then all the threads can be
+    // revoked (as described above) leaving this thread the last one, and causing
+    // it to destroy a non-empty arena.
+ //
+ // The other problem takes place when the stored request is non-zero. Another
+ // thread may complete the work, set arena state to empty, and leave without
+ // arena destruction before this thread decrements the refcount. This thread
+ // cannot destroy the arena either. Thus the arena may be "orphaned".
+ //
+ // In both cases we cannot dereference arena pointer after the refcount is
+ // decremented, as our arena may already be destroyed.
+ //
+    // If this is the external thread, the market is protected by the refcount it holds.
+    // In the case of workers, the market's liveness is ensured by the RML connection
+ // rundown protocol, according to which the client (i.e. the market) lives
+ // until RML server notifies it about connection termination, and this
+ // notification is fired only after all workers return into RML.
+ //
+ // Thus if we decremented refcount to zero we ask the market to check arena
+ // state (including the fact if it is alive) under the lock.
+ //
+ std::uintptr_t aba_epoch = my_aba_epoch;
+ unsigned priority_level = my_priority_level;
+ market* m = my_market;
+ __TBB_ASSERT(my_references.load(std::memory_order_relaxed) >= ref_param, "broken arena reference counter");
+#if __TBB_ENQUEUE_ENFORCED_CONCURRENCY
+    // When there are no workers, someone must free the arena, because without
+    // workers no one calls is_out_of_work().
+    // Skip workerless arenas because they have no demand for workers.
+    // TODO: consider stricter conditions for the cleanup, because it can create
+    // demand for workers while the arena may already be empty (and thus ready
+    // for destruction).
+    // TODO: fix the race: the soft limit may change while we are checking it.
+ if( ref_param==ref_external && my_num_slots != my_num_reserved_slots
+ && 0 == m->my_num_workers_soft_limit.load(std::memory_order_relaxed) &&
+ !my_global_concurrency_mode.load(std::memory_order_relaxed) ) {
+ is_out_of_work();
+        // We expect that, in the worst case, num_priority_levels-1 calls are enough
+        // to restore priorities, plus one more is_out_of_work() to confirm that no
+        // work was found. But since market::set_active_num_workers() can be called
+        // concurrently, we cannot guarantee that the last is_out_of_work() returns true.
+ }
+#endif
+ if ( (my_references -= ref_param ) == 0 )
+ m->try_destroy_arena( this, aba_epoch, priority_level );
+}
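+
+// The pattern above in isolation: everything the leaving thread needs must be copied
+// to locals *before* the reference counter is decremented, because right after the
+// decrement the object may already be destroyed by another thread. A minimal sketch
+// follows (kept under "#if 0"); registry_t and try_destroy() are illustrative names,
+// not TBB entities, and only <atomic>/<cstdint> are assumed.
+#if 0
+struct registry_t { void try_destroy(void* obj, std::uintptr_t epoch); }; // hypothetical owner
+struct ref_counted {
+    std::atomic<unsigned> my_refs{1};
+    std::uintptr_t my_aba_epoch{0};
+    registry_t* my_owner{nullptr};
+
+    void on_leaving(unsigned delta) {
+        // Copy the fields needed after the decrement while the object is still alive.
+        std::uintptr_t epoch = my_aba_epoch;
+        registry_t* owner = my_owner;
+        if ((my_refs -= delta) == 0) {
+            // 'this' may be dangling for any purpose other than identity;
+            // the owner re-checks liveness (via the epoch) under its own lock.
+            owner->try_destroy(this, epoch);
+        }
+    }
+};
+#endif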
+
+template<arena::new_work_type work_type>
+void arena::advertise_new_work() {
+ auto is_related_arena = [&] (extended_context context) {
+ return this == context.my_arena_addr;
+ };
+
+ if( work_type == work_enqueued ) {
+ atomic_fence(std::memory_order_seq_cst);
+#if __TBB_ENQUEUE_ENFORCED_CONCURRENCY
+ if ( my_market->my_num_workers_soft_limit.load(std::memory_order_acquire) == 0 &&
+ my_global_concurrency_mode.load(std::memory_order_acquire) == false )
+ my_market->enable_mandatory_concurrency(this);
+
+ if (my_max_num_workers == 0 && my_num_reserved_slots == 1 && my_local_concurrency_flag.test_and_set()) {
+ my_market->adjust_demand(*this, /* delta = */ 1, /* mandatory = */ true);
+ }
+#endif /* __TBB_ENQUEUE_ENFORCED_CONCURRENCY */
+ // Local memory fence here and below is required to avoid missed wakeups; see the comment below.
+ // Starvation resistant tasks require concurrency, so missed wakeups are unacceptable.
+ }
+ else if( work_type == wakeup ) {
+ atomic_fence(std::memory_order_seq_cst);
+ }
+
+ // Double-check idiom that, in case of spawning, is deliberately sloppy about memory fences.
+ // Technically, to avoid missed wakeups, there should be a full memory fence between the point we
+ // released the task pool (i.e. spawned task) and read the arena's state. However, adding such a
+ // fence might hurt overall performance more than it helps, because the fence would be executed
+ // on every task pool release, even when stealing does not occur. Since TBB allows parallelism,
+ // but never promises parallelism, the missed wakeup is not a correctness problem.
+ pool_state_t snapshot = my_pool_state.load(std::memory_order_acquire);
+ if( is_busy_or_empty(snapshot) ) {
+        // Attempt to mark as full. The compare_exchange below is a little unusual because
+        // its result is compared to a value that can be different from the expected argument.
+ pool_state_t expected_state = snapshot;
+ my_pool_state.compare_exchange_strong( expected_state, SNAPSHOT_FULL );
+ if( expected_state == SNAPSHOT_EMPTY ) {
+ if( snapshot != SNAPSHOT_EMPTY ) {
+                // This thread read "busy" into snapshot, and then another thread transitioned
+                // my_pool_state to "empty" in the meantime, which caused the compare_exchange above
+                // to fail. Attempt to transition my_pool_state from "empty" to "full".
+ expected_state = SNAPSHOT_EMPTY;
+ if( !my_pool_state.compare_exchange_strong( expected_state, SNAPSHOT_FULL ) ) {
+ // Some other thread transitioned my_pool_state from "empty", and hence became
+ // responsible for waking up workers.
+ return;
+ }
+ }
+ // This thread transitioned pool from empty to full state, and thus is responsible for
+ // telling the market that there is work to do.
+#if __TBB_ENQUEUE_ENFORCED_CONCURRENCY
+ if( work_type == work_spawned ) {
+ if ( my_global_concurrency_mode.load(std::memory_order_acquire) == true )
+ my_market->mandatory_concurrency_disable( this );
+ }
+#endif /* __TBB_ENQUEUE_ENFORCED_CONCURRENCY */
+ // TODO: investigate adjusting of arena's demand by a single worker.
+ my_market->adjust_demand(*this, my_max_num_workers, /* mandatory = */ false);
+
+ // Notify all sleeping threads that work has appeared in the arena.
+ my_market->get_wait_list().notify(is_related_arena);
+ }
+ }
+}
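+
+// A simplified standalone sketch of the EMPTY -> FULL hand-off used above: many threads
+// may observe an empty pool state, but exactly one wins the compare_exchange and becomes
+// responsible for requesting workers. The names (pool_state_sketch, request_workers) are
+// illustrative only, and the busy -> FULL path of the real code is omitted for brevity.
+#if 0
+namespace sketch {
+    using state_t = std::uintptr_t;
+    constexpr state_t EMPTY = 0;
+    constexpr state_t FULL  = state_t(-1);
+
+    std::atomic<state_t> pool_state_sketch{EMPTY};
+    void request_workers(); // hypothetical
+
+    void on_new_work() {
+        state_t snapshot = pool_state_sketch.load(std::memory_order_acquire);
+        if (snapshot == FULL)
+            return;                 // work is already advertised, nothing to do
+        state_t expected = EMPTY;
+        if (pool_state_sketch.compare_exchange_strong(expected, FULL)) {
+            // Exactly one thread per EMPTY period reaches this point.
+            request_workers();
+        }
+    }
+} // namespace sketch
+#endif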
+
+inline d1::task* arena::steal_task(unsigned arena_index, FastRandom& frnd, execution_data_ext& ed, isolation_type isolation) {
+ auto slot_num_limit = my_limit.load(std::memory_order_relaxed);
+ if (slot_num_limit == 1) {
+ // No slots to steal from
+ return nullptr;
+ }
+ // Try to steal a task from a random victim.
+ std::size_t k = frnd.get() % (slot_num_limit - 1);
+    // The following condition excludes the external thread that might have
+    // already taken our previous place in the arena from the list
+    // of potential victims. But since such a situation can take place only
+    // in case of significant oversubscription, keeping the checks simple
+    // seems preferable to complicating the code.
+ if (k >= arena_index) {
+ ++k; // Adjusts random distribution to exclude self
+ }
+ arena_slot* victim = &my_slots[k];
+ d1::task **pool = victim->task_pool.load(std::memory_order_relaxed);
+ d1::task *t = nullptr;
+ if (pool == EmptyTaskPool || !(t = victim->steal_task(*this, isolation))) {
+ return nullptr;
+ }
+ if (task_accessor::is_proxy_task(*t)) {
+ task_proxy &tp = *(task_proxy*)t;
+ d1::slot_id slot = tp.slot;
+ t = tp.extract_task<task_proxy::pool_bit>();
+ if (!t) {
+ // Proxy was empty, so it's our responsibility to free it
+ tp.allocator.delete_object(&tp, ed);
+ return nullptr;
+ }
+        // Note: the affinity slot is set for any stolen task (proxy or general)
+ ed.affinity_slot = slot;
+ } else {
+        // Note: the affinity slot is set for any stolen task (proxy or general)
+ ed.affinity_slot = d1::any_slot;
+ }
+ // Update task owner thread id to identify stealing
+ ed.original_slot = k;
+ return t;
+}
+
+template<task_stream_accessor_type accessor>
+inline d1::task* arena::get_stream_task(task_stream<accessor>& stream, unsigned& hint) {
+ if (stream.empty())
+ return nullptr;
+ return stream.pop(subsequent_lane_selector(hint));
+}
+
+#if __TBB_PREVIEW_CRITICAL_TASKS
+// Retrieves critical task respecting isolation level, if provided. The rule is:
+// 1) If no outer critical task and no isolation => take any critical task
+// 2) If working on an outer critical task and no isolation => cannot take any critical task
+// 3) If no outer critical task but isolated => respect isolation
+// 4) If working on an outer critical task and isolated => respect isolation
+// Hint is used to keep some LIFO-ness, start search with the lane that was used during push operation.
+inline d1::task* arena::get_critical_task(unsigned& hint, isolation_type isolation) {
+ if (my_critical_task_stream.empty())
+ return nullptr;
+
+ if ( isolation != no_isolation ) {
+ return my_critical_task_stream.pop_specific( hint, isolation );
+ } else {
+ return my_critical_task_stream.pop(preceding_lane_selector(hint));
+ }
+}
+#endif // __TBB_PREVIEW_CRITICAL_TASKS
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
+
+#endif /* _TBB_arena_H */
diff --git a/contrib/libs/tbb/src/tbb/arena_slot.cpp b/contrib/libs/tbb/src/tbb/arena_slot.cpp
new file mode 100644
index 0000000000..72706b3de5
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/arena_slot.cpp
@@ -0,0 +1,219 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "arena_slot.h"
+#include "arena.h"
+#include "thread_data.h"
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+//------------------------------------------------------------------------
+// Arena Slot
+//------------------------------------------------------------------------
+d1::task* arena_slot::get_task_impl(size_t T, execution_data_ext& ed, bool& tasks_omitted, isolation_type isolation) {
+ __TBB_ASSERT(tail.load(std::memory_order_relaxed) <= T || is_local_task_pool_quiescent(),
+ "Is it safe to get a task at position T?");
+
+ d1::task* result = task_pool_ptr[T];
+ __TBB_ASSERT(!is_poisoned( result ), "The poisoned task is going to be processed");
+
+ if (!result) {
+ return nullptr;
+ }
+ bool omit = isolation != no_isolation && isolation != task_accessor::isolation(*result);
+ if (!omit && !task_accessor::is_proxy_task(*result)) {
+ return result;
+ } else if (omit) {
+ tasks_omitted = true;
+ return nullptr;
+ }
+
+ task_proxy& tp = static_cast<task_proxy&>(*result);
+ d1::slot_id aff_id = tp.slot;
+ if ( d1::task *t = tp.extract_task<task_proxy::pool_bit>() ) {
+ ed.affinity_slot = aff_id;
+ return t;
+ }
+ // Proxy was empty, so it's our responsibility to free it
+ tp.allocator.delete_object(&tp, ed);
+
+ if ( tasks_omitted ) {
+ task_pool_ptr[T] = nullptr;
+ }
+ return nullptr;
+}
+
+d1::task* arena_slot::get_task(execution_data_ext& ed, isolation_type isolation) {
+ __TBB_ASSERT(is_task_pool_published(), nullptr);
+ // The current task position in the task pool.
+ std::size_t T0 = tail.load(std::memory_order_relaxed);
+ // The bounds of available tasks in the task pool. H0 is only used when the head bound is reached.
+ std::size_t H0 = (std::size_t)-1, T = T0;
+ d1::task* result = nullptr;
+ bool task_pool_empty = false;
+ bool tasks_omitted = false;
+ do {
+ __TBB_ASSERT( !result, nullptr );
+ // The full fence is required to sync the store of `tail` with the load of `head` (write-read barrier)
+ T = --tail;
+ // The acquire load of head is required to guarantee consistency of our task pool
+ // when a thief rolls back the head.
+ if ( (std::intptr_t)( head.load(std::memory_order_acquire) ) > (std::intptr_t)T ) {
+ acquire_task_pool();
+ H0 = head.load(std::memory_order_relaxed);
+ if ( (std::intptr_t)H0 > (std::intptr_t)T ) {
+ // The thief has not backed off - nothing to grab.
+ __TBB_ASSERT( H0 == head.load(std::memory_order_relaxed)
+ && T == tail.load(std::memory_order_relaxed)
+ && H0 == T + 1, "victim/thief arbitration algorithm failure" );
+ reset_task_pool_and_leave();
+ // No tasks in the task pool.
+ task_pool_empty = true;
+ break;
+ } else if ( H0 == T ) {
+ // There is only one task in the task pool.
+ reset_task_pool_and_leave();
+ task_pool_empty = true;
+ } else {
+ // Release task pool if there are still some tasks.
+ // After the release, the tail will be less than T, thus a thief
+ // will not attempt to get a task at position T.
+ release_task_pool();
+ }
+ }
+ result = get_task_impl( T, ed, tasks_omitted, isolation );
+ if ( result ) {
+ poison_pointer( task_pool_ptr[T] );
+ break;
+ } else if ( !tasks_omitted ) {
+ poison_pointer( task_pool_ptr[T] );
+ __TBB_ASSERT( T0 == T+1, nullptr );
+ T0 = T;
+ }
+ } while ( !result && !task_pool_empty );
+
+ if ( tasks_omitted ) {
+ if ( task_pool_empty ) {
+ // All tasks have been checked. The task pool should be in reset state.
+ // We just restore the bounds for the available tasks.
+            // TODO: Does it make sense to move them to the beginning of the task pool?
+ __TBB_ASSERT( is_quiescent_local_task_pool_reset(), nullptr );
+ if ( result ) {
+ // If we have a task, it should be at H0 position.
+ __TBB_ASSERT( H0 == T, nullptr );
+ ++H0;
+ }
+ __TBB_ASSERT( H0 <= T0, nullptr );
+ if ( H0 < T0 ) {
+ // Restore the task pool if there are some tasks.
+ head.store(H0, std::memory_order_relaxed);
+ tail.store(T0, std::memory_order_relaxed);
+ // The release fence is used in publish_task_pool.
+ publish_task_pool();
+ // Synchronize with snapshot as we published some tasks.
+ ed.task_disp->m_thread_data->my_arena->advertise_new_work<arena::wakeup>();
+ }
+ } else {
+ // A task has been obtained. We need to make a hole in position T.
+ __TBB_ASSERT( is_task_pool_published(), nullptr );
+ __TBB_ASSERT( result, nullptr );
+ task_pool_ptr[T] = nullptr;
+ tail.store(T0, std::memory_order_release);
+ // Synchronize with snapshot as we published some tasks.
+ // TODO: consider some approach not to call wakeup for each time. E.g. check if the tail reached the head.
+ ed.task_disp->m_thread_data->my_arena->advertise_new_work<arena::wakeup>();
+ }
+ }
+
+ __TBB_ASSERT( (std::intptr_t)tail.load(std::memory_order_relaxed) >= 0, nullptr );
+ __TBB_ASSERT( result || tasks_omitted || is_quiescent_local_task_pool_reset(), nullptr );
+ return result;
+}
+
+d1::task* arena_slot::steal_task(arena& a, isolation_type isolation) {
+ d1::task** victim_pool = lock_task_pool();
+ if (!victim_pool) {
+ return nullptr;
+ }
+ d1::task* result = nullptr;
+ std::size_t H = head.load(std::memory_order_relaxed); // mirror
+ std::size_t H0 = H;
+ bool tasks_omitted = false;
+ do {
+ // The full fence is required to sync the store of `head` with the load of `tail` (write-read barrier)
+ H = ++head;
+ // The acquire load of tail is required to guarantee consistency of victim_pool
+ // because the owner synchronizes task spawning via tail.
+ if ((std::intptr_t)H > (std::intptr_t)(tail.load(std::memory_order_acquire))) {
+            // Stealing attempt failed, deque contents have not been changed by us
+ head.store( /*dead: H = */ H0, std::memory_order_relaxed );
+ __TBB_ASSERT( !result, nullptr );
+ goto unlock;
+ }
+ result = victim_pool[H-1];
+ __TBB_ASSERT( !is_poisoned( result ), nullptr );
+
+ if (result) {
+ if (isolation == no_isolation || isolation == task_accessor::isolation(*result)) {
+ if (!task_accessor::is_proxy_task(*result)) {
+ break;
+ }
+ task_proxy& tp = *static_cast<task_proxy*>(result);
+ // If mailed task is likely to be grabbed by its destination thread, skip it.
+ if ( !(task_proxy::is_shared( tp.task_and_tag ) && tp.outbox->recipient_is_idle()) ) {
+ break;
+ }
+ }
+ // The task cannot be executed either due to isolation or proxy constraints.
+ result = nullptr;
+ tasks_omitted = true;
+ } else if (!tasks_omitted) {
+            // Clean up holes in the task pool until a task is skipped.
+ __TBB_ASSERT( H0 == H-1, nullptr );
+ poison_pointer( victim_pool[H0] );
+ H0 = H;
+ }
+ } while (!result);
+ __TBB_ASSERT( result, nullptr );
+
+ // emit "task was consumed" signal
+ poison_pointer( victim_pool[H-1] );
+ if (tasks_omitted) {
+ // Some proxies in the task pool have been omitted. Set the stolen task to nullptr.
+ victim_pool[H-1] = nullptr;
+        // The release store publishes the victim_pool update (the store of nullptr).
+ head.store( /*dead: H = */ H0, std::memory_order_release );
+ }
+unlock:
+ unlock_task_pool(victim_pool);
+
+#if __TBB_PREFETCHING
+    __TBB_cl_evict(&head);
+    __TBB_cl_evict(&tail);
+#endif
+ if (tasks_omitted) {
+        // Synchronize with snapshot as head and tail may have been bumped, which can falsely trigger the EMPTY state
+ a.advertise_new_work<arena::wakeup>();
+ }
+ return result;
+}
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
+
diff --git a/contrib/libs/tbb/src/tbb/arena_slot.h b/contrib/libs/tbb/src/tbb/arena_slot.h
new file mode 100644
index 0000000000..83d61d2197
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/arena_slot.h
@@ -0,0 +1,409 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef _TBB_arena_slot_H
+#define _TBB_arena_slot_H
+
+#include "oneapi/tbb/detail/_config.h"
+#include "oneapi/tbb/detail/_utils.h"
+#include "oneapi/tbb/detail/_template_helpers.h"
+#include "oneapi/tbb/detail/_task.h"
+
+#include "oneapi/tbb/cache_aligned_allocator.h"
+
+#include "misc.h"
+#include "mailbox.h"
+#include "scheduler_common.h"
+
+#include <atomic>
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+class arena;
+class task_group_context;
+
+//--------------------------------------------------------------------------------------------------------
+// Arena Slot
+//--------------------------------------------------------------------------------------------------------
+
+static d1::task** const EmptyTaskPool = nullptr;
+static d1::task** const LockedTaskPool = reinterpret_cast<d1::task**>(~std::intptr_t(0));
+
+struct alignas(max_nfs_size) arena_slot_shared_state {
+ //! Scheduler of the thread attached to the slot
+ /** Marks the slot as busy, and is used to iterate through the schedulers belonging to this arena **/
+ std::atomic<bool> my_is_occupied;
+
+ // Synchronization of access to Task pool
+ /** Also is used to specify if the slot is empty or locked:
+ 0 - empty
+ -1 - locked **/
+ std::atomic<d1::task**> task_pool;
+
+ //! Index of the first ready task in the deque.
+ /** Modified by thieves, and by the owner during compaction/reallocation **/
+ std::atomic<std::size_t> head;
+};
+
+struct alignas(max_nfs_size) arena_slot_private_state {
+ //! Hint provided for operations with the container of starvation-resistant tasks.
+ /** Modified by the owner thread (during these operations). **/
+ unsigned hint_for_fifo_stream;
+
+#if __TBB_PREVIEW_CRITICAL_TASKS
+ //! Similar to 'hint_for_fifo_stream' but for critical tasks.
+ unsigned hint_for_critical_stream;
+#endif
+
+ //! Similar to 'hint_for_fifo_stream' but for the resume tasks.
+ unsigned hint_for_resume_stream;
+
+ //! Index of the element following the last ready task in the deque.
+ /** Modified by the owner thread. **/
+ std::atomic<std::size_t> tail;
+
+ //! Capacity of the primary task pool (number of elements - pointers to task).
+ std::size_t my_task_pool_size;
+
+ //! Task pool of the scheduler that owns this slot
+    // TODO: previously was task**__TBB_atomic, but it seems it is not accessed by other threads
+ d1::task** task_pool_ptr;
+};
+
+class arena_slot : private arena_slot_shared_state, private arena_slot_private_state {
+ friend class arena;
+ friend class outermost_worker_waiter;
+ friend class task_dispatcher;
+ friend class thread_data;
+ friend class nested_arena_context;
+
+    //! The original task dispatcher associated with this slot
+ task_dispatcher* my_default_task_dispatcher;
+
+#if TBB_USE_ASSERT
+ void fill_with_canary_pattern ( std::size_t first, std::size_t last ) {
+ for ( std::size_t i = first; i < last; ++i )
+ poison_pointer(task_pool_ptr[i]);
+ }
+#else
+    void fill_with_canary_pattern ( std::size_t, std::size_t ) {}
+#endif /* TBB_USE_ASSERT */
+
+ static constexpr std::size_t min_task_pool_size = 64;
+
+ void allocate_task_pool( std::size_t n ) {
+ std::size_t byte_size = ((n * sizeof(d1::task*) + max_nfs_size - 1) / max_nfs_size) * max_nfs_size;
+ my_task_pool_size = byte_size / sizeof(d1::task*);
+ task_pool_ptr = (d1::task**)cache_aligned_allocate(byte_size);
+ // No need to clear the fresh deque since valid items are designated by the head and tail members.
+ // But fill it with a canary pattern in the high vigilance debug mode.
+ fill_with_canary_pattern( 0, my_task_pool_size );
+ }
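+
+    // The byte size computed above is the classic "round up to a multiple" formula.
+    // A standalone restatement, for illustration only (kept under "#if 0", assuming <cstddef>):
+#if 0
+    constexpr std::size_t round_up(std::size_t bytes, std::size_t granularity) {
+        return ((bytes + granularity - 1) / granularity) * granularity;
+    }
+    static_assert(round_up(65, 64) == 128, "65 bytes occupy two 64-byte lines");
+    static_assert(round_up(64, 64) == 64,  "an exact multiple is unchanged");
+#endif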
+
+public:
+ //! Deallocate task pool that was allocated by means of allocate_task_pool.
+ void free_task_pool( ) {
+ // TODO: understand the assertion and modify
+ // __TBB_ASSERT( !task_pool /* TODO: == EmptyTaskPool */, NULL);
+ if( task_pool_ptr ) {
+ __TBB_ASSERT( my_task_pool_size, NULL);
+ cache_aligned_deallocate( task_pool_ptr );
+ task_pool_ptr = NULL;
+ my_task_pool_size = 0;
+ }
+ }
+
+ //! Get a task from the local pool.
+ /** Called only by the pool owner.
+ Returns the pointer to the task or NULL if a suitable task is not found.
+ Resets the pool if it is empty. **/
+ d1::task* get_task(execution_data_ext&, isolation_type);
+
+ //! Steal task from slot's ready pool
+ d1::task* steal_task(arena&, isolation_type);
+
+ //! Some thread is now the owner of this slot
+ void occupy() {
+ __TBB_ASSERT(!my_is_occupied.load(std::memory_order_relaxed), nullptr);
+ my_is_occupied.store(true, std::memory_order_release);
+ }
+
+ //! Try to occupy the slot
+ bool try_occupy() {
+ return !is_occupied() && my_is_occupied.exchange(true) == false;
+ }
+
+    //! The owner thread releases this slot
+ void release() {
+ __TBB_ASSERT(my_is_occupied.load(std::memory_order_relaxed), nullptr);
+ my_is_occupied.store(false, std::memory_order_release);
+ }
+
+ //! Spawn newly created tasks
+ void spawn(d1::task& t) {
+ std::size_t T = prepare_task_pool(1);
+ __TBB_ASSERT(is_poisoned(task_pool_ptr[T]), NULL);
+ task_pool_ptr[T] = &t;
+ commit_spawned_tasks(T + 1);
+ if (!is_task_pool_published()) {
+ publish_task_pool();
+ }
+ }
+
+ bool is_task_pool_published() const {
+ return task_pool.load(std::memory_order_relaxed) != EmptyTaskPool;
+ }
+
+ bool is_occupied() const {
+ return my_is_occupied.load(std::memory_order_relaxed);
+ }
+
+ task_dispatcher& default_task_dispatcher() {
+ __TBB_ASSERT(my_default_task_dispatcher != nullptr, nullptr);
+ return *my_default_task_dispatcher;
+ }
+
+ void init_task_streams(unsigned h) {
+ hint_for_fifo_stream = h;
+#if __TBB_RESUMABLE_TASKS
+ hint_for_resume_stream = h;
+#endif
+#if __TBB_PREVIEW_CRITICAL_TASKS
+ hint_for_critical_stream = h;
+#endif
+ }
+
+#if __TBB_PREVIEW_CRITICAL_TASKS
+ unsigned& critical_hint() {
+ return hint_for_critical_stream;
+ }
+#endif
+private:
+ //! Get a task from the local pool at specified location T.
+ /** Returns the pointer to the task or NULL if the task cannot be executed,
+ e.g. proxy has been deallocated or isolation constraint is not met.
+ tasks_omitted tells if some tasks have been omitted.
+ Called only by the pool owner. The caller should guarantee that the
+ position T is not available for a thief. **/
+ d1::task* get_task_impl(size_t T, execution_data_ext& ed, bool& tasks_omitted, isolation_type isolation);
+
+    //! Makes sure that the task pool can accommodate at least num_tasks more elements
+    /** If necessary relocates existing task pointers or grows the ready task deque.
+        Returns the (possibly updated) tail index (not accounting for num_tasks). **/
+ std::size_t prepare_task_pool(std::size_t num_tasks) {
+ std::size_t T = tail.load(std::memory_order_relaxed); // mirror
+ if ( T + num_tasks <= my_task_pool_size ) {
+ return T;
+ }
+
+ std::size_t new_size = num_tasks;
+ if ( !my_task_pool_size ) {
+ __TBB_ASSERT( !is_task_pool_published() && is_quiescent_local_task_pool_reset(), NULL );
+ __TBB_ASSERT( !task_pool_ptr, NULL );
+ if ( num_tasks < min_task_pool_size ) new_size = min_task_pool_size;
+ allocate_task_pool( new_size );
+ return 0;
+ }
+ acquire_task_pool();
+ std::size_t H = head.load(std::memory_order_relaxed); // mirror
+        d1::task** new_task_pool = task_pool_ptr;
+ __TBB_ASSERT( my_task_pool_size >= min_task_pool_size, NULL );
+ // Count not skipped tasks. Consider using std::count_if.
+ for ( std::size_t i = H; i < T; ++i )
+ if ( new_task_pool[i] ) ++new_size;
+ // If the free space at the beginning of the task pool is too short, we
+ // are likely facing a pathological single-producer-multiple-consumers
+ // scenario, and thus it's better to expand the task pool
+ bool allocate = new_size > my_task_pool_size - min_task_pool_size/4;
+ if ( allocate ) {
+ // Grow task pool. As this operation is rare, and its cost is asymptotically
+ // amortizable, we can tolerate new task pool allocation done under the lock.
+ if ( new_size < 2 * my_task_pool_size )
+ new_size = 2 * my_task_pool_size;
+ allocate_task_pool( new_size ); // updates my_task_pool_size
+ }
+ // Filter out skipped tasks. Consider using std::copy_if.
+ std::size_t T1 = 0;
+ for ( std::size_t i = H; i < T; ++i ) {
+ if ( new_task_pool[i] ) {
+ task_pool_ptr[T1++] = new_task_pool[i];
+ }
+ }
+ // Deallocate the previous task pool if a new one has been allocated.
+ if ( allocate )
+ cache_aligned_deallocate( new_task_pool );
+ else
+ fill_with_canary_pattern( T1, tail );
+ // Publish the new state.
+ commit_relocated_tasks( T1 );
+ // assert_task_pool_valid();
+ return T1;
+ }
+
+ //! Makes newly spawned tasks visible to thieves
+ void commit_spawned_tasks(std::size_t new_tail) {
+ __TBB_ASSERT (new_tail <= my_task_pool_size, "task deque end was overwritten");
+ // emit "task was released" signal
+ // Release fence is necessary to make sure that previously stored task pointers
+ // are visible to thieves.
+ tail.store(new_tail, std::memory_order_release);
+ }
+
+ //! Used by workers to enter the task pool
+    /** Does not lock the task pool in case the arena slot has been successfully grabbed. **/
+ void publish_task_pool() {
+ __TBB_ASSERT ( task_pool == EmptyTaskPool, "someone else grabbed my arena slot?" );
+ __TBB_ASSERT ( head.load(std::memory_order_relaxed) < tail.load(std::memory_order_relaxed),
+ "entering arena without tasks to share" );
+ // Release signal on behalf of previously spawned tasks (when this thread was not in arena yet)
+ task_pool.store(task_pool_ptr, std::memory_order_release );
+ }
+
+ //! Locks the local task pool
+ /** Garbles task_pool for the duration of the lock. Requires correctly set task_pool_ptr.
+ ATTENTION: This method is mostly the same as generic_scheduler::lock_task_pool(), with
+        slightly different logic of slot state checks (the slot is either locked or points
+ to our task pool). Thus if either of them is changed, consider changing the counterpart as well. **/
+ void acquire_task_pool() {
+ if (!is_task_pool_published()) {
+ return; // we are not in arena - nothing to lock
+ }
+ bool sync_prepare_done = false;
+ for( atomic_backoff b;;b.pause() ) {
+#if TBB_USE_ASSERT
+ // Local copy of the arena slot task pool pointer is necessary for the next
+ // assertion to work correctly to exclude asynchronous state transition effect.
+ d1::task** tp = task_pool.load(std::memory_order_relaxed);
+ __TBB_ASSERT( tp == LockedTaskPool || tp == task_pool_ptr, "slot ownership corrupt?" );
+#endif
+ d1::task** expected = task_pool_ptr;
+ if( task_pool.load(std::memory_order_relaxed) != LockedTaskPool &&
+ task_pool.compare_exchange_strong(expected, LockedTaskPool ) ) {
+ // We acquired our own slot
+ break;
+ } else if( !sync_prepare_done ) {
+ // Start waiting
+ sync_prepare_done = true;
+ }
+ // Someone else acquired a lock, so pause and do exponential backoff.
+ }
+ __TBB_ASSERT( task_pool.load(std::memory_order_relaxed) == LockedTaskPool, "not really acquired task pool" );
+ }
+
+ //! Unlocks the local task pool
+ /** Restores task_pool munged by acquire_task_pool. Requires
+ correctly set task_pool_ptr. **/
+ void release_task_pool() {
+        if ( task_pool.load(std::memory_order_relaxed) == EmptyTaskPool )
+ return; // we are not in arena - nothing to unlock
+ __TBB_ASSERT( task_pool.load(std::memory_order_relaxed) == LockedTaskPool, "arena slot is not locked" );
+ task_pool.store( task_pool_ptr, std::memory_order_release );
+ }
+
+ //! Locks victim's task pool, and returns pointer to it. The pointer can be NULL.
+ /** Garbles victim_arena_slot->task_pool for the duration of the lock. **/
+ d1::task** lock_task_pool() {
+ d1::task** victim_task_pool;
+ for ( atomic_backoff backoff;; /*backoff pause embedded in the loop*/) {
+ victim_task_pool = task_pool.load(std::memory_order_relaxed);
+            // Microbenchmarks demonstrated that aborting the stealing attempt when the
+            // victim's task pool is locked degrades performance.
+ // NOTE: Do not use comparison of head and tail indices to check for
+ // the presence of work in the victim's task pool, as they may give
+ // incorrect indication because of task pool relocations and resizes.
+ if (victim_task_pool == EmptyTaskPool) {
+ break;
+ }
+ d1::task** expected = victim_task_pool;
+ if (victim_task_pool != LockedTaskPool && task_pool.compare_exchange_strong(expected, LockedTaskPool) ) {
+ // We've locked victim's task pool
+ break;
+ }
+ // Someone else acquired a lock, so pause and do exponential backoff.
+ backoff.pause();
+ }
+ __TBB_ASSERT(victim_task_pool == EmptyTaskPool ||
+ (task_pool.load(std::memory_order_relaxed) == LockedTaskPool &&
+ victim_task_pool != LockedTaskPool), "not really locked victim's task pool?");
+ return victim_task_pool;
+ }
+
+ //! Unlocks victim's task pool
+ /** Restores victim_arena_slot->task_pool munged by lock_task_pool. **/
+ void unlock_task_pool(d1::task** victim_task_pool) {
+ __TBB_ASSERT(task_pool.load(std::memory_order_relaxed) == LockedTaskPool, "victim arena slot is not locked");
+ __TBB_ASSERT(victim_task_pool != LockedTaskPool, NULL);
+ task_pool.store(victim_task_pool, std::memory_order_release);
+ }
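+
+    // Both acquire_task_pool() and lock_task_pool() above are instances of the same
+    // shape: lock by atomically swapping in a sentinel value, spin with exponential
+    // backoff on contention, unlock with a release store. A generic sketch of that
+    // shape, for illustration only (the real code locks the pool *pointer* with the
+    // LockedTaskPool sentinel rather than a bool):
+#if 0
+    struct backoff_lock_sketch {
+        std::atomic<bool> locked{false};
+        void lock() {
+            for (atomic_backoff b; ; b.pause()) {            // exponential backoff, then yield
+                if (!locked.load(std::memory_order_relaxed) &&
+                    !locked.exchange(true, std::memory_order_acquire)) {
+                    break;                                   // we own the lock
+                }
+            }
+        }
+        void unlock() { locked.store(false, std::memory_order_release); }
+    };
+#endif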
+
+#if TBB_USE_ASSERT
+ bool is_local_task_pool_quiescent() const {
+ d1::task** tp = task_pool.load(std::memory_order_relaxed);
+ return tp == EmptyTaskPool || tp == LockedTaskPool;
+ }
+
+ bool is_quiescent_local_task_pool_empty() const {
+ __TBB_ASSERT(is_local_task_pool_quiescent(), "Task pool is not quiescent");
+ return head.load(std::memory_order_relaxed) == tail.load(std::memory_order_relaxed);
+ }
+
+ bool is_quiescent_local_task_pool_reset() const {
+ __TBB_ASSERT(is_local_task_pool_quiescent(), "Task pool is not quiescent");
+ return head.load(std::memory_order_relaxed) == 0 && tail.load(std::memory_order_relaxed) == 0;
+ }
+#endif // TBB_USE_ASSERT
+
+ //! Leave the task pool
+ /** Leaving task pool automatically releases the task pool if it is locked. **/
+ void leave_task_pool() {
+ __TBB_ASSERT(is_task_pool_published(), "Not in arena");
+ // Do not reset my_arena_index. It will be used to (attempt to) re-acquire the slot next time
+ __TBB_ASSERT(task_pool.load(std::memory_order_relaxed) == LockedTaskPool, "Task pool must be locked when leaving arena");
+ __TBB_ASSERT(is_quiescent_local_task_pool_empty(), "Cannot leave arena when the task pool is not empty");
+ // No release fence is necessary here as this assignment precludes external
+        // accesses to the local task pool once it becomes visible. Thus it is harmless
+ // if it gets hoisted above preceding local bookkeeping manipulations.
+ task_pool.store(EmptyTaskPool, std::memory_order_relaxed);
+ }
+
+ //! Resets head and tail indices to 0, and leaves task pool
+ /** The task pool must be locked by the owner (via acquire_task_pool).**/
+ void reset_task_pool_and_leave() {
+ __TBB_ASSERT(task_pool.load(std::memory_order_relaxed) == LockedTaskPool, "Task pool must be locked when resetting task pool");
+ tail.store(0, std::memory_order_relaxed);
+ head.store(0, std::memory_order_relaxed);
+ leave_task_pool();
+ }
+
+ //! Makes relocated tasks visible to thieves and releases the local task pool.
+ /** Obviously, the task pool must be locked when calling this method. **/
+ void commit_relocated_tasks(std::size_t new_tail) {
+ __TBB_ASSERT(is_local_task_pool_quiescent(), "Task pool must be locked when calling commit_relocated_tasks()");
+ head.store(0, std::memory_order_relaxed);
+        // Tail is updated last to minimize the probability that a thread taking an arena
+        // snapshot is misguided into thinking that this task pool is empty.
+ tail.store(new_tail, std::memory_order_release);
+ release_task_pool();
+ }
+};
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
+
+#endif // _TBB_arena_slot_H
diff --git a/contrib/libs/tbb/src/tbb/assert_impl.h b/contrib/libs/tbb/src/tbb/assert_impl.h
new file mode 100644
index 0000000000..7f411e06f7
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/assert_impl.h
@@ -0,0 +1,71 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_assert_impl_H
+#define __TBB_assert_impl_H
+
+#include "oneapi/tbb/detail/_config.h"
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <cstdarg>
+#if _MSC_VER && _DEBUG
+#include <crtdbg.h>
+#endif
+
+#include <mutex>
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+// TODO: consider extension for formatted error description string
+static void assertion_failure_impl(const char* filename, int line, const char* expression, const char* comment) {
+ std::fprintf(stderr, "Assertion %s failed on line %d of file %s\n", expression, line, filename);
+ if (comment) {
+ std::fprintf(stderr, "Detailed description: %s\n", comment);
+ }
+#if _MSC_VER && _DEBUG
+ if (1 == _CrtDbgReport(_CRT_ASSERT, filename, line, "tbb_debug.dll", "%s\r\n%s", expression, comment?comment:"")) {
+ _CrtDbgBreak();
+ }
+#else
+ std::fflush(stderr);
+ std::abort();
+#endif
+}
+
+void __TBB_EXPORTED_FUNC assertion_failure(const char* filename, int line, const char* expression, const char* comment) {
+ static std::once_flag flag;
+ std::call_once(flag, [&](){ assertion_failure_impl(filename, line, expression, comment); });
+}
+
+//! Report a runtime warning.
+void runtime_warning( const char* format, ... ) {
+ char str[1024]; std::memset(str, 0, 1024);
+ va_list args; va_start(args, format);
+ vsnprintf( str, 1024-1, format, args);
+ va_end(args);
+ fprintf(stderr, "TBB Warning: %s\n", str);
+}
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
+
+#endif // __TBB_assert_impl_H
+
diff --git a/contrib/libs/tbb/src/tbb/co_context.h b/contrib/libs/tbb/src/tbb/co_context.h
new file mode 100644
index 0000000000..552dec356b
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/co_context.h
@@ -0,0 +1,222 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef _TBB_co_context_H
+#define _TBB_co_context_H
+
+#include "oneapi/tbb/detail/_config.h"
+
+#if __TBB_RESUMABLE_TASKS
+
+#include <cstddef>
+#include <cstdint>
+
+#if _WIN32 || _WIN64
+#include <windows.h>
+#else
+// ucontext.h API is deprecated since macOS 10.6
+#if __APPLE__
+ #if __INTEL_COMPILER
+ #pragma warning(push)
+ #pragma warning(disable:1478)
+ #elif __clang__
+ #pragma clang diagnostic push
+ #pragma clang diagnostic ignored "-Wdeprecated-declarations"
+ #endif
+#endif // __APPLE__
+
+#include <ucontext.h>
+#include <sys/mman.h> // mprotect
+
+#include "governor.h" // default_page_size()
+
+#ifndef MAP_STACK
+// macOS* does not define MAP_STACK
+#define MAP_STACK 0
+#endif
+#ifndef MAP_ANONYMOUS
+// macOS* defines MAP_ANON, which is deprecated in Linux*.
+#define MAP_ANONYMOUS MAP_ANON
+#endif
+#endif // _WIN32 || _WIN64
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+#if _WIN32 || _WIN64
+ typedef LPVOID coroutine_type;
+#else
+ struct coroutine_type {
+ coroutine_type() : my_context(), my_stack(), my_stack_size() {}
+ ucontext_t my_context;
+ void* my_stack;
+ std::size_t my_stack_size;
+ };
+#endif
+
+ // Forward declaration of the coroutine API.
+ void create_coroutine(coroutine_type& c, std::size_t stack_size, void* arg);
+ void current_coroutine(coroutine_type& c);
+ void swap_coroutine(coroutine_type& prev_coroutine, coroutine_type& new_coroutine);
+ void destroy_coroutine(coroutine_type& c);
+
+class co_context {
+ enum co_state {
+ co_invalid,
+ co_suspended,
+ co_executing,
+ co_destroyed
+ };
+ coroutine_type my_coroutine;
+ co_state my_state;
+
+public:
+ co_context(std::size_t stack_size, void* arg)
+ : my_state(stack_size ? co_suspended : co_executing)
+ {
+ if (stack_size) {
+ __TBB_ASSERT(arg != 0, nullptr);
+ create_coroutine(my_coroutine, stack_size, arg);
+ } else {
+ current_coroutine(my_coroutine);
+ }
+ }
+
+ ~co_context() {
+ __TBB_ASSERT(1 << my_state & (1 << co_suspended | 1 << co_executing), NULL);
+ if (my_state == co_suspended)
+ destroy_coroutine(my_coroutine);
+ my_state = co_destroyed;
+ }
+
+ void resume(co_context& target) {
+ // Do not create non-trivial objects on the stack of this function. They might never be destroyed.
+ __TBB_ASSERT(my_state == co_executing, NULL);
+ __TBB_ASSERT(target.my_state == co_suspended, NULL);
+
+ my_state = co_suspended;
+ target.my_state = co_executing;
+
+ // 'target' can reference an invalid object after swap_coroutine. Do not access it.
+ swap_coroutine(my_coroutine, target.my_coroutine);
+
+ __TBB_ASSERT(my_state == co_executing, NULL);
+ }
+};
+
+#if _WIN32 || _WIN64
+/* [[noreturn]] */ void __stdcall co_local_wait_for_all(void* arg) noexcept;
+#else
+/* [[noreturn]] */ void co_local_wait_for_all(void* arg) noexcept;
+#endif
+
+#if _WIN32 || _WIN64
+inline void create_coroutine(coroutine_type& c, std::size_t stack_size, void* arg) {
+ __TBB_ASSERT(arg, NULL);
+ c = CreateFiber(stack_size, co_local_wait_for_all, arg);
+ __TBB_ASSERT(c, NULL);
+}
+
+inline void current_coroutine(coroutine_type& c) {
+ c = IsThreadAFiber() ? GetCurrentFiber() :
+ ConvertThreadToFiberEx(nullptr, FIBER_FLAG_FLOAT_SWITCH);
+ __TBB_ASSERT(c, NULL);
+}
+
+inline void swap_coroutine(coroutine_type& prev_coroutine, coroutine_type& new_coroutine) {
+ if (!IsThreadAFiber()) {
+ ConvertThreadToFiberEx(nullptr, FIBER_FLAG_FLOAT_SWITCH);
+ }
+ __TBB_ASSERT(new_coroutine, NULL);
+ prev_coroutine = GetCurrentFiber();
+ __TBB_ASSERT(prev_coroutine, NULL);
+ SwitchToFiber(new_coroutine);
+}
+
+inline void destroy_coroutine(coroutine_type& c) {
+ __TBB_ASSERT(c, NULL);
+ DeleteFiber(c);
+}
+#else // !(_WIN32 || _WIN64)
+
+inline void create_coroutine(coroutine_type& c, std::size_t stack_size, void* arg) {
+ const std::size_t REG_PAGE_SIZE = governor::default_page_size();
+ const std::size_t page_aligned_stack_size = (stack_size + (REG_PAGE_SIZE - 1)) & ~(REG_PAGE_SIZE - 1);
+ const std::size_t protected_stack_size = page_aligned_stack_size + 2 * REG_PAGE_SIZE;
+
+ // Allocate the stack with protection property
+ std::uintptr_t stack_ptr = (std::uintptr_t)mmap(NULL, protected_stack_size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
+ __TBB_ASSERT((void*)stack_ptr != MAP_FAILED, NULL);
+
+ // Allow read write on our stack (guarded pages are still protected)
+ int err = mprotect((void*)(stack_ptr + REG_PAGE_SIZE), page_aligned_stack_size, PROT_READ | PROT_WRITE);
+ __TBB_ASSERT_EX(!err, NULL);
+
+ // Remember the stack state
+ c.my_stack = (void*)(stack_ptr + REG_PAGE_SIZE);
+ c.my_stack_size = page_aligned_stack_size;
+
+ err = getcontext(&c.my_context);
+ __TBB_ASSERT_EX(!err, NULL);
+
+ c.my_context.uc_link = 0;
+ // cast to char* to disable FreeBSD clang-3.4.1 'incompatible type' error
+ c.my_context.uc_stack.ss_sp = (char*)c.my_stack;
+ c.my_context.uc_stack.ss_size = c.my_stack_size;
+ c.my_context.uc_stack.ss_flags = 0;
+
+ typedef void(*coroutine_func_t)();
+ makecontext(&c.my_context, (coroutine_func_t)co_local_wait_for_all, sizeof(arg) / sizeof(int), arg);
+}
+
+inline void current_coroutine(coroutine_type& c) {
+ int err = getcontext(&c.my_context);
+ __TBB_ASSERT_EX(!err, NULL);
+}
+
+inline void swap_coroutine(coroutine_type& prev_coroutine, coroutine_type& new_coroutine) {
+ int err = swapcontext(&prev_coroutine.my_context, &new_coroutine.my_context);
+ __TBB_ASSERT_EX(!err, NULL);
+}
+
+inline void destroy_coroutine(coroutine_type& c) {
+ const std::size_t REG_PAGE_SIZE = governor::default_page_size();
+ // Free stack memory with guarded pages
+ munmap((void*)((std::uintptr_t)c.my_stack - REG_PAGE_SIZE), c.my_stack_size + 2 * REG_PAGE_SIZE);
+ // Clear the stack state afterwards
+ c.my_stack = NULL;
+ c.my_stack_size = 0;
+}
+
+#if __APPLE__
+ #if __INTEL_COMPILER
+ #pragma warning(pop) // 1478 warning
+ #elif __clang__
+ #pragma clang diagnostic pop // "-Wdeprecated-declarations"
+ #endif
+#endif
+
+#endif // _WIN32 || _WIN64
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
+
+#endif /* __TBB_RESUMABLE_TASKS */
+
+#endif /* _TBB_co_context_H */
+
diff --git a/contrib/libs/tbb/src/tbb/concurrent_bounded_queue.cpp b/contrib/libs/tbb/src/tbb/concurrent_bounded_queue.cpp
new file mode 100644
index 0000000000..90077936f6
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/concurrent_bounded_queue.cpp
@@ -0,0 +1,84 @@
+/*
+ Copyright (c) 2020-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "oneapi/tbb/detail/_utils.h"
+#include "oneapi/tbb/concurrent_queue.h"
+#include "oneapi/tbb/cache_aligned_allocator.h"
+#include "concurrent_monitor.h"
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+static constexpr std::size_t monitors_number = 2;
+
+std::uint8_t* __TBB_EXPORTED_FUNC allocate_bounded_queue_rep( std::size_t queue_rep_size )
+{
+ std::size_t monitors_mem_size = sizeof(concurrent_monitor) * monitors_number;
+ std::uint8_t* mem = static_cast<std::uint8_t*>(cache_aligned_allocate(queue_rep_size + monitors_mem_size));
+
+ concurrent_monitor* monitors = reinterpret_cast<concurrent_monitor*>(mem + queue_rep_size);
+ for (std::size_t i = 0; i < monitors_number; ++i) {
+ new (monitors + i) concurrent_monitor();
+ }
+
+ return mem;
+}
+
+void __TBB_EXPORTED_FUNC deallocate_bounded_queue_rep( std::uint8_t* mem, std::size_t queue_rep_size )
+{
+ concurrent_monitor* monitors = reinterpret_cast<concurrent_monitor*>(mem + queue_rep_size);
+ for (std::size_t i = 0; i < monitors_number; ++i) {
+ monitors[i].~concurrent_monitor();
+ }
+
+ cache_aligned_deallocate(mem);
+}
+
+void __TBB_EXPORTED_FUNC wait_bounded_queue_monitor( concurrent_monitor* monitors, std::size_t monitor_tag,
+ std::ptrdiff_t target, d1::delegate_base& predicate )
+{
+ __TBB_ASSERT(monitor_tag < monitors_number, nullptr);
+ concurrent_monitor& monitor = monitors[monitor_tag];
+
+ monitor.wait<concurrent_monitor::thread_context>([&] { return !predicate(); }, std::uintptr_t(target));
+}
+
+void __TBB_EXPORTED_FUNC abort_bounded_queue_monitors( concurrent_monitor* monitors ) {
+ concurrent_monitor& items_avail = monitors[d1::cbq_items_avail_tag];
+ concurrent_monitor& slots_avail = monitors[d1::cbq_slots_avail_tag];
+
+ items_avail.abort_all();
+ slots_avail.abort_all();
+}
+
+struct predicate_leq {
+ std::size_t my_ticket;
+ predicate_leq( std::size_t ticket ) : my_ticket(ticket) {}
+ bool operator() ( std::uintptr_t ticket ) const { return static_cast<std::size_t>(ticket) <= my_ticket; }
+};
+
+void __TBB_EXPORTED_FUNC notify_bounded_queue_monitor( concurrent_monitor* monitors,
+ std::size_t monitor_tag, std::size_t ticket)
+{
+ __TBB_ASSERT(monitor_tag < monitors_number, nullptr);
+ concurrent_monitor& monitor = monitors[monitor_tag];
+ monitor.notify(predicate_leq(ticket));
+}
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
diff --git a/contrib/libs/tbb/src/tbb/concurrent_monitor.h b/contrib/libs/tbb/src/tbb/concurrent_monitor.h
new file mode 100644
index 0000000000..cb1885a5d0
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/concurrent_monitor.h
@@ -0,0 +1,529 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_concurrent_monitor_H
+#define __TBB_concurrent_monitor_H
+
+#include "oneapi/tbb/spin_mutex.h"
+#include "oneapi/tbb/detail/_exception.h"
+#include "oneapi/tbb/detail/_aligned_space.h"
+#include "oneapi/tbb/detail/_template_helpers.h"
+#include "scheduler_common.h"
+
+#include "semaphore.h"
+
+#include <atomic>
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+//! Circular doubly-linked list with sentinel
+/** head.next points to the front and head.prev points to the back */
+class circular_doubly_linked_list_with_sentinel : no_copy {
+public:
+ struct base_node {
+ base_node* next;
+ base_node* prev;
+ explicit base_node() : next((base_node*)(uintptr_t)0xcdcdcdcd), prev((base_node*)(uintptr_t)0xcdcdcdcd) {}
+ };
+
+ // ctor
+ circular_doubly_linked_list_with_sentinel() { clear(); }
+ // dtor
+ ~circular_doubly_linked_list_with_sentinel() {
+ __TBB_ASSERT(head.next == &head && head.prev == &head, "the list is not empty");
+ }
+
+ inline std::size_t size() const { return count.load(std::memory_order_relaxed); }
+ inline bool empty() const { return size() == 0; }
+ inline base_node* front() const { return head.next; }
+ inline base_node* last() const { return head.prev; }
+ inline const base_node* end() const { return &head; }
+
+ //! add to the back of the list
+ inline void add( base_node* n ) {
+ count.store(count.load(std::memory_order_relaxed) + 1, std::memory_order_relaxed);
+ n->prev = head.prev;
+ n->next = &head;
+ head.prev->next = n;
+ head.prev = n;
+ }
+
+ //! remove node 'n'
+ inline void remove( base_node& n ) {
+ __TBB_ASSERT(count.load(std::memory_order_relaxed) > 0, "attempt to remove an item from an empty list");
+ count.store(count.load( std::memory_order_relaxed ) - 1, std::memory_order_relaxed);
+ n.prev->next = n.next;
+ n.next->prev = n.prev;
+ }
+
+ //! move all elements to 'lst' and initialize the 'this' list
+ inline void flush_to( circular_doubly_linked_list_with_sentinel& lst ) {
+ const std::size_t l_count = size();
+ if (l_count > 0) {
+ lst.count.store(l_count, std::memory_order_relaxed);
+ lst.head.next = head.next;
+ lst.head.prev = head.prev;
+ head.next->prev = &lst.head;
+ head.prev->next = &lst.head;
+ clear();
+ }
+ }
+
+ void clear() {
+ head.next = &head;
+ head.prev = &head;
+ count.store(0, std::memory_order_relaxed);
+ }
+private:
+ std::atomic<std::size_t> count;
+ base_node head;
+};
+
+using base_list = circular_doubly_linked_list_with_sentinel;
+using base_node = circular_doubly_linked_list_with_sentinel::base_node;
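+
+// A tiny usage sketch of the sentinel list above (illustrative only): nodes are linked
+// by address, so they must stay alive while they are on the list, and the list must be
+// emptied before it is destroyed (the destructor asserts that).
+#if 0
+inline void sentinel_list_usage_sketch() {
+    base_list list;
+    base_node a, b;
+    list.add(&a);
+    list.add(&b);
+    __TBB_ASSERT(list.front() == &a && list.last() == &b, nullptr);
+    list.remove(a);
+    __TBB_ASSERT(list.size() == 1, nullptr);
+    list.remove(b);
+    __TBB_ASSERT(list.empty(), nullptr);
+}
+#endif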
+
+template <typename Context>
+class concurrent_monitor_base;
+
+template <typename Context>
+class wait_node : public base_node {
+public:
+
+#if __TBB_GLIBCXX_VERSION >= 40800 && __TBB_GLIBCXX_VERSION < 40900
+ wait_node(Context ctx) : my_context(ctx), my_is_in_list(false) {}
+#else
+ wait_node(Context ctx) : my_context(ctx) {}
+#endif
+
+ virtual ~wait_node() = default;
+
+ virtual void init() {
+ __TBB_ASSERT(!my_initialized, nullptr);
+ my_initialized = true;
+ }
+
+ virtual void wait() = 0;
+
+ virtual void reset() {
+ __TBB_ASSERT(my_skipped_wakeup, nullptr);
+ my_skipped_wakeup = false;
+ }
+
+ virtual void notify() = 0;
+
+protected:
+ friend class concurrent_monitor_base<Context>;
+ friend class thread_data;
+
+ Context my_context{};
+#if __TBB_GLIBCXX_VERSION >= 40800 && __TBB_GLIBCXX_VERSION < 40900
+ std::atomic<bool> my_is_in_list;
+#else
+ std::atomic<bool> my_is_in_list{false};
+#endif
+
+ bool my_initialized{false};
+ bool my_skipped_wakeup{false};
+ bool my_aborted{false};
+ unsigned my_epoch{0};
+};
+
+template <typename Context>
+class sleep_node : public wait_node<Context> {
+ using base_type = wait_node<Context>;
+public:
+ using base_type::base_type;
+
+ // Make it virtual due to Intel Compiler warning
+ virtual ~sleep_node() {
+ if (this->my_initialized) {
+ if (this->my_skipped_wakeup) semaphore().P();
+ semaphore().~binary_semaphore();
+ }
+ }
+
+ binary_semaphore& semaphore() { return *sema.begin(); }
+
+ void init() override {
+ if (!this->my_initialized) {
+ new (sema.begin()) binary_semaphore;
+ base_type::init();
+ }
+ }
+
+ void wait() override {
+ __TBB_ASSERT(this->my_initialized,
+ "Use of commit_wait() without prior prepare_wait()");
+ semaphore().P();
+ __TBB_ASSERT(!this->my_is_in_list.load(std::memory_order_relaxed), "Still in the queue?");
+ if (this->my_aborted)
+ throw_exception(exception_id::user_abort);
+ }
+
+ void reset() override {
+ base_type::reset();
+ semaphore().P();
+ }
+
+ void notify() override {
+ semaphore().V();
+ }
+
+private:
+ tbb::detail::aligned_space<binary_semaphore> sema;
+};
+
+//! concurrent_monitor
+/** fine-grained concurrent_monitor implementation */
+template <typename Context>
+class concurrent_monitor_base : no_copy {
+public:
+ //! ctor
+ concurrent_monitor_base() : my_epoch{}
+ {}
+
+ //! dtor
+ ~concurrent_monitor_base() {
+ abort_all();
+ __TBB_ASSERT(my_waitset.empty(), "waitset not empty?");
+ }
+
+    //! prepare wait by inserting 'node' into the wait queue
+ void prepare_wait( wait_node<Context>& node) {
+        // TODO: consider even lazier instantiation of the semaphore, i.e. only when it is actually needed, e.g. move it into node::wait()
+ if (!node.my_initialized) {
+ node.init();
+ }
+        // this is a good place to pump a previously skipped wakeup
+ else if (node.my_skipped_wakeup) {
+ node.reset();
+ }
+
+ node.my_is_in_list.store(true, std::memory_order_relaxed);
+
+ {
+ tbb::spin_mutex::scoped_lock l(my_mutex);
+ node.my_epoch = my_epoch.load(std::memory_order_relaxed);
+ my_waitset.add(&node);
+ }
+
+        // prepare_wait() must guarantee a write-read memory barrier;
+        // in C++ only a full fence provides this kind of barrier.
+ atomic_fence(std::memory_order_seq_cst);
+ }
+
+ //! Commit wait if event count has not changed; otherwise, cancel wait.
+ /** Returns true if committed, false if canceled. */
+ inline bool commit_wait( wait_node<Context>& node ) {
+ const bool do_it = node.my_epoch == my_epoch.load(std::memory_order_relaxed);
+ // this check is just an optimization
+ if (do_it) {
+ node.wait();
+ } else {
+ cancel_wait( node );
+ }
+ return do_it;
+ }
+
+ //! Cancel the wait. Removes the thread from the wait queue if not removed yet.
+ void cancel_wait( wait_node<Context>& node ) {
+ // possible skipped wakeup will be pumped in the following prepare_wait()
+ node.my_skipped_wakeup = true;
+ // try to remove node from waitset
+ // Cancel wait guarantees acquire memory barrier.
+ bool in_list = node.my_is_in_list.load(std::memory_order_acquire);
+ if (in_list) {
+ tbb::spin_mutex::scoped_lock l(my_mutex);
+ if (node.my_is_in_list.load(std::memory_order_relaxed)) {
+ my_waitset.remove(node);
+ // node is removed from waitset, so there will be no wakeup
+ node.my_is_in_list.store(false, std::memory_order_relaxed);
+ node.my_skipped_wakeup = false;
+ }
+ }
+ }
+
+    //! Wait until the predicate is satisfied, using the given wait node and its waiting-on context
+ template <typename NodeType, typename Pred>
+ bool wait(Pred&& pred, NodeType&& node) {
+ prepare_wait(node);
+ while (!guarded_call(std::forward<Pred>(pred), node)) {
+ if (commit_wait(node)) {
+ return true;
+ }
+
+ prepare_wait(node);
+ }
+
+ cancel_wait(node);
+ return false;
+ }
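+
+    // The prepare/commit/cancel protocol above lets a thread publish itself as a waiter,
+    // re-check the condition, and then either sleep or cancel without missing a notification
+    // that arrives in between. The same epoch idea, re-expressed with a std::condition_variable
+    // for illustration only (tiny_monitor is not a TBB class; assumes <mutex> and
+    // <condition_variable>):
+#if 0
+    struct tiny_monitor {
+        std::mutex m;
+        std::condition_variable cv;
+        unsigned epoch = 0;
+
+        template <typename Pred>
+        void wait(Pred pred) {
+            std::unique_lock<std::mutex> lock(m);
+            unsigned observed = epoch;               // analogue of prepare_wait()
+            while (!pred()) {
+                if (observed == epoch)               // commit only if nothing happened
+                    cv.wait(lock);                   //   since the epoch was sampled
+                observed = epoch;                    // analogue of re-preparing the wait
+            }
+        }
+
+        void notify_all() {
+            { std::lock_guard<std::mutex> lock(m); ++epoch; }
+            cv.notify_all();
+        }
+    };
+#endif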
+
+ //! Notify one thread about the event
+ void notify_one() {
+ atomic_fence(std::memory_order_seq_cst);
+ notify_one_relaxed();
+ }
+
+ //! Notify one thread about the event. Relaxed version.
+ void notify_one_relaxed() {
+ if (my_waitset.empty()) {
+ return;
+ }
+
+ base_node* n;
+ const base_node* end = my_waitset.end();
+ {
+ tbb::spin_mutex::scoped_lock l(my_mutex);
+ my_epoch.store(my_epoch.load(std::memory_order_relaxed) + 1, std::memory_order_relaxed);
+ n = my_waitset.front();
+ if (n != end) {
+ my_waitset.remove(*n);
+ to_wait_node(n)->my_is_in_list.store(false, std::memory_order_relaxed);
+ }
+ }
+
+ if (n != end) {
+ to_wait_node(n)->notify();
+ }
+ }
+
+ //! Notify all waiting threads of the event
+ void notify_all() {
+ atomic_fence(std::memory_order_seq_cst);
+ notify_all_relaxed();
+ }
+
+ //! Notify all waiting threads of the event; relaxed version
+ void notify_all_relaxed() {
+ if (my_waitset.empty()) {
+ return;
+ }
+
+ base_list temp;
+ const base_node* end;
+ {
+ tbb::spin_mutex::scoped_lock l(my_mutex);
+ my_epoch.store(my_epoch.load(std::memory_order_relaxed) + 1, std::memory_order_relaxed);
+ // TODO: possible optimization: do not change the node state under the lock, just do the flush
+ my_waitset.flush_to(temp);
+ end = temp.end();
+ for (base_node* n = temp.front(); n != end; n = n->next) {
+ to_wait_node(n)->my_is_in_list.store(false, std::memory_order_relaxed);
+ }
+ }
+
+ base_node* nxt;
+ for (base_node* n = temp.front(); n != end; n=nxt) {
+ nxt = n->next;
+ to_wait_node(n)->notify();
+ }
+#if TBB_USE_ASSERT
+ temp.clear();
+#endif
+ }
+
+ //! Notify waiting threads of the event that satisfies the given predicate
+ template <typename P>
+ void notify( const P& predicate ) {
+ atomic_fence(std::memory_order_seq_cst);
+ notify_relaxed( predicate );
+ }
+
+ //! Notify waiting threads of the event that satisfies the given predicate;
+ //! the predicate is called under the lock. Relaxed version.
+ template<typename P>
+ void notify_relaxed( const P& predicate ) {
+ if (my_waitset.empty()) {
+ return;
+ }
+
+ base_list temp;
+ base_node* nxt;
+ const base_node* end = my_waitset.end();
+ {
+ tbb::spin_mutex::scoped_lock l(my_mutex);
+ my_epoch.store(my_epoch.load( std::memory_order_relaxed ) + 1, std::memory_order_relaxed);
+ for (base_node* n = my_waitset.last(); n != end; n = nxt) {
+ nxt = n->prev;
+ auto* node = static_cast<wait_node<Context>*>(n);
+ if (predicate(node->my_context)) {
+ my_waitset.remove(*n);
+ node->my_is_in_list.store(false, std::memory_order_relaxed);
+ temp.add(n);
+ }
+ }
+ }
+
+ end = temp.end();
+ for (base_node* n=temp.front(); n != end; n = nxt) {
+ nxt = n->next;
+ to_wait_node(n)->notify();
+ }
+#if TBB_USE_ASSERT
+ temp.clear();
+#endif
+ }
+
+ //! Abort any sleeping threads at the time of the call
+ void abort_all() {
+ atomic_fence( std::memory_order_seq_cst );
+ abort_all_relaxed();
+ }
+
+ //! Abort any sleeping threads at the time of the call; Relaxed version
+ void abort_all_relaxed() {
+ if (my_waitset.empty()) {
+ return;
+ }
+
+ base_list temp;
+ const base_node* end;
+ {
+ tbb::spin_mutex::scoped_lock l(my_mutex);
+ my_epoch.store(my_epoch.load(std::memory_order_relaxed) + 1, std::memory_order_relaxed);
+ my_waitset.flush_to(temp);
+ end = temp.end();
+ for (base_node* n = temp.front(); n != end; n = n->next) {
+ to_wait_node(n)->my_is_in_list.store(false, std::memory_order_relaxed);
+ }
+ }
+
+ base_node* nxt;
+ for (base_node* n = temp.front(); n != end; n = nxt) {
+ nxt = n->next;
+ to_wait_node(n)->my_aborted = true;
+ to_wait_node(n)->notify();
+ }
+#if TBB_USE_ASSERT
+ temp.clear();
+#endif
+ }
+
+private:
+ template <typename NodeType, typename Pred>
+ bool guarded_call(Pred&& predicate, NodeType& node) {
+ bool res = false;
+ tbb::detail::d0::try_call( [&] {
+ res = std::forward<Pred>(predicate)();
+ }).on_exception( [&] {
+ cancel_wait(node);
+ });
+
+ return res;
+ }
+
+ tbb::spin_mutex my_mutex;
+ base_list my_waitset;
+ std::atomic<unsigned> my_epoch;
+
+ wait_node<Context>* to_wait_node( base_node* node ) { return static_cast<wait_node<Context>*>(node); }
+};
+
+class concurrent_monitor : public concurrent_monitor_base<std::uintptr_t> {
+ using base_type = concurrent_monitor_base<std::uintptr_t>;
+public:
+ using base_type::base_type;
+ /** per-thread descriptor for concurrent_monitor */
+ using thread_context = sleep_node<std::uintptr_t>;
+};
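+
+// Illustrative sketch (not part of the library): the waiting protocol a
+// concurrent_monitor client is expected to follow. The monitor object and the
+// data_is_ready() predicate are hypothetical placeholders; the loop mirrors the
+// wait(pred, node) helper defined above.
+//
+//     concurrent_monitor mon;
+//     concurrent_monitor::thread_context node{/*unique context*/ 0};
+//     mon.prepare_wait(node);
+//     while (!data_is_ready()) {
+//         if (mon.commit_wait(node)) return;  // slept and was notified
+//         mon.prepare_wait(node);             // epoch changed; re-arm and re-check
+//     }
+//     mon.cancel_wait(node);                  // predicate already true; leave the waitset
+//
+// A producer publishes its data first and then calls mon.notify_one() or
+// mon.notify_all().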
+
+struct extended_context {
+ extended_context() = default;
+
+ extended_context(std::uintptr_t first_addr, arena* a) :
+ my_uniq_addr(first_addr), my_arena_addr(a)
+ {}
+
+ std::uintptr_t my_uniq_addr{0};
+ arena* my_arena_addr{nullptr};
+};
+
+
+#if __TBB_RESUMABLE_TASKS
+class resume_node : public wait_node<extended_context> {
+ using base_type = wait_node<extended_context>;
+public:
+ resume_node(extended_context ctx, execution_data_ext& ed_ext, task_dispatcher& target)
+ : base_type(ctx), my_curr_dispatcher(ed_ext.task_disp), my_target_dispatcher(&target)
+ , my_suspend_point(my_curr_dispatcher->get_suspend_point())
+ {}
+
+ virtual ~resume_node() {
+ if (this->my_skipped_wakeup) {
+ spin_wait_until_eq(this->my_notify_calls, 1);
+ }
+
+ poison_pointer(my_curr_dispatcher);
+ poison_pointer(my_target_dispatcher);
+ poison_pointer(my_suspend_point);
+ }
+
+ void init() override {
+ base_type::init();
+ }
+
+ void wait() override {
+ my_curr_dispatcher->resume(*my_target_dispatcher);
+ __TBB_ASSERT(!this->my_is_in_list.load(std::memory_order_relaxed), "Still in the queue?");
+ }
+
+ void reset() override {
+ base_type::reset();
+ spin_wait_until_eq(this->my_notify_calls, 1);
+ my_notify_calls.store(0, std::memory_order_relaxed);
+ }
+
+ // notify() is called (perhaps concurrently) twice, from:
+ // - concurrent_monitor::notify
+ // - post_resume_action::register_waiter
+ // The second notify() comes after the thread has switched stacks
+ // (because resume cannot be called while the stack is still occupied).
+ // resume() must be called only once both notifications have been performed.
+ void notify() override {
+ if (++my_notify_calls == 2) {
+ r1::resume(my_suspend_point);
+ }
+ }
+
+private:
+ friend class thread_data;
+ friend struct suspend_point_type::resume_task;
+ task_dispatcher* my_curr_dispatcher;
+ task_dispatcher* my_target_dispatcher;
+ suspend_point_type* my_suspend_point;
+ std::atomic<int> my_notify_calls{0};
+};
+#endif // __TBB_RESUMABLE_TASKS
+
+class extended_concurrent_monitor : public concurrent_monitor_base<extended_context> {
+ using base_type = concurrent_monitor_base<extended_context>;
+public:
+ using base_type::base_type;
+ /** per-thread descriptor for concurrent_monitor */
+ using thread_context = sleep_node<extended_context>;
+#if __TBB_RESUMABLE_TASKS
+ using resume_context = resume_node;
+#endif
+};
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
+
+#endif /* __TBB_concurrent_monitor_H */
diff --git a/contrib/libs/tbb/src/tbb/def/lin64-tbb.def b/contrib/libs/tbb/src/tbb/def/lin64-tbb.def
new file mode 100644
index 0000000000..09e7753ad4
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/def/lin64-tbb.def
@@ -0,0 +1,153 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+{
+global:
+
+/* Assertions (assert.cpp) */
+_ZN3tbb6detail2r117assertion_failureEPKciS3_S3_;
+
+/* ITT (profiling.cpp) */
+_ZN3tbb6detail2r112itt_task_endENS0_2d115itt_domain_enumE;
+_ZN3tbb6detail2r114itt_region_endENS0_2d115itt_domain_enumEPvy;
+_ZN3tbb6detail2r114itt_task_beginENS0_2d115itt_domain_enumEPvyS4_yNS0_2d021string_resource_indexE;
+_ZN3tbb6detail2r115call_itt_notifyEiPv;
+_ZN3tbb6detail2r115create_itt_syncEPvPKcS4_;
+_ZN3tbb6detail2r116itt_region_beginENS0_2d115itt_domain_enumEPvyS4_yNS0_2d021string_resource_indexE;
+_ZN3tbb6detail2r116itt_relation_addENS0_2d115itt_domain_enumEPvyNS0_2d012itt_relationES4_y;
+_ZN3tbb6detail2r117itt_set_sync_nameEPvPKc;
+_ZN3tbb6detail2r119itt_make_task_groupENS0_2d115itt_domain_enumEPvyS4_yNS0_2d021string_resource_indexE;
+_ZN3tbb6detail2r120itt_metadata_str_addENS0_2d115itt_domain_enumEPvyNS0_2d021string_resource_indexEPKc;
+_ZN3tbb6detail2r120itt_metadata_ptr_addENS0_2d115itt_domain_enumEPvyNS0_2d021string_resource_indexES4_;
+
+/* Allocators (allocator.cpp) */
+_ZN3tbb6detail2r115allocate_memoryEm;
+_ZN3tbb6detail2r117deallocate_memoryEPv;
+_ZN3tbb6detail2r122cache_aligned_allocateEm;
+_ZN3tbb6detail2r124cache_aligned_deallocateEPv;
+_ZN3tbb6detail2r115cache_line_sizeEv;
+_ZN3tbb6detail2r117is_tbbmalloc_usedEv;
+
+/* Small object pool (small_object_pool.cpp) */
+_ZN3tbb6detail2r18allocateERPNS0_2d117small_object_poolEm;
+_ZN3tbb6detail2r18allocateERPNS0_2d117small_object_poolEmRKNS2_14execution_dataE;
+_ZN3tbb6detail2r110deallocateERNS0_2d117small_object_poolEPvm;
+_ZN3tbb6detail2r110deallocateERNS0_2d117small_object_poolEPvmRKNS2_14execution_dataE;
+
+/* Error handling (exception.cpp) */
+_ZN3tbb6detail2r115throw_exceptionENS0_2d012exception_idE;
+_ZTIN3tbb6detail2r114bad_last_allocE;
+_ZTVN3tbb6detail2r114bad_last_allocE;
+_ZTIN3tbb6detail2r112missing_waitE;
+_ZTVN3tbb6detail2r112missing_waitE;
+_ZTIN3tbb6detail2r110user_abortE;
+_ZTVN3tbb6detail2r110user_abortE;
+_ZTIN3tbb6detail2r111unsafe_waitE;
+_ZTVN3tbb6detail2r111unsafe_waitE;
+
+/* RTM Mutex (rtm_mutex.cpp) */
+_ZN3tbb6detail2r17acquireERNS0_2d19rtm_mutexERNS3_11scoped_lockEb;
+_ZN3tbb6detail2r17releaseERNS0_2d19rtm_mutex11scoped_lockE;
+_ZN3tbb6detail2r111try_acquireERNS0_2d19rtm_mutexERNS3_11scoped_lockE;
+
+/* RTM RW Mutex (rtm_rw_mutex.cpp) */
+_ZN3tbb6detail2r114acquire_readerERNS0_2d112rtm_rw_mutexERNS3_11scoped_lockEb;
+_ZN3tbb6detail2r114acquire_writerERNS0_2d112rtm_rw_mutexERNS3_11scoped_lockEb;
+_ZN3tbb6detail2r118try_acquire_readerERNS0_2d112rtm_rw_mutexERNS3_11scoped_lockE;
+_ZN3tbb6detail2r118try_acquire_writerERNS0_2d112rtm_rw_mutexERNS3_11scoped_lockE;
+_ZN3tbb6detail2r17releaseERNS0_2d112rtm_rw_mutex11scoped_lockE;
+_ZN3tbb6detail2r17upgradeERNS0_2d112rtm_rw_mutex11scoped_lockE;
+_ZN3tbb6detail2r19downgradeERNS0_2d112rtm_rw_mutex11scoped_lockE;
+
+/* Tasks and partitioners (task.cpp) */
+_ZN3tbb6detail2r17suspendEPFvPvPNS1_18suspend_point_typeEES2_;
+_ZN3tbb6detail2r16resumeEPNS1_18suspend_point_typeE;
+_ZN3tbb6detail2r121current_suspend_pointEv;
+_ZN3tbb6detail2r114notify_waitersEm;
+
+/* Task dispatcher (task_dispatcher.cpp) */
+_ZN3tbb6detail2r114execution_slotEPKNS0_2d114execution_dataE;
+_ZN3tbb6detail2r14waitERNS0_2d112wait_contextERNS2_18task_group_contextE;
+_ZN3tbb6detail2r15spawnERNS0_2d14taskERNS2_18task_group_contextE;
+_ZN3tbb6detail2r15spawnERNS0_2d14taskERNS2_18task_group_contextEt;
+_ZN3tbb6detail2r116execute_and_waitERNS0_2d14taskERNS2_18task_group_contextERNS2_12wait_contextES6_;
+_ZN3tbb6detail2r16submitERNS0_2d14taskERNS2_18task_group_contextEPNS1_5arenaEm;
+_ZN3tbb6detail2r115current_contextEv;
+
+/* Task group context (task_group_context.cpp) */
+_ZN3tbb6detail2r110initializeERNS0_2d118task_group_contextE;
+_ZN3tbb6detail2r122cancel_group_executionERNS0_2d118task_group_contextE;
+_ZN3tbb6detail2r128is_group_execution_cancelledERNS0_2d118task_group_contextE;
+_ZN3tbb6detail2r15resetERNS0_2d118task_group_contextE;
+_ZN3tbb6detail2r17destroyERNS0_2d118task_group_contextE;
+_ZN3tbb6detail2r119capture_fp_settingsERNS0_2d118task_group_contextE;
+
+/* Task arena (arena.cpp) */
+_ZN3tbb6detail2r115max_concurrencyEPKNS0_2d115task_arena_baseE;
+_ZN3tbb6detail2r110initializeERNS0_2d115task_arena_baseE;
+_ZN3tbb6detail2r16attachERNS0_2d115task_arena_baseE;
+_ZN3tbb6detail2r17executeERNS0_2d115task_arena_baseERNS2_13delegate_baseE;
+_ZN3tbb6detail2r19terminateERNS0_2d115task_arena_baseE;
+_ZN3tbb6detail2r120isolate_within_arenaERNS0_2d113delegate_baseEl;
+_ZN3tbb6detail2r17enqueueERNS0_2d14taskEPNS2_15task_arena_baseE;
+_ZN3tbb6detail2r14waitERNS0_2d115task_arena_baseE;
+
+/* System topology parsing and threads pinning (governor.cpp) */
+_ZN3tbb6detail2r115numa_node_countEv;
+_ZN3tbb6detail2r117fill_numa_indicesEPi;
+_ZN3tbb6detail2r115core_type_countEl;
+_ZN3tbb6detail2r122fill_core_type_indicesEPil;
+_ZN3tbb6detail2r131constraints_default_concurrencyERKNS0_2d111constraintsEl;
+_ZN3tbb6detail2r128constraints_threads_per_coreERKNS0_2d111constraintsEl;
+_ZN3tbb6detail2r124numa_default_concurrencyEi;
+
+/* Observer (observer_proxy.cpp) */
+_ZN3tbb6detail2r17observeERNS0_2d123task_scheduler_observerEb;
+
+/* Queuing RW Mutex (queuing_rw_mutex.cpp) */
+_ZN3tbb6detail2r111try_acquireERNS0_2d116queuing_rw_mutexERNS3_11scoped_lockEb;
+_ZN3tbb6detail2r117upgrade_to_writerERNS0_2d116queuing_rw_mutex11scoped_lockE;
+_ZN3tbb6detail2r119downgrade_to_readerERNS0_2d116queuing_rw_mutex11scoped_lockE;
+_ZN3tbb6detail2r17acquireERNS0_2d116queuing_rw_mutexERNS3_11scoped_lockEb;
+_ZN3tbb6detail2r17releaseERNS0_2d116queuing_rw_mutex11scoped_lockE;
+_ZN3tbb6detail2r19constructERNS0_2d116queuing_rw_mutexE;
+
+/* Global control (global_control.cpp) */
+_ZN3tbb6detail2r16createERNS0_2d114global_controlE;
+_ZN3tbb6detail2r17destroyERNS0_2d114global_controlE;
+_ZN3tbb6detail2r127global_control_active_valueEi;
+_ZN3tbb6detail2r18finalizeERNS0_2d121task_scheduler_handleEl;
+_ZN3tbb6detail2r13getERNS0_2d121task_scheduler_handleE;
+
+/* Parallel pipeline (parallel_pipeline.cpp) */
+_ZN3tbb6detail2r117parallel_pipelineERNS0_2d118task_group_contextEmRKNS2_11filter_nodeE;
+_ZN3tbb6detail2r116set_end_of_inputERNS0_2d111base_filterE;
+
+/* Concurrent bounded queue (concurrent_bounded_queue.cpp) */
+_ZN3tbb6detail2r126allocate_bounded_queue_repEm;
+_ZN3tbb6detail2r126wait_bounded_queue_monitorEPNS1_18concurrent_monitorEmlRNS0_2d113delegate_baseE;
+_ZN3tbb6detail2r128abort_bounded_queue_monitorsEPNS1_18concurrent_monitorE;
+_ZN3tbb6detail2r128deallocate_bounded_queue_repEPhm;
+_ZN3tbb6detail2r128notify_bounded_queue_monitorEPNS1_18concurrent_monitorEmm;
+
+/* Versioning (version.cpp) */
+TBB_runtime_interface_version;
+TBB_runtime_version;
+
+local:
+/* TODO: fill more precisely */
+*;
+};
diff --git a/contrib/libs/tbb/src/tbb/dynamic_link.cpp b/contrib/libs/tbb/src/tbb/dynamic_link.cpp
new file mode 100644
index 0000000000..d5c5c7be7d
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/dynamic_link.cpp
@@ -0,0 +1,477 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "dynamic_link.h"
+
+#include "oneapi/tbb/detail/_template_helpers.h"
+#include "oneapi/tbb/detail/_utils.h"
+
+/*
+ This file is used by both TBB and OpenMP RTL. Do not use __TBB_ASSERT() macro
+ and runtime_warning() function because they are not available in OpenMP. Use
+ __TBB_ASSERT_EX and DYNAMIC_LINK_WARNING instead.
+*/
+
+#include <cstdarg> // va_list etc.
+#if _WIN32
+ #include <malloc.h>
+
+ // Unify system calls
+ #define dlopen( name, flags ) LoadLibrary( name )
+ #define dlsym( handle, name ) GetProcAddress( handle, name )
+ #define dlclose( handle ) ( ! FreeLibrary( handle ) )
+ #define dlerror() GetLastError()
+#ifndef PATH_MAX
+ #define PATH_MAX MAX_PATH
+#endif
+#else /* _WIN32 */
+ #include <dlfcn.h>
+ #include <unistd.h>
+
+ #include <cstring>
+ #include <climits>
+ #include <cstdlib>
+#endif /* _WIN32 */
+
+#if __TBB_WEAK_SYMBOLS_PRESENT && !__TBB_DYNAMIC_LOAD_ENABLED
+ //TODO: use function attribute for weak symbols instead of the pragma.
+ #pragma weak dlopen
+ #pragma weak dlsym
+ #pragma weak dlclose
+#endif /* __TBB_WEAK_SYMBOLS_PRESENT && !__TBB_DYNAMIC_LOAD_ENABLED */
+
+
+#define __USE_STATIC_DL_INIT ( !__ANDROID__ )
+
+
+/*
+dynamic_link is a common interface for searching for required symbols in an
+executable and dynamic libraries.
+
+dynamic_link provides certain guarantees:
+ 1. Either all or none of the requested symbols are resolved. Moreover, if
+ symbols are not resolved, the dynamic_link_descriptor table is not modified;
+ 2. All returned symbols have secured lifetime: this means that none of them
+ can be invalidated until dynamic_unlink is called;
+ 3. Any loaded library is loaded only via the full path. The full path is that
+ from which the runtime itself was loaded. (This is done to avoid security
+ issues caused by loading libraries from insecure paths).
+
+dynamic_link searches for the requested symbols in three stages, stopping as
+soon as all of the symbols have been resolved.
+
+ 1. Search the global scope:
+ a. On Windows: dynamic_link tries to obtain the handle of the requested
+ library and if it succeeds it resolves the symbols via that handle.
+ b. On Linux: dynamic_link tries to search for the symbols in the global
+ scope via the main program handle. If the symbols are present in the global
+ scope their lifetime is not guaranteed (since dynamic_link does not know
+ anything about the library from which they are exported). Therefore it
+ tries to "pin" the symbols by obtaining the library name and reopening it.
+ dlopen may fail to reopen the library in two cases:
+ i. The symbols are exported from the executable. Currently dynamic_link
+ cannot handle this situation, so it will not find these symbols in this
+ step.
+ ii. The necessary library has been unloaded and cannot be reloaded. It
+ seems there is nothing that can be done in this case. No symbols are
+ returned.
+
+ 2. Dynamic load: an attempt is made to load the requested library via the
+ full path.
+ The full path used is that from which the runtime itself was loaded. If the
+ library can be loaded, then an attempt is made to resolve the requested
+ symbols in the newly loaded library.
+ If the symbols are not found the library is unloaded.
+
+ 3. Weak symbols: if weak symbols are available they are returned.
+*/
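+
+/*
+Illustrative mapping (a restatement of the stages above, not additional behaviour):
+the stages correspond to the DYNAMIC_LINK_* flags from dynamic_link.h and are tried
+in order by dynamic_link() at the bottom of this file, stopping at the first stage
+that resolves every required descriptor.
+
+    stage 1: DYNAMIC_LINK_GLOBAL -> global_symbols_link()
+    stage 2: DYNAMIC_LINK_LOAD   -> dynamic_load() via the full path
+    stage 3: DYNAMIC_LINK_WEAK   -> weak_symbol_link()
+    DYNAMIC_LINK_ALL             -> all three stages enabled
+*/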
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+#if __TBB_WEAK_SYMBOLS_PRESENT || __TBB_DYNAMIC_LOAD_ENABLED
+
+#if !defined(DYNAMIC_LINK_WARNING) && !__TBB_WIN8UI_SUPPORT && __TBB_DYNAMIC_LOAD_ENABLED
+ // Report runtime errors and continue.
+ #define DYNAMIC_LINK_WARNING dynamic_link_warning
+ static void dynamic_link_warning( dynamic_link_error_t code, ... ) {
+ suppress_unused_warning(code);
+ } // dynamic_link_warning
+#endif /* !defined(DYNAMIC_LINK_WARNING) && !__TBB_WIN8UI_SUPPORT && __TBB_DYNAMIC_LOAD_ENABLED */
+
+ static bool resolve_symbols( dynamic_link_handle module, const dynamic_link_descriptor descriptors[], std::size_t required )
+ {
+ if ( !module )
+ return false;
+
+ #if !__TBB_DYNAMIC_LOAD_ENABLED /* only __TBB_WEAK_SYMBOLS_PRESENT is defined */
+ if ( !dlsym ) return false;
+ #endif /* !__TBB_DYNAMIC_LOAD_ENABLED */
+
+ const std::size_t n_desc=20; // Usually we don't have more than 20 descriptors per library
+ __TBB_ASSERT_EX( required <= n_desc, "Too many descriptors are required" );
+ if ( required > n_desc ) return false;
+ pointer_to_handler h[n_desc];
+
+ for ( std::size_t k = 0; k < required; ++k ) {
+ dynamic_link_descriptor const & desc = descriptors[k];
+ pointer_to_handler addr = (pointer_to_handler)dlsym( module, desc.name );
+ if ( !addr ) {
+ return false;
+ }
+ h[k] = addr;
+ }
+
+ // Commit the entry points.
+ // Cannot use memset here, because the writes must be atomic.
+ for( std::size_t k = 0; k < required; ++k )
+ *descriptors[k].handler = h[k];
+ return true;
+ }
+
+#if __TBB_WIN8UI_SUPPORT
+ bool dynamic_link( const char* library, const dynamic_link_descriptor descriptors[], std::size_t required, dynamic_link_handle*, int flags ) {
+ dynamic_link_handle tmp_handle = NULL;
+ TCHAR wlibrary[256];
+ if ( MultiByteToWideChar(CP_UTF8, 0, library, -1, wlibrary, 255) == 0 ) return false;
+ if ( flags & DYNAMIC_LINK_LOAD )
+ tmp_handle = LoadPackagedLibrary( wlibrary, 0 );
+ if (tmp_handle != NULL){
+ return resolve_symbols(tmp_handle, descriptors, required);
+ }else{
+ return false;
+ }
+ }
+ void dynamic_unlink( dynamic_link_handle ) {}
+ void dynamic_unlink_all() {}
+#else
+#if __TBB_DYNAMIC_LOAD_ENABLED
+/*
+ There is a security issue on Windows: LoadLibrary() may load and execute malicious code.
+ See http://www.microsoft.com/technet/security/advisory/2269637.mspx for details.
+ To avoid the issue, we have to pass the full path (not just the library name) to LoadLibrary.
+ This function constructs the full path to the specified library (it is assumed that the
+ library is located side-by-side with tbb.dll).
+
+ The function constructs an absolute path for a given relative path. Important: the base directory
+ is not the current one; it is the directory tbb.dll was loaded from.
+
+ Example:
+ Let us assume "tbb.dll" is located in "c:\program files\common\intel\" directory, e.g.
+ absolute path of the library is "c:\program files\common\intel\tbb.dll". Absolute path for
+ "tbbmalloc.dll" would be "c:\program files\common\intel\tbbmalloc.dll". Absolute path for
+ "malloc\tbbmalloc.dll" would be "c:\program files\common\intel\malloc\tbbmalloc.dll".
+*/
+
+ // Struct handle_storage is used by dynamic_link routine to store handles of
+ // all loaded or pinned dynamic libraries. When TBB is shut down, it calls
+ // dynamic_unlink_all() that unloads modules referenced by handle_storage.
+ // This struct should not have any constructors since it may be used before
+ // the constructor is called.
+ #define MAX_LOADED_MODULES 8 // The maximum number of modules that can be loaded
+
+ using atomic_incrementer = std::atomic<std::size_t>;
+
+ static struct handles_t {
+ atomic_incrementer my_size;
+ dynamic_link_handle my_handles[MAX_LOADED_MODULES];
+
+ void add(const dynamic_link_handle &handle) {
+ const std::size_t ind = my_size++;
+ __TBB_ASSERT_EX( ind < MAX_LOADED_MODULES, "Too many modules are loaded" );
+ my_handles[ind] = handle;
+ }
+
+ void free() {
+ const std::size_t size = my_size;
+ for (std::size_t i=0; i<size; ++i)
+ dynamic_unlink( my_handles[i] );
+ }
+ } handles;
+
+ static std::once_flag init_dl_data_state;
+
+ static struct ap_data_t {
+ char _path[PATH_MAX+1];
+ std::size_t _len;
+ } ap_data;
+
+ static void init_ap_data() {
+ #if _WIN32
+ // Get handle of our DLL first.
+ HMODULE handle;
+ BOOL brc = GetModuleHandleEx(
+ GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,
+ (LPCSTR)( & dynamic_link ), // any function inside the library can be used for the address
+ & handle
+ );
+ if ( !brc ) { // Error occurred.
+ int err = GetLastError();
+ DYNAMIC_LINK_WARNING( dl_sys_fail, "GetModuleHandleEx", err );
+ return;
+ }
+ // Now get path to our DLL.
+ DWORD drc = GetModuleFileName( handle, ap_data._path, static_cast< DWORD >( PATH_MAX ) );
+ if ( drc == 0 ) { // Error occurred.
+ int err = GetLastError();
+ DYNAMIC_LINK_WARNING( dl_sys_fail, "GetModuleFileName", err );
+ return;
+ }
+ if ( drc >= PATH_MAX ) { // Buffer too short.
+ DYNAMIC_LINK_WARNING( dl_buff_too_small );
+ return;
+ }
+ // Find the position of the last backslash.
+ char *backslash = std::strrchr( ap_data._path, '\\' );
+
+ if ( !backslash ) { // Backslash not found.
+ __TBB_ASSERT_EX( backslash!=NULL, "Unbelievable.");
+ return;
+ }
+ __TBB_ASSERT_EX( backslash >= ap_data._path, "Unbelievable.");
+ ap_data._len = (std::size_t)(backslash - ap_data._path) + 1;
+ *(backslash+1) = 0;
+ #else
+ // Get the library path
+ Dl_info dlinfo;
+ int res = dladdr( (void*)&dynamic_link, &dlinfo ); // any function inside the library can be used for the address
+ if ( !res ) {
+ char const * err = dlerror();
+ DYNAMIC_LINK_WARNING( dl_sys_fail, "dladdr", err );
+ return;
+ } else {
+ __TBB_ASSERT_EX( dlinfo.dli_fname!=NULL, "Unbelievable." );
+ }
+
+ char const *slash = std::strrchr( dlinfo.dli_fname, '/' );
+ std::size_t fname_len=0;
+ if ( slash ) {
+ __TBB_ASSERT_EX( slash >= dlinfo.dli_fname, "Unbelievable.");
+ fname_len = (std::size_t)(slash - dlinfo.dli_fname) + 1;
+ }
+
+ std::size_t rc;
+ if ( dlinfo.dli_fname[0]=='/' ) {
+ // The library path is absolute
+ rc = 0;
+ ap_data._len = 0;
+ } else {
+ // The library path is relative so get the current working directory
+ if ( !getcwd( ap_data._path, sizeof(ap_data._path)/sizeof(ap_data._path[0]) ) ) {
+ DYNAMIC_LINK_WARNING( dl_buff_too_small );
+ return;
+ }
+ ap_data._len = std::strlen( ap_data._path );
+ ap_data._path[ap_data._len++]='/';
+ rc = ap_data._len;
+ }
+
+ if ( fname_len>0 ) {
+ if ( ap_data._len>PATH_MAX ) {
+ DYNAMIC_LINK_WARNING( dl_buff_too_small );
+ ap_data._len=0;
+ return;
+ }
+ std::strncpy( ap_data._path+rc, dlinfo.dli_fname, fname_len );
+ ap_data._len += fname_len;
+ ap_data._path[ap_data._len]=0;
+ }
+ #endif /* _WIN32 */
+ }
+
+ static void init_dl_data() {
+ init_ap_data();
+ }
+
+ /*
+ The function constructs an absolute path for the given relative path. Important: the base directory
+ is not the current one; it is the directory libtbb.so was loaded from.
+
+ Arguments:
+ in name -- Name of a file (may be with relative path; it must not be an absolute one).
+ out path -- Buffer to save result (absolute path) to.
+ in len -- Size of buffer.
+ ret -- 0 -- Error occurred.
+ > len -- Buffer too short, required size returned.
+ otherwise -- Ok, number of characters (incl. terminating null) written to buffer.
+ */
+ static std::size_t abs_path( char const * name, char * path, std::size_t len ) {
+ if ( ap_data._len == 0 )
+ return 0;
+
+ std::size_t name_len = std::strlen( name );
+ std::size_t full_len = name_len+ap_data._len;
+ if ( full_len < len ) {
+ __TBB_ASSERT( ap_data._path[ap_data._len] == 0, NULL);
+ __TBB_ASSERT( std::strlen(ap_data._path) == ap_data._len, NULL);
+ std::strncpy( path, ap_data._path, ap_data._len + 1 );
+ __TBB_ASSERT( path[ap_data._len] == 0, NULL );
+ std::strncat( path, name, len - ap_data._len );
+ __TBB_ASSERT( std::strlen(path) == full_len, NULL );
+ }
+ return full_len+1; // +1 for null character
+ }
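+
+ // Illustrative sketch (not part of the library): the calling convention implied by the
+ // contract above; the library name is a hypothetical placeholder, and dynamic_load()
+ // below uses the same pattern.
+ //
+ //     char path[PATH_MAX + 1];
+ //     std::size_t rc = abs_path( "libdummy.so", path, sizeof(path) );
+ //     if ( rc == 0 )                // init_ap_data() failed earlier; a warning was already issued
+ //         ...
+ //     else if ( rc > sizeof(path) ) // buffer too short; rc is the required size
+ //         ...
+ //     else                          // 'path' holds the null-terminated absolute path
+ //         ...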
+#endif // __TBB_DYNAMIC_LOAD_ENABLED
+ void init_dynamic_link_data() {
+ #if __TBB_DYNAMIC_LOAD_ENABLED
+ std::call_once( init_dl_data_state, init_dl_data );
+ #endif
+ }
+
+ #if __USE_STATIC_DL_INIT
+ // The ap_data structure is initialized with the current directory on Linux,
+ // so it should be initialized as soon as possible since the current directory may change.
+ // The static_init_dl_data object below provides this initialization during library loading.
+ static struct static_init_dl_data_t {
+ static_init_dl_data_t() {
+ init_dynamic_link_data();
+ }
+ } static_init_dl_data;
+ #endif
+
+ #if __TBB_WEAK_SYMBOLS_PRESENT
+ static bool weak_symbol_link( const dynamic_link_descriptor descriptors[], std::size_t required )
+ {
+ // Check if the required entries are present in what was loaded into our process.
+ for ( std::size_t k = 0; k < required; ++k )
+ if ( !descriptors[k].ptr )
+ return false;
+ // Commit the entry points.
+ for ( std::size_t k = 0; k < required; ++k )
+ *descriptors[k].handler = (pointer_to_handler) descriptors[k].ptr;
+ return true;
+ }
+ #else
+ static bool weak_symbol_link( const dynamic_link_descriptor[], std::size_t ) {
+ return false;
+ }
+ #endif /* __TBB_WEAK_SYMBOLS_PRESENT */
+
+ void dynamic_unlink( dynamic_link_handle handle ) {
+ #if !__TBB_DYNAMIC_LOAD_ENABLED /* only __TBB_WEAK_SYMBOLS_PRESENT is defined */
+ if ( !dlclose ) return;
+ #endif
+ if ( handle ) {
+ dlclose( handle );
+ }
+ }
+
+ void dynamic_unlink_all() {
+ #if __TBB_DYNAMIC_LOAD_ENABLED
+ handles.free();
+ #endif
+ }
+
+ static dynamic_link_handle global_symbols_link( const char* library, const dynamic_link_descriptor descriptors[], std::size_t required ) {
+ dynamic_link_handle library_handle{};
+#if _WIN32
+ bool res = GetModuleHandleEx(0, library, &library_handle);
+ __TBB_ASSERT_EX(res && library_handle || !res && !library_handle, nullptr);
+#else /* _WIN32 */
+ #if !__TBB_DYNAMIC_LOAD_ENABLED /* only __TBB_WEAK_SYMBOLS_PRESENT is defined */
+ if ( !dlopen ) return 0;
+ #endif /* !__TBB_DYNAMIC_LOAD_ENABLED */
+ // RTLD_GLOBAL - to guarantee that old TBB will find the loaded library
+ // RTLD_NOLOAD - do not actually load the library; only obtain a handle if it is already
+ // loaded (this avoids loading a library by bare name, i.e. without the full path)
+ library_handle = dlopen(library, RTLD_LAZY | RTLD_GLOBAL | RTLD_NOLOAD);
+#endif /* _WIN32 */
+ if (library_handle) {
+ if (!resolve_symbols(library_handle, descriptors, required)) {
+ dynamic_unlink(library_handle);
+ library_handle = nullptr;
+ }
+ }
+ return library_handle;
+ }
+
+ static void save_library_handle( dynamic_link_handle src, dynamic_link_handle *dst ) {
+ __TBB_ASSERT_EX( src, "The library handle to store must be non-zero" );
+ if ( dst )
+ *dst = src;
+ #if __TBB_DYNAMIC_LOAD_ENABLED
+ else
+ handles.add( src );
+ #endif /* __TBB_DYNAMIC_LOAD_ENABLED */
+ }
+
+ dynamic_link_handle dynamic_load( const char* library, const dynamic_link_descriptor descriptors[], std::size_t required ) {
+ ::tbb::detail::suppress_unused_warning( library, descriptors, required );
+#if __TBB_DYNAMIC_LOAD_ENABLED
+
+ std::size_t const len = PATH_MAX + 1;
+ char path[ len ];
+ std::size_t rc = abs_path( library, path, len );
+ if ( 0 < rc && rc <= len ) {
+#if _WIN32
+ // Prevent Windows from displaying silly message boxes if it fails to load library
+ // (e.g. because of MS runtime problems - one of those crazy manifest related ones)
+ UINT prev_mode = SetErrorMode (SEM_FAILCRITICALERRORS);
+#endif /* _WIN32 */
+ dynamic_link_handle library_handle = dlopen( path, RTLD_NOW | RTLD_GLOBAL );
+#if _WIN32
+ SetErrorMode (prev_mode);
+#endif /* _WIN32 */
+ if( library_handle ) {
+ if( !resolve_symbols( library_handle, descriptors, required ) ) {
+ // The loaded library does not contain all the expected entry points
+ dynamic_unlink( library_handle );
+ library_handle = NULL;
+ }
+ } else
+ DYNAMIC_LINK_WARNING( dl_lib_not_found, path, dlerror() );
+ return library_handle;
+ } else if ( rc>len )
+ DYNAMIC_LINK_WARNING( dl_buff_too_small );
+ // rc == 0 means that init_ap_data() failed, so the warning has already been issued.
+
+#endif /* __TBB_DYNAMIC_LOAD_ENABLED */
+ return 0;
+ }
+
+ bool dynamic_link( const char* library, const dynamic_link_descriptor descriptors[], std::size_t required, dynamic_link_handle *handle, int flags ) {
+ init_dynamic_link_data();
+
+ // TODO: May global_symbols_link find weak symbols?
+ dynamic_link_handle library_handle = ( flags & DYNAMIC_LINK_GLOBAL ) ? global_symbols_link( library, descriptors, required ) : 0;
+
+ if ( !library_handle && ( flags & DYNAMIC_LINK_LOAD ) )
+ library_handle = dynamic_load( library, descriptors, required );
+
+ if ( !library_handle && ( flags & DYNAMIC_LINK_WEAK ) )
+ return weak_symbol_link( descriptors, required );
+
+ if ( library_handle ) {
+ save_library_handle( library_handle, handle );
+ return true;
+ }
+ return false;
+ }
+
+#endif /*__TBB_WIN8UI_SUPPORT*/
+#else /* __TBB_WEAK_SYMBOLS_PRESENT || __TBB_DYNAMIC_LOAD_ENABLED */
+ bool dynamic_link( const char*, const dynamic_link_descriptor*, std::size_t, dynamic_link_handle *handle, int ) {
+ if ( handle )
+ *handle=0;
+ return false;
+ }
+ void dynamic_unlink( dynamic_link_handle ) {}
+ void dynamic_unlink_all() {}
+#endif /* __TBB_WEAK_SYMBOLS_PRESENT || __TBB_DYNAMIC_LOAD_ENABLED */
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
diff --git a/contrib/libs/tbb/src/tbb/dynamic_link.h b/contrib/libs/tbb/src/tbb/dynamic_link.h
new file mode 100644
index 0000000000..91adcc507c
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/dynamic_link.h
@@ -0,0 +1,115 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_dynamic_link
+#define __TBB_dynamic_link
+
+// Support for dynamic loading entry points from other shared libraries.
+
+#include "oneapi/tbb/detail/_config.h"
+
+#include <atomic>
+#include <mutex>
+
+/** Symbols declared and defined here go into namespace tbb::detail::r1. **/
+
+#include <cstddef>
+#if _WIN32
+#include <Windows.h>
+#endif /* _WIN32 */
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+//! Type definition for a pointer to a function of type void somefunc(void)
+typedef void (*pointer_to_handler)();
+
+//! Helper macros to construct a dynamic_link_descriptor structure.
+// The double cast through void* in the DLD macro is necessary to
+// prevent warnings from some compilers (e.g. g++ 4.1).
+#if __TBB_WEAK_SYMBOLS_PRESENT
+#define DLD(s,h) {#s, (pointer_to_handler*)(void*)(&h), (pointer_to_handler)&s}
+#define DLD_NOWEAK(s,h) {#s, (pointer_to_handler*)(void*)(&h), NULL}
+#else
+#define DLD(s,h) {#s, (pointer_to_handler*)(void*)(&h)}
+#define DLD_NOWEAK(s,h) DLD(s,h)
+#endif /* __TBB_WEAK_SYMBOLS_PRESENT */
+//! Association between a handler name and location of pointer to it.
+struct dynamic_link_descriptor {
+ //! Name of the handler
+ const char* name;
+ //! Pointer to the handler
+ pointer_to_handler* handler;
+#if __TBB_WEAK_SYMBOLS_PRESENT
+ //! Weak symbol
+ pointer_to_handler ptr;
+#endif
+};
+
+#if _WIN32
+using dynamic_link_handle = HMODULE;
+#else
+using dynamic_link_handle = void*;
+#endif /* _WIN32 */
+
+const int DYNAMIC_LINK_GLOBAL = 0x01;
+const int DYNAMIC_LINK_LOAD = 0x02;
+const int DYNAMIC_LINK_WEAK = 0x04;
+const int DYNAMIC_LINK_ALL = DYNAMIC_LINK_GLOBAL | DYNAMIC_LINK_LOAD | DYNAMIC_LINK_WEAK;
+
+//! Fill in dynamically linked handlers.
+/** 'library' is the name of the requested library. It should not contain a full
+ path since dynamic_link adds the full path (from which the runtime itself
+ was loaded) to the library name.
+ 'required' is the number of the initial entries in the array descriptors[]
+ that have to be found in order for the call to succeed. If the library and
+ all the required handlers are found, then the corresponding handler
+ pointers are set, and the return value is true. Otherwise the original
+ array of descriptors is left untouched and the return value is false.
+ 'required' is limited to 20 (exceeding this value results in a failure to
+ load the symbols, and the return value will be false).
+ 'handle' is the handle of the library if it is loaded. Otherwise it is left
+ untouched.
+ 'flags' is the set of DYNAMIC_LINK_* flags. Each of the DYNAMIC_LINK_* flags
+ allows its corresponding linking stage.
+**/
+bool dynamic_link( const char* library,
+ const dynamic_link_descriptor descriptors[],
+ std::size_t required,
+ dynamic_link_handle* handle = 0,
+ int flags = DYNAMIC_LINK_ALL );
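+
+//! Example of intended usage (an illustrative sketch, not part of the library).
+/** The symbol name 'foo_v1', the handler variable, and the library name are
+    hypothetical placeholders; real descriptor tables are built the same way
+    with the DLD macro.
+
+        static void (*foo_handler)() = nullptr;
+        static const dynamic_link_descriptor FooLinkTable[] = {
+            DLD(foo_v1, foo_handler)
+        };
+
+        // Resolve 1 required entry point from libfoo: on success foo_handler is
+        // set and may be called; on failure the table is left untouched.
+        bool ok = dynamic_link("libfoo.so.1", FooLinkTable, 1);
+**/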
+
+void dynamic_unlink( dynamic_link_handle handle );
+
+void dynamic_unlink_all();
+
+enum dynamic_link_error_t {
+ dl_success = 0,
+ dl_lib_not_found, // char const * lib, dlerr_t err
+ dl_sym_not_found, // char const * sym, dlerr_t err
+ // Note: dlerr_t depends on OS: it is char const * on Linux* and macOS*, int on Windows*.
+ dl_sys_fail, // char const * func, int err
+ dl_buff_too_small // none
+}; // dynamic_link_error_t
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
+
+#endif /* __TBB_dynamic_link */
diff --git a/contrib/libs/tbb/src/tbb/environment.h b/contrib/libs/tbb/src/tbb/environment.h
new file mode 100644
index 0000000000..8886ef09e1
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/environment.h
@@ -0,0 +1,81 @@
+/*
+ Copyright (c) 2018-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_tbb_environment_H
+#define __TBB_tbb_environment_H
+
+#include <cstdlib>
+#include <cstring>
+#include <cerrno>
+#include <cctype>
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+#if __TBB_WIN8UI_SUPPORT
+static inline bool GetBoolEnvironmentVariable( const char * ) {
+ return false;
+}
+
+static inline long GetIntegralEnvironmentVariable( const char * ) {
+ return -1;
+}
+#else /* __TBB_WIN8UI_SUPPORT */
+static inline bool GetBoolEnvironmentVariable( const char * name ) {
+ if ( const char* s = std::getenv(name) ) {
+ // The result is defined as true only if the environment variable contains
+ // no characters except one '1' character and an arbitrary number of spaces
+ // (including the absence of spaces).
+ size_t index = std::strspn(s, " ");
+ if (s[index] != '1') return false;
+ index++;
+ // Accessing the memory after incrementing the index is safe because getenv()
+ // returns a null-terminated string: even if the character at 'index' is '1'
+ // and it is the last character of the string, the incremented index points
+ // at the terminating '\0'.
+ index += std::strspn(&s[index], " ");
+ return !s[index];
+ }
+ return false;
+}
+
+static inline long GetIntegralEnvironmentVariable( const char * name ) {
+ if ( const char* s = std::getenv(name) ) {
+ char* end = NULL;
+ errno = 0;
+ long value = std::strtol(s, &end, 10);
+
+ // The value is out of range or negative, or the string is not convertible
+ if ( errno == ERANGE || value < 0 || end==s ) {
+ return -1;
+ }
+ for ( ; *end != '\0'; end++ ) {
+ if ( !std::isspace(*end) ) {
+ return -1;
+ }
+ }
+ return value;
+ }
+ return -1;
+}
+#endif /* __TBB_WIN8UI_SUPPORT */
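+
+// Illustrative sketch (not part of the library): the parsing rules implemented
+// above, shown on hypothetical variable values.
+//
+//     GetBoolEnvironmentVariable(name) with value " 1 "  -> true
+//     GetBoolEnvironmentVariable(name) with value "1x"   -> false (extra non-space character)
+//     GetBoolEnvironmentVariable(name) with value "0"    -> false
+//     GetIntegralEnvironmentVariable(name) with " 42 "   -> 42
+//     GetIntegralEnvironmentVariable(name) with "-5"     -> -1 (negative values are rejected)
+//     GetIntegralEnvironmentVariable(name) with "abc"    -> -1 (not convertible)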
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
+
+#endif // __TBB_tbb_environment_H
diff --git a/contrib/libs/tbb/src/tbb/exception.cpp b/contrib/libs/tbb/src/tbb/exception.cpp
new file mode 100644
index 0000000000..c3e95d6d97
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/exception.cpp
@@ -0,0 +1,162 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "oneapi/tbb/detail/_exception.h"
+#include "oneapi/tbb/detail/_assert.h"
+#include "oneapi/tbb/detail/_template_helpers.h"
+
+#include <cstring>
+#include <cstdio>
+#include <stdexcept> // std::runtime_error
+#include <new>
+#include <stdexcept>
+
+#define __TBB_STD_RETHROW_EXCEPTION_POSSIBLY_BROKEN \
+ (__GLIBCXX__ && __TBB_GLIBCXX_VERSION>=40700 && __TBB_GLIBCXX_VERSION<60000 && TBB_USE_EXCEPTIONS)
+
+#if __TBB_STD_RETHROW_EXCEPTION_POSSIBLY_BROKEN
+// GCC ABI declarations necessary for a workaround
+#include <cxxabi.h>
+#endif
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+const char* bad_last_alloc::what() const noexcept(true) { return "bad allocation in previous or concurrent attempt"; }
+const char* user_abort::what() const noexcept(true) { return "User-initiated abort has terminated this operation"; }
+const char* missing_wait::what() const noexcept(true) { return "wait() was not called on the structured_task_group"; }
+
+#if TBB_USE_EXCEPTIONS
+ template <typename F>
+ /*[[noreturn]]*/ void do_throw_noexcept(F throw_func) noexcept {
+ throw_func();
+ }
+
+ /*[[noreturn]]*/ void do_throw_noexcept(void (*throw_func)()) noexcept {
+ throw_func();
+ }
+
+ bool terminate_on_exception(); // defined in global_control.cpp and ipc_server.cpp
+
+ template <typename F>
+ /*[[noreturn]]*/ void do_throw(F throw_func) {
+ if (terminate_on_exception()) {
+ do_throw_noexcept(throw_func);
+ }
+ throw_func();
+ }
+
+ #define DO_THROW(exc, init_args) do_throw( []{ throw exc init_args; } );
+#else /* !TBB_USE_EXCEPTIONS */
+ #define PRINT_ERROR_AND_ABORT(exc_name, msg) \
+ std::fprintf (stderr, "Exception %s with message %s would have been thrown, " \
+ "if exception handling had not been disabled. Aborting.\n", exc_name, msg); \
+ std::fflush(stderr); \
+ std::abort();
+ #define DO_THROW(exc, init_args) PRINT_ERROR_AND_ABORT(#exc, #init_args)
+#endif /* !TBB_USE_EXCEPTIONS */
+
+void throw_exception ( exception_id eid ) {
+ switch ( eid ) {
+ case exception_id::bad_alloc: DO_THROW(std::bad_alloc, ()); break;
+ case exception_id::bad_last_alloc: DO_THROW(bad_last_alloc, ()); break;
+ case exception_id::user_abort: DO_THROW( user_abort, () ); break;
+ case exception_id::nonpositive_step: DO_THROW(std::invalid_argument, ("Step must be positive") ); break;
+ case exception_id::out_of_range: DO_THROW(std::out_of_range, ("Index out of requested size range")); break;
+ case exception_id::reservation_length_error: DO_THROW(std::length_error, ("Attempt to exceed implementation defined length limits")); break;
+ case exception_id::missing_wait: DO_THROW(missing_wait, ()); break;
+ case exception_id::invalid_load_factor: DO_THROW(std::out_of_range, ("Invalid hash load factor")); break;
+ case exception_id::invalid_key: DO_THROW(std::out_of_range, ("invalid key")); break;
+ case exception_id::bad_tagged_msg_cast: DO_THROW(std::runtime_error, ("Illegal tagged_msg cast")); break;
+#if __TBB_SUPPORTS_WORKERS_WAITING_IN_TERMINATE
+ case exception_id::unsafe_wait: DO_THROW(unsafe_wait, ("Unsafe to wait further")); break;
+#endif
+ default: __TBB_ASSERT ( false, "Unknown exception ID" );
+ }
+ __TBB_ASSERT(false, "Unreachable code");
+}
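+
+// Illustrative note (not part of the library): a typical call site inside the runtime.
+// For example, sleep_node::wait() in concurrent_monitor.h reports a user-initiated
+// abort this way:
+//
+//     if (this->my_aborted)
+//         throw_exception(exception_id::user_abort);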
+
+/* The "what" string should be fairly short, not more than about 128 characters.
+ Because we control all the call sites of handle_perror, it is pointless
+ to bullet-proof it for very long strings.
+
+ Design note: ADR put this routine off to the side in tbb_misc.cpp instead of
+ Task.cpp because the throw generates a pathetic lot of code, and ADR wanted
+ this large chunk of code to be placed on a cold page. */
+void handle_perror( int error_code, const char* what ) {
+ const int BUF_SIZE = 255;
+ char buf[BUF_SIZE + 1] = { 0 };
+ std::strncat(buf, what, BUF_SIZE);
+ std::size_t buf_len = std::strlen(buf);
+ if (error_code) {
+ std::strncat(buf, ": ", BUF_SIZE - buf_len);
+ buf_len = std::strlen(buf);
+ std::strncat(buf, std::strerror(error_code), BUF_SIZE - buf_len);
+ buf_len = std::strlen(buf);
+ }
+ __TBB_ASSERT(buf_len <= BUF_SIZE && buf[buf_len] == 0, nullptr);
+#if TBB_USE_EXCEPTIONS
+ do_throw([&buf] { throw std::runtime_error(buf); });
+#else
+ PRINT_ERROR_AND_ABORT( "runtime_error", buf);
+#endif /* !TBB_USE_EXCEPTIONS */
+}
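+
+// Illustrative sketch (not part of the library): a typical call site. The function name
+// in the message is a hypothetical example. With a non-zero error code this throws
+// std::runtime_error("pthread_setspecific: <strerror text>") when exceptions are enabled,
+// and prints the same message and aborts otherwise.
+//
+//     int status = pthread_setspecific(key, value);
+//     if (status)
+//         handle_perror(status, "pthread_setspecific");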
+
+#if __TBB_STD_RETHROW_EXCEPTION_POSSIBLY_BROKEN
+// Runtime detection and workaround for the GCC bug 62258.
+// The problem is that std::rethrow_exception() does not increment a counter
+// of active exceptions, causing std::uncaught_exception() to return a wrong value.
+// The code is created after, and roughly reflects, the workaround
+// at https://gcc.gnu.org/bugzilla/attachment.cgi?id=34683
+
+void fix_broken_rethrow() {
+ struct gcc_eh_data {
+ void * caughtExceptions;
+ unsigned int uncaughtExceptions;
+ };
+ gcc_eh_data* eh_data = punned_cast<gcc_eh_data*>( abi::__cxa_get_globals() );
+ ++eh_data->uncaughtExceptions;
+}
+
+bool gcc_rethrow_exception_broken() {
+ bool is_broken;
+ __TBB_ASSERT( !std::uncaught_exception(),
+ "gcc_rethrow_exception_broken() must not be called when an exception is active" );
+ try {
+ // Throw, catch, and rethrow an exception
+ try {
+ throw __TBB_GLIBCXX_VERSION;
+ } catch(...) {
+ std::rethrow_exception( std::current_exception() );
+ }
+ } catch(...) {
+ // Check the bug presence
+ is_broken = std::uncaught_exception();
+ }
+ if( is_broken ) fix_broken_rethrow();
+ __TBB_ASSERT( !std::uncaught_exception(), NULL );
+ return is_broken;
+}
+#else
+void fix_broken_rethrow() {}
+bool gcc_rethrow_exception_broken() { return false; }
+#endif /* __TBB_STD_RETHROW_EXCEPTION_POSSIBLY_BROKEN */
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
+
diff --git a/contrib/libs/tbb/src/tbb/global_control.cpp b/contrib/libs/tbb/src/tbb/global_control.cpp
new file mode 100644
index 0000000000..a9eac2cbc3
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/global_control.cpp
@@ -0,0 +1,275 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "oneapi/tbb/detail/_config.h"
+#include "oneapi/tbb/detail/_template_helpers.h"
+
+#include "oneapi/tbb/global_control.h"
+#include "oneapi/tbb/tbb_allocator.h"
+#include "oneapi/tbb/spin_mutex.h"
+
+#include "governor.h"
+#include "market.h"
+#include "misc.h"
+
+#include <atomic>
+#include <set>
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+//! Comparator for a set of global_control objects
+struct control_storage_comparator {
+ bool operator()(const global_control* lhs, const global_control* rhs) const;
+};
+
+class control_storage {
+ friend struct global_control_impl;
+ friend std::size_t global_control_active_value(int);
+protected:
+ std::size_t my_active_value{0};
+ std::set<global_control*, control_storage_comparator, tbb_allocator<global_control*>> my_list{};
+ spin_mutex my_list_mutex{};
+public:
+ virtual std::size_t default_value() const = 0;
+ virtual void apply_active(std::size_t new_active) {
+ my_active_value = new_active;
+ }
+ virtual bool is_first_arg_preferred(std::size_t a, std::size_t b) const {
+ return a>b; // prefer max by default
+ }
+ virtual std::size_t active_value() {
+ spin_mutex::scoped_lock lock(my_list_mutex); // protect my_list.empty() call
+ return !my_list.empty() ? my_active_value : default_value();
+ }
+};
+
+class alignas(max_nfs_size) allowed_parallelism_control : public control_storage {
+ virtual std::size_t default_value() const override {
+ return max(1U, governor::default_num_threads());
+ }
+ virtual bool is_first_arg_preferred(std::size_t a, std::size_t b) const override {
+ return a<b; // prefer min allowed parallelism
+ }
+ virtual void apply_active(std::size_t new_active) override {
+ control_storage::apply_active(new_active);
+ __TBB_ASSERT( my_active_value>=1, NULL );
+ // -1 to take external thread into account
+ market::set_active_num_workers( my_active_value-1 );
+ }
+ virtual std::size_t active_value() override {
+ spin_mutex::scoped_lock lock(my_list_mutex); // protect my_list.empty() call
+ if (my_list.empty())
+ return default_value();
+ // non-zero, if market is active
+ const std::size_t workers = market::max_num_workers();
+ // We can't exceed market's maximal number of workers.
+ // +1 to take external thread into account
+ return workers? min(workers+1, my_active_value): my_active_value;
+ }
+public:
+ std::size_t active_value_if_present() const {
+ return !my_list.empty() ? my_active_value : 0;
+ }
+};
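+
+// Illustrative sketch (not part of the library): how the "prefer min" policy above
+// behaves when several global_control objects coexist (the values are hypothetical):
+//
+//     tbb::global_control a(tbb::global_control::max_allowed_parallelism, 8);
+//     tbb::global_control b(tbb::global_control::max_allowed_parallelism, 4);
+//     // The active value is 4: the smallest requested limit wins.
+//     // Destroying 'b' restores 8; destroying both restores the default,
+//     // i.e. governor::default_num_threads().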
+
+class alignas(max_nfs_size) stack_size_control : public control_storage {
+ virtual std::size_t default_value() const override {
+ return ThreadStackSize;
+ }
+ virtual void apply_active(std::size_t new_active) override {
+ control_storage::apply_active(new_active);
+#if __TBB_WIN8UI_SUPPORT && (_WIN32_WINNT < 0x0A00)
+ __TBB_ASSERT( false, "For Windows 8 Store* apps we must not set stack size" );
+#endif
+ }
+};
+
+class alignas(max_nfs_size) terminate_on_exception_control : public control_storage {
+ virtual std::size_t default_value() const override {
+ return 0;
+ }
+};
+
+#if __TBB_SUPPORTS_WORKERS_WAITING_IN_TERMINATE
+class alignas(max_nfs_size) lifetime_control : public control_storage {
+ virtual bool is_first_arg_preferred(std::size_t, std::size_t) const override {
+ return false; // not interested
+ }
+ virtual std::size_t default_value() const override {
+ return 0;
+ }
+ virtual void apply_active(std::size_t new_active) override {
+ if (new_active == 1) {
+ // reserve the market reference
+ market::global_market_mutex_type::scoped_lock lock( market::theMarketMutex );
+ if (market::theMarket) {
+ market::add_ref_unsafe(lock, /*is_public*/ true);
+ }
+ } else if (new_active == 0) {
+ // release the market reference
+ market::global_market_mutex_type::scoped_lock lock( market::theMarketMutex );
+ if (market::theMarket != nullptr) {
+ lock.release();
+ market::theMarket->release(/*is_public*/ true, /*blocking_terminate*/ false);
+ }
+ }
+ control_storage::apply_active(new_active);
+ }
+
+public:
+ bool is_empty() {
+ spin_mutex::scoped_lock lock(my_list_mutex);
+ return my_list.empty();
+ }
+};
+#endif // __TBB_SUPPORTS_WORKERS_WAITING_IN_TERMINATE
+
+static allowed_parallelism_control allowed_parallelism_ctl;
+static stack_size_control stack_size_ctl;
+static terminate_on_exception_control terminate_on_exception_ctl;
+#if __TBB_SUPPORTS_WORKERS_WAITING_IN_TERMINATE
+static lifetime_control lifetime_ctl;
+static control_storage *controls[] = {&allowed_parallelism_ctl, &stack_size_ctl, &terminate_on_exception_ctl, &lifetime_ctl};
+#else
+static control_storage *controls[] = {&allowed_parallelism_ctl, &stack_size_ctl, &terminate_on_exception_ctl};
+#endif // __TBB_SUPPORTS_WORKERS_WAITING_IN_TERMINATE
+
+//! Comparator for a set of global_control objects
+inline bool control_storage_comparator::operator()(const global_control* lhs, const global_control* rhs) const {
+ __TBB_ASSERT_RELEASE(lhs->my_param < global_control::parameter_max , NULL);
+ return lhs->my_value < rhs->my_value || (lhs->my_value == rhs->my_value && lhs < rhs);
+}
+
+unsigned market::app_parallelism_limit() {
+ return allowed_parallelism_ctl.active_value_if_present();
+}
+
+bool terminate_on_exception() {
+ return global_control::active_value(global_control::terminate_on_exception) == 1;
+}
+
+#if __TBB_SUPPORTS_WORKERS_WAITING_IN_TERMINATE
+unsigned market::is_lifetime_control_present() {
+ return !lifetime_ctl.is_empty();
+}
+#endif // __TBB_SUPPORTS_WORKERS_WAITING_IN_TERMINATE
+
+struct global_control_impl {
+private:
+ static bool erase_if_present(control_storage* const c, d1::global_control& gc) {
+ auto it = c->my_list.find(&gc);
+ if (it != c->my_list.end()) {
+ c->my_list.erase(it);
+ return true;
+ }
+ return false;
+ }
+
+public:
+
+ static void create(d1::global_control& gc) {
+ __TBB_ASSERT_RELEASE(gc.my_param < global_control::parameter_max, NULL);
+ control_storage* const c = controls[gc.my_param];
+
+ spin_mutex::scoped_lock lock(c->my_list_mutex);
+ if (c->my_list.empty() || c->is_first_arg_preferred(gc.my_value, c->my_active_value)) {
+ // To guarantee that apply_active() is called with the current active value,
+ // call it here and in destroy() under my_list_mutex.
+ c->apply_active(gc.my_value);
+ }
+ c->my_list.insert(&gc);
+ }
+
+ static void destroy(d1::global_control& gc) {
+ __TBB_ASSERT_RELEASE(gc.my_param < global_control::parameter_max, NULL);
+ control_storage* const c = controls[gc.my_param];
+#if __TBB_SUPPORTS_WORKERS_WAITING_IN_TERMINATE
+ __TBB_ASSERT(gc.my_param == global_control::scheduler_handle || !c->my_list.empty(), NULL);
+#else
+ __TBB_ASSERT(!c->my_list.empty(), NULL);
+#endif // __TBB_SUPPORTS_WORKERS_WAITING_IN_TERMINATE
+ // Concurrent reading and changing global parameter is possible.
+ spin_mutex::scoped_lock lock(c->my_list_mutex);
+ std::size_t new_active = (std::size_t)(-1), old_active = c->my_active_value;
+
+ if (!erase_if_present(c, gc)) {
+#if __TBB_SUPPORTS_WORKERS_WAITING_IN_TERMINATE
+ __TBB_ASSERT(gc.my_param == global_control::scheduler_handle , NULL);
+ return;
+#else
+ __TBB_ASSERT(false, "Unreachable code");
+#endif // __TBB_SUPPORTS_WORKERS_WAITING_IN_TERMINATE
+ }
+ if (c->my_list.empty()) {
+ __TBB_ASSERT(new_active == (std::size_t) - 1, NULL);
+ new_active = c->default_value();
+ } else {
+ new_active = (*c->my_list.begin())->my_value;
+ }
+ if (new_active != old_active) {
+ c->apply_active(new_active);
+ }
+ }
+
+ static bool remove_and_check_if_empty(d1::global_control& gc) {
+ __TBB_ASSERT_RELEASE(gc.my_param < global_control::parameter_max, NULL);
+ control_storage* const c = controls[gc.my_param];
+ __TBB_ASSERT(!c->my_list.empty(), NULL);
+
+ spin_mutex::scoped_lock lock(c->my_list_mutex);
+ erase_if_present(c, gc);
+ return c->my_list.empty();
+ }
+#if TBB_USE_ASSERT
+ static bool is_present(d1::global_control& gc) {
+ __TBB_ASSERT_RELEASE(gc.my_param < global_control::parameter_max, NULL);
+ control_storage* const c = controls[gc.my_param];
+
+ spin_mutex::scoped_lock lock(c->my_list_mutex);
+ auto it = c->my_list.find(&gc);
+ if (it != c->my_list.end()) {
+ return true;
+ }
+ return false;
+ }
+#endif // TBB_USE_ASSERT
+};
+
+void __TBB_EXPORTED_FUNC create(d1::global_control& gc) {
+ global_control_impl::create(gc);
+}
+void __TBB_EXPORTED_FUNC destroy(d1::global_control& gc) {
+ global_control_impl::destroy(gc);
+}
+
+bool remove_and_check_if_empty(d1::global_control& gc) {
+ return global_control_impl::remove_and_check_if_empty(gc);
+}
+#if TBB_USE_ASSERT
+bool is_present(d1::global_control& gc) {
+ return global_control_impl::is_present(gc);
+}
+#endif // TBB_USE_ASSERT
+std::size_t __TBB_EXPORTED_FUNC global_control_active_value(int param) {
+ __TBB_ASSERT_RELEASE(param < global_control::parameter_max, NULL);
+ return controls[param]->active_value();
+}
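+
+// Illustrative sketch (not part of the library): how the public RAII object reaches
+// these entry points (assuming, as in the public headers, that the d1::global_control
+// constructor calls create() and its destructor calls destroy()):
+//
+//     {
+//         tbb::global_control limit(tbb::global_control::max_allowed_parallelism, 4);
+//         // global_control_active_value(max_allowed_parallelism) now reports 4
+//     } // destroy() runs here; the previous limit (or the default) is restored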
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
diff --git a/contrib/libs/tbb/src/tbb/governor.cpp b/contrib/libs/tbb/src/tbb/governor.cpp
new file mode 100644
index 0000000000..b75b91a75c
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/governor.cpp
@@ -0,0 +1,526 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "governor.h"
+#include "main.h"
+#include "thread_data.h"
+#include "market.h"
+#include "arena.h"
+#include "dynamic_link.h"
+
+#include "oneapi/tbb/task_group.h"
+#include "oneapi/tbb/global_control.h"
+#include "oneapi/tbb/tbb_allocator.h"
+#include "oneapi/tbb/info.h"
+
+#include "task_dispatcher.h"
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <atomic>
+#include <algorithm>
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+#if __TBB_SUPPORTS_WORKERS_WAITING_IN_TERMINATE
+//! Defined in global_control.cpp
+bool remove_and_check_if_empty(d1::global_control& gc);
+bool is_present(d1::global_control& gc);
+#endif // __TBB_SUPPORTS_WORKERS_WAITING_IN_TERMINATE
+
+namespace rml {
+tbb_server* make_private_server( tbb_client& client );
+} // namespace rml
+
+//------------------------------------------------------------------------
+// governor
+//------------------------------------------------------------------------
+
+void governor::acquire_resources () {
+#if __TBB_USE_POSIX
+ int status = theTLS.create(auto_terminate);
+#else
+ int status = theTLS.create();
+#endif
+ if( status )
+ handle_perror(status, "TBB failed to initialize task scheduler TLS\n");
+ detect_cpu_features(cpu_features);
+ is_rethrow_broken = gcc_rethrow_exception_broken();
+}
+
+void governor::release_resources () {
+ theRMLServerFactory.close();
+ destroy_process_mask();
+
+ __TBB_ASSERT(!(__TBB_InitOnce::initialization_done() && theTLS.get()), "TBB is unloaded while thread data still alive?");
+
+ int status = theTLS.destroy();
+ if( status )
+ runtime_warning("failed to destroy task scheduler TLS: %s", std::strerror(status));
+ dynamic_unlink_all();
+}
+
+rml::tbb_server* governor::create_rml_server ( rml::tbb_client& client ) {
+ rml::tbb_server* server = NULL;
+ if( !UsePrivateRML ) {
+ ::rml::factory::status_type status = theRMLServerFactory.make_server( server, client );
+ if( status != ::rml::factory::st_success ) {
+ UsePrivateRML = true;
+ runtime_warning( "rml::tbb_factory::make_server failed with status %x, falling back on private rml", status );
+ }
+ }
+ if ( !server ) {
+ __TBB_ASSERT( UsePrivateRML, NULL );
+ server = rml::make_private_server( client );
+ }
+ __TBB_ASSERT( server, "Failed to create RML server" );
+ return server;
+}
+
+void governor::one_time_init() {
+ if ( !__TBB_InitOnce::initialization_done() ) {
+ DoOneTimeInitialization();
+ }
+}
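+
+/* Editorial note (not part of the original sources): one_time_init() together with
+   DoOneTimeInitialization() (main.cpp) forms a double-checked initialization: a cheap
+   acquire-load fast path, and a re-check under the global lock on the slow path.
+   A generic sketch of the pattern, under those assumptions:
+
+       std::atomic<bool> done{false};
+       std::mutex m;
+
+       void ensure_initialized() {
+           if (!done.load(std::memory_order_acquire)) {      // fast path
+               std::lock_guard<std::mutex> lock(m);
+               if (!done.load(std::memory_order_relaxed)) {  // re-check under the lock
+                   // ... perform the one-time setup ...
+                   done.store(true, std::memory_order_release);
+               }
+           }
+       }
+*/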
+
+/*
+ There is no portable way to get the stack base address in POSIX; however, modern
+ Linux versions provide the pthread_getattr_np API that can be used to obtain a thread's
+ stack size and base address. Unfortunately, even this function does not provide
+ enough information for the main thread on the IA-64 architecture (the RSE spill area
+ and the memory stack are allocated as two separate discontinuous chunks of memory),
+ and there is no portable way to discern the main and the secondary threads.
+ Thus for macOS* and for the IA-64 architecture on Linux* OS we use the TBB worker stack size for
+ all threads and use the current stack top as the stack base. This simplified
+ approach is based on the following assumptions:
+ 1) If the default stack size is insufficient for the user app's needs, the
+ required amount will be explicitly specified by the user at the point of
+ TBB scheduler initialization (as an argument to the tbb::task_scheduler_init
+ constructor).
+ 2) When an external thread initializes the scheduler, it has enough space on its
+ stack. Here "enough" means "at least as much as worker threads have".
+ 3) If the user app strives to conserve memory by cutting the stack size, it
+ should do this for TBB workers too (as in #1).
+*/
+static std::uintptr_t get_stack_base(std::size_t stack_size) {
+ // Stacks grow top-down. The highest address is called the "stack base",
+ // and the lowest is the "stack limit".
+#if USE_WINTHREAD
+ suppress_unused_warning(stack_size);
+ NT_TIB* pteb = (NT_TIB*)NtCurrentTeb();
+ __TBB_ASSERT(&pteb < pteb->StackBase && &pteb > pteb->StackLimit, "invalid stack info in TEB");
+ return reinterpret_cast<std::uintptr_t>(pteb->StackBase);
+#else /* USE_PTHREAD */
+ // There is no portable way to get the stack base address in POSIX, so we use a
+ // non-portable method (available on all modern Linux systems) or a simplified approach
+ // based on common-sense assumptions. The most important assumption
+ // is that the main thread's stack size is not less than that of other threads.
+
+ // Points to the lowest addressable byte of a stack.
+ void* stack_limit = nullptr;
+#if __linux__ && !__bg__
+ size_t np_stack_size = 0;
+ pthread_attr_t np_attr_stack;
+ if (0 == pthread_getattr_np(pthread_self(), &np_attr_stack)) {
+ if (0 == pthread_attr_getstack(&np_attr_stack, &stack_limit, &np_stack_size)) {
+ __TBB_ASSERT( &stack_limit > stack_limit, "stack size must be positive" );
+ }
+ pthread_attr_destroy(&np_attr_stack);
+ }
+#endif /* __linux__ */
+ std::uintptr_t stack_base{};
+ if (stack_limit) {
+ stack_base = reinterpret_cast<std::uintptr_t>(stack_limit) + stack_size;
+ } else {
+ // Use an anchor as a base stack address.
+ int anchor{};
+ stack_base = reinterpret_cast<std::uintptr_t>(&anchor);
+ }
+ return stack_base;
+#endif /* USE_PTHREAD */
+}
+
+void governor::init_external_thread() {
+ one_time_init();
+ // Create new scheduler instance with arena
+ int num_slots = default_num_threads();
+ // TODO_REVAMP: support an external thread without an implicit arena
+ int num_reserved_slots = 1;
+ unsigned arena_priority_level = 1; // corresponds to tbb::task_arena::priority::normal
+ std::size_t stack_size = 0;
+ arena& a = *market::create_arena(num_slots, num_reserved_slots, arena_priority_level, stack_size);
+ // We need an internal reference to the market. TODO: is it legacy?
+ market::global_market(false);
+ // External thread always occupies the first slot
+ thread_data& td = *new(cache_aligned_allocate(sizeof(thread_data))) thread_data(0, false);
+ td.attach_arena(a, /*slot index*/ 0);
+
+ stack_size = a.my_market->worker_stack_size();
+ std::uintptr_t stack_base = get_stack_base(stack_size);
+ task_dispatcher& task_disp = td.my_arena_slot->default_task_dispatcher();
+ task_disp.set_stealing_threshold(calculate_stealing_threshold(stack_base, stack_size));
+ td.attach_task_dispatcher(task_disp);
+
+ td.my_arena_slot->occupy();
+ a.my_market->add_external_thread(td);
+ set_thread_data(td);
+}
+
+void governor::auto_terminate(void* tls) {
+ __TBB_ASSERT(get_thread_data_if_initialized() == nullptr ||
+ get_thread_data_if_initialized() == tls, NULL);
+ if (tls) {
+ thread_data* td = static_cast<thread_data*>(tls);
+
+ // Only external thread can be inside an arena during termination.
+ if (td->my_arena_slot) {
+ arena* a = td->my_arena;
+ market* m = a->my_market;
+
+ a->my_observers.notify_exit_observers(td->my_last_observer, td->my_is_worker);
+
+ td->my_task_dispatcher->m_stealing_threshold = 0;
+ td->detach_task_dispatcher();
+ td->my_arena_slot->release();
+ // Release an arena
+ a->on_thread_leaving<arena::ref_external>();
+
+ m->remove_external_thread(*td);
+ // If there was an associated arena, it added a public market reference
+ m->release( /*is_public*/ true, /*blocking_terminate*/ false);
+ }
+
+ td->~thread_data();
+ cache_aligned_deallocate(td);
+
+ clear_thread_data();
+ }
+ __TBB_ASSERT(get_thread_data_if_initialized() == nullptr, NULL);
+}
+
+void governor::initialize_rml_factory () {
+ ::rml::factory::status_type res = theRMLServerFactory.open();
+ UsePrivateRML = res != ::rml::factory::st_success;
+}
+
+#if __TBB_SUPPORTS_WORKERS_WAITING_IN_TERMINATE
+void __TBB_EXPORTED_FUNC get(d1::task_scheduler_handle& handle) {
+ handle.m_ctl = new(allocate_memory(sizeof(global_control))) global_control(global_control::scheduler_handle, 1);
+}
+
+void release_impl(d1::task_scheduler_handle& handle) {
+ if (handle.m_ctl != nullptr) {
+ handle.m_ctl->~global_control();
+ deallocate_memory(handle.m_ctl);
+ handle.m_ctl = nullptr;
+ }
+}
+
+bool finalize_impl(d1::task_scheduler_handle& handle) {
+ market::global_market_mutex_type::scoped_lock lock( market::theMarketMutex );
+ bool ok = true; // ok if theMarket does not exist yet
+ market* m = market::theMarket; // read the state of theMarket
+ if (m != nullptr) {
+ lock.release();
+ __TBB_ASSERT(is_present(*handle.m_ctl), "finalize or release was already called on this object");
+ thread_data* td = governor::get_thread_data_if_initialized();
+ if (td) {
+ task_dispatcher* task_disp = td->my_task_dispatcher;
+ __TBB_ASSERT(task_disp, nullptr);
+ if (task_disp->m_properties.outermost && !td->my_is_worker) { // is not inside a parallel region
+ governor::auto_terminate(td);
+ }
+ }
+ if (remove_and_check_if_empty(*handle.m_ctl)) {
+ ok = m->release(/*is_public*/ true, /*blocking_terminate*/ true);
+ } else {
+ ok = false;
+ }
+ }
+ return ok;
+}
+
+bool __TBB_EXPORTED_FUNC finalize(d1::task_scheduler_handle& handle, std::intptr_t mode) {
+ if (mode == d1::release_nothrowing) {
+ release_impl(handle);
+ return true;
+ } else {
+ bool ok = finalize_impl(handle);
+ // TODO: this is unsafe when finalize is called concurrently and the library is unloaded afterwards
+ release_impl(handle);
+ if (mode == d1::finalize_throwing && !ok) {
+ throw_exception(exception_id::unsafe_wait);
+ }
+ return ok;
+ }
+}
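+
+/* Editorial note (not part of the original sources): a hedged sketch of the public
+   usage that drives get()/finalize() above; the exact public spelling (a static get()
+   vs. an attach-style constructor) depends on the oneTBB version in use.
+
+       tbb::task_scheduler_handle handle = tbb::task_scheduler_handle::get();
+       // ... run parallel algorithms ...
+       // Blocking finalization: waits for worker threads to complete; the
+       // finalize_throwing path above throws if the wait cannot be done safely.
+       tbb::finalize(handle);
+*/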
+#endif // __TBB_SUPPORTS_WORKERS_WAITING_IN_TERMINATE
+
+#if __TBB_ARENA_BINDING
+
+#if __TBB_WEAK_SYMBOLS_PRESENT
+#pragma weak __TBB_internal_initialize_system_topology
+#pragma weak __TBB_internal_allocate_binding_handler
+#pragma weak __TBB_internal_deallocate_binding_handler
+#pragma weak __TBB_internal_apply_affinity
+#pragma weak __TBB_internal_restore_affinity
+#pragma weak __TBB_internal_get_default_concurrency
+
+extern "C" {
+void __TBB_internal_initialize_system_topology(
+ size_t groups_num,
+ int& numa_nodes_count, int*& numa_indexes_list,
+ int& core_types_count, int*& core_types_indexes_list
+);
+
+//TODO: consider renaming to `create_binding_handler` and `destroy_binding_handler`
+binding_handler* __TBB_internal_allocate_binding_handler( int slot_num, int numa_id, int core_type_id, int max_threads_per_core );
+void __TBB_internal_deallocate_binding_handler( binding_handler* handler_ptr );
+
+void __TBB_internal_apply_affinity( binding_handler* handler_ptr, int slot_num );
+void __TBB_internal_restore_affinity( binding_handler* handler_ptr, int slot_num );
+
+int __TBB_internal_get_default_concurrency( int numa_id, int core_type_id, int max_threads_per_core );
+}
+#endif /* __TBB_WEAK_SYMBOLS_PRESENT */
+
+// Stubs that will be used if TBBbind library is unavailable.
+static binding_handler* dummy_allocate_binding_handler ( int, int, int, int ) { return nullptr; }
+static void dummy_deallocate_binding_handler ( binding_handler* ) { }
+static void dummy_apply_affinity ( binding_handler*, int ) { }
+static void dummy_restore_affinity ( binding_handler*, int ) { }
+static int dummy_get_default_concurrency( int, int, int ) { return governor::default_num_threads(); }
+
+// Handlers for communication with TBBbind
+static void (*initialize_system_topology_ptr)(
+ size_t groups_num,
+ int& numa_nodes_count, int*& numa_indexes_list,
+ int& core_types_count, int*& core_types_indexes_list
+) = nullptr;
+
+static binding_handler* (*allocate_binding_handler_ptr)( int slot_num, int numa_id, int core_type_id, int max_threads_per_core )
+ = dummy_allocate_binding_handler;
+static void (*deallocate_binding_handler_ptr)( binding_handler* handler_ptr )
+ = dummy_deallocate_binding_handler;
+static void (*apply_affinity_ptr)( binding_handler* handler_ptr, int slot_num )
+ = dummy_apply_affinity;
+static void (*restore_affinity_ptr)( binding_handler* handler_ptr, int slot_num )
+ = dummy_restore_affinity;
+int (*get_default_concurrency_ptr)( int numa_id, int core_type_id, int max_threads_per_core )
+ = dummy_get_default_concurrency;
+
+#if _WIN32 || _WIN64 || __linux__
+// Table describing how to link the handlers.
+static const dynamic_link_descriptor TbbBindLinkTable[] = {
+ DLD(__TBB_internal_initialize_system_topology, initialize_system_topology_ptr),
+ DLD(__TBB_internal_allocate_binding_handler, allocate_binding_handler_ptr),
+ DLD(__TBB_internal_deallocate_binding_handler, deallocate_binding_handler_ptr),
+ DLD(__TBB_internal_apply_affinity, apply_affinity_ptr),
+ DLD(__TBB_internal_restore_affinity, restore_affinity_ptr),
+ DLD(__TBB_internal_get_default_concurrency, get_default_concurrency_ptr)
+};
+
+static const unsigned LinkTableSize = sizeof(TbbBindLinkTable) / sizeof(dynamic_link_descriptor);
+
+#if TBB_USE_DEBUG
+#define DEBUG_SUFFIX "_debug"
+#else
+#define DEBUG_SUFFIX
+#endif /* TBB_USE_DEBUG */
+
+#if _WIN32 || _WIN64
+#define LIBRARY_EXTENSION ".dll"
+#define LIBRARY_PREFIX
+#elif __linux__
+#define LIBRARY_EXTENSION __TBB_STRING(.so.3)
+#define LIBRARY_PREFIX "lib"
+#endif /* __linux__ */
+
+#define TBBBIND_NAME LIBRARY_PREFIX "tbbbind" DEBUG_SUFFIX LIBRARY_EXTENSION
+#define TBBBIND_2_0_NAME LIBRARY_PREFIX "tbbbind_2_0" DEBUG_SUFFIX LIBRARY_EXTENSION
+#define TBBBIND_2_4_NAME LIBRARY_PREFIX "tbbbind_2_4" DEBUG_SUFFIX LIBRARY_EXTENSION
+#endif /* _WIN32 || _WIN64 || __linux__ */
+
+// Representation of system hardware topology information on the TBB side.
+// System topology may be initialized by third-party component (e.g. hwloc)
+// or just filled in with default stubs.
+namespace system_topology {
+
+constexpr int automatic = -1;
+
+static std::atomic<do_once_state> initialization_state;
+
+namespace {
+int numa_nodes_count = 0;
+int* numa_nodes_indexes = nullptr;
+
+int core_types_count = 0;
+int* core_types_indexes = nullptr;
+
+const char* load_tbbbind_shared_object() {
+#if _WIN32 || _WIN64 || __linux__
+#if _WIN32 && !_WIN64
+ // For 32-bit Windows applications, process affinity masks can only support up to 32 logical CPUs.
+ SYSTEM_INFO si;
+ GetNativeSystemInfo(&si);
+ if (si.dwNumberOfProcessors > 32) return nullptr;
+#endif /* _WIN32 && !_WIN64 */
+ for (const auto& tbbbind_version : {TBBBIND_2_4_NAME, TBBBIND_2_0_NAME, TBBBIND_NAME}) {
+ if (dynamic_link(tbbbind_version, TbbBindLinkTable, LinkTableSize)) {
+ return tbbbind_version;
+ }
+ }
+#endif /* _WIN32 || _WIN64 || __linux__ */
+ return nullptr;
+}
+
+int processor_groups_num() {
+#if _WIN32
+ return NumberOfProcessorGroups();
+#else
+ // Stub to improve code readability by reducing the number of compile-time conditions
+ return 1;
+#endif
+}
+} // internal namespace
+
+// Tries to load the TBBbind library API; on success, obtains the NUMA topology information from it,
+// otherwise fills the NUMA topology with stubs.
+void initialization_impl() {
+ governor::one_time_init();
+
+ if (const char* tbbbind_name = load_tbbbind_shared_object()) {
+ initialize_system_topology_ptr(
+ processor_groups_num(),
+ numa_nodes_count, numa_nodes_indexes,
+ core_types_count, core_types_indexes
+ );
+
+ PrintExtraVersionInfo("TBBBIND", tbbbind_name);
+ return;
+ }
+
+ static int dummy_index = automatic;
+
+ numa_nodes_count = 1;
+ numa_nodes_indexes = &dummy_index;
+
+ core_types_count = 1;
+ core_types_indexes = &dummy_index;
+
+ PrintExtraVersionInfo("TBBBIND", "UNAVAILABLE");
+}
+
+void initialize() {
+ atomic_do_once(initialization_impl, initialization_state);
+}
+} // namespace system_topology
+
+binding_handler* construct_binding_handler(int slot_num, int numa_id, int core_type_id, int max_threads_per_core) {
+ system_topology::initialize();
+ return allocate_binding_handler_ptr(slot_num, numa_id, core_type_id, max_threads_per_core);
+}
+
+void destroy_binding_handler(binding_handler* handler_ptr) {
+ __TBB_ASSERT(deallocate_binding_handler_ptr, "tbbbind loading was not performed");
+ deallocate_binding_handler_ptr(handler_ptr);
+}
+
+void apply_affinity_mask(binding_handler* handler_ptr, int slot_index) {
+ __TBB_ASSERT(slot_index >= 0, "Negative thread index");
+ __TBB_ASSERT(apply_affinity_ptr, "tbbbind loading was not performed");
+ apply_affinity_ptr(handler_ptr, slot_index);
+}
+
+void restore_affinity_mask(binding_handler* handler_ptr, int slot_index) {
+ __TBB_ASSERT(slot_index >= 0, "Negative thread index");
+ __TBB_ASSERT(restore_affinity_ptr, "tbbbind loading was not performed");
+ restore_affinity_ptr(handler_ptr, slot_index);
+}
+
+unsigned __TBB_EXPORTED_FUNC numa_node_count() {
+ system_topology::initialize();
+ return system_topology::numa_nodes_count;
+}
+
+void __TBB_EXPORTED_FUNC fill_numa_indices(int* index_array) {
+ system_topology::initialize();
+ std::memcpy(index_array, system_topology::numa_nodes_indexes, system_topology::numa_nodes_count * sizeof(int));
+}
+
+int __TBB_EXPORTED_FUNC numa_default_concurrency(int node_id) {
+ if (node_id >= 0) {
+ system_topology::initialize();
+ int result = get_default_concurrency_ptr(
+ node_id,
+ /*core_type*/system_topology::automatic,
+ /*threads_per_core*/system_topology::automatic
+ );
+ if (result > 0) return result;
+ }
+ return governor::default_num_threads();
+}
+
+unsigned __TBB_EXPORTED_FUNC core_type_count(intptr_t /*reserved*/) {
+ system_topology::initialize();
+ return system_topology::core_types_count;
+}
+
+void __TBB_EXPORTED_FUNC fill_core_type_indices(int* index_array, intptr_t /*reserved*/) {
+ system_topology::initialize();
+ std::memcpy(index_array, system_topology::core_types_indexes, system_topology::core_types_count * sizeof(int));
+}
+
+void constraints_assertion(d1::constraints c) {
+ bool is_topology_initialized = system_topology::initialization_state == do_once_state::initialized;
+ __TBB_ASSERT_RELEASE(c.max_threads_per_core == system_topology::automatic || c.max_threads_per_core > 0,
+ "Wrong max_threads_per_core constraints field value.");
+
+ auto numa_nodes_begin = system_topology::numa_nodes_indexes;
+ auto numa_nodes_end = system_topology::numa_nodes_indexes + system_topology::numa_nodes_count;
+ __TBB_ASSERT_RELEASE(
+ c.numa_id == system_topology::automatic ||
+ (is_topology_initialized && std::find(numa_nodes_begin, numa_nodes_end, c.numa_id) != numa_nodes_end),
+ "The constraints::numa_id value is not known to the library. Use tbb::info::numa_nodes() to get the list of possible values.");
+
+ int* core_types_begin = system_topology::core_types_indexes;
+ int* core_types_end = system_topology::core_types_indexes + system_topology::core_types_count;
+ __TBB_ASSERT_RELEASE(c.core_type == system_topology::automatic ||
+ (is_topology_initialized && std::find(core_types_begin, core_types_end, c.core_type) != core_types_end),
+ "The constraints::core_type value is not known to the library. Use tbb::info::core_types() to get the list of possible values.");
+}
+
+int __TBB_EXPORTED_FUNC constraints_default_concurrency(const d1::constraints& c, intptr_t /*reserved*/) {
+ constraints_assertion(c);
+
+ if (c.numa_id >= 0 || c.core_type >= 0 || c.max_threads_per_core > 0) {
+ system_topology::initialize();
+ return get_default_concurrency_ptr(c.numa_id, c.core_type, c.max_threads_per_core);
+ }
+ return governor::default_num_threads();
+}
+
+int __TBB_EXPORTED_FUNC constraints_threads_per_core(const d1::constraints&, intptr_t /*reserved*/) {
+ return system_topology::automatic;
+}
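+
+/* Editorial note (not part of the original sources): a hedged sketch of the public
+   tbb::info / task_arena constraints usage that ultimately calls the exported functions
+   above; names follow the oneTBB public headers and are illustrative.
+
+       #include <oneapi/tbb/info.h>
+       #include <oneapi/tbb/task_arena.h>
+       #include <vector>
+
+       void pin_to_first_numa_node() {
+           // numa_node_count() / fill_numa_indices() back this query.
+           std::vector<tbb::numa_node_id> nodes = tbb::info::numa_nodes();
+           // constraints_default_concurrency() supplies the arena concurrency.
+           tbb::task_arena arena(tbb::task_arena::constraints{}.set_numa_id(nodes.front()));
+           arena.execute([]{ });  // work bound to that NUMA node
+       }
+*/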
+#endif /* __TBB_ARENA_BINDING */
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
diff --git a/contrib/libs/tbb/src/tbb/governor.h b/contrib/libs/tbb/src/tbb/governor.h
new file mode 100644
index 0000000000..0ff4781414
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/governor.h
@@ -0,0 +1,158 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef _TBB_governor_H
+#define _TBB_governor_H
+
+#include "rml_tbb.h"
+
+#include "misc.h" // for AvailableHwConcurrency
+#include "tls.h"
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+class market;
+class thread_data;
+class __TBB_InitOnce;
+
+#if __TBB_USE_ITT_NOTIFY
+//! Defined in profiling.cpp
+extern bool ITT_Present;
+#endif
+
+typedef std::size_t stack_size_type;
+
+//------------------------------------------------------------------------
+// Class governor
+//------------------------------------------------------------------------
+
+//! The class handles access to the single instance of market, and to TLS to keep scheduler instances.
+/** It also supports automatic on-demand initialization of the TBB scheduler.
+ The class contains only static data members and methods.*/
+class governor {
+private:
+ friend class __TBB_InitOnce;
+ friend class market;
+
+ // TODO: consider using thread_local (measure performance and side effects)
+ //! TLS for scheduler instances associated with individual threads
+ static basic_tls<thread_data*> theTLS;
+
+ //! Caches the maximal level of parallelism supported by the hardware
+ static unsigned DefaultNumberOfThreads;
+
+ //! Caches the size of OS regular memory page
+ static std::size_t DefaultPageSize;
+
+ // TODO (TBB_REVAMP_TODO): reconsider constant names
+ static rml::tbb_factory theRMLServerFactory;
+
+ static bool UsePrivateRML;
+
+ // Flags for runtime-specific conditions
+ static cpu_features_type cpu_features;
+ static bool is_rethrow_broken;
+
+ //! Create key for thread-local storage and initialize RML.
+ static void acquire_resources ();
+
+ //! Destroy the thread-local storage key and deinitialize RML.
+ static void release_resources ();
+
+ static rml::tbb_server* create_rml_server ( rml::tbb_client& );
+
+public:
+ static unsigned default_num_threads () {
+ // No memory fence required, because at worst each invoking thread calls AvailableHwConcurrency once.
+ return DefaultNumberOfThreads ? DefaultNumberOfThreads :
+ DefaultNumberOfThreads = AvailableHwConcurrency();
+ }
+ static std::size_t default_page_size () {
+ return DefaultPageSize ? DefaultPageSize :
+ DefaultPageSize = DefaultSystemPageSize();
+ }
+ static void one_time_init();
+ //! Processes a scheduler initialization request (possibly nested) in an external thread
+ /** If necessary, creates a new arena instance and the thread data for the calling thread. **/
+ static void init_external_thread();
+
+ //! The routine to undo automatic initialization.
+ /** The signature is written with void* so that the routine
+ can be the destructor argument to pthread_key_create. */
+ static void auto_terminate(void* tls);
+
+ //! Obtain the thread-local instance of the thread data.
+ /** If the scheduler has not been initialized yet, initialization is done automatically.
+ Note that auto-initialized scheduler instance is destroyed only when its thread terminates. **/
+ static thread_data* get_thread_data() {
+ thread_data* td = theTLS.get();
+ if (td) {
+ return td;
+ }
+ init_external_thread();
+ td = theTLS.get();
+ __TBB_ASSERT(td, NULL);
+ return td;
+ }
+
+ static void set_thread_data(thread_data& td) {
+ theTLS.set(&td);
+ }
+
+ static void clear_thread_data() {
+ theTLS.set(nullptr);
+ }
+
+ static thread_data* get_thread_data_if_initialized () {
+ return theTLS.get();
+ }
+
+ static bool is_thread_data_set(thread_data* td) {
+ return theTLS.get() == td;
+ }
+
+ //! Undo automatic initialization if necessary; call when a thread exits.
+ static void terminate_external_thread() {
+ auto_terminate(get_thread_data_if_initialized());
+ }
+
+ static void initialize_rml_factory ();
+
+ static bool does_client_join_workers (const rml::tbb_client &client);
+
+ static bool speculation_enabled() { return cpu_features.rtm_enabled; }
+
+ static bool wait_package_enabled() { return cpu_features.waitpkg_enabled; }
+
+ static bool rethrow_exception_broken() { return is_rethrow_broken; }
+
+ static bool is_itt_present() {
+#if __TBB_USE_ITT_NOTIFY
+ return ITT_Present;
+#else
+ return false;
+#endif
+ }
+}; // class governor
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
+
+#endif /* _TBB_governor_H */
diff --git a/contrib/libs/tbb/src/tbb/intrusive_list.h b/contrib/libs/tbb/src/tbb/intrusive_list.h
new file mode 100644
index 0000000000..699bc149aa
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/intrusive_list.h
@@ -0,0 +1,242 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef _TBB_intrusive_list_H
+#define _TBB_intrusive_list_H
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+//! Data structure to be inherited by the types that can form intrusive lists.
+/** An intrusive list is formed by means of the intrusive_list<T> or memptr_intrusive_list<T, U, NodePtr>
+ template classes defined below. Note that type T must either derive from intrusive_list_node
+ publicly or declare the corresponding list instantiation as a friend.
+ The list classes implement a limited subset of the std::list interface. **/
+struct intrusive_list_node {
+ intrusive_list_node* my_prev_node{};
+ intrusive_list_node* my_next_node{};
+#if TBB_USE_ASSERT
+ intrusive_list_node() { my_prev_node = my_next_node = this; }
+#endif /* TBB_USE_ASSERT */
+};
+
+//! List of elements of type T, where T embeds or derives from intrusive_list_node
+/** The class is not thread safe. **/
+template <class List, class T>
+class intrusive_list_base {
+ //! Sentinel head node of the list
+ intrusive_list_node my_head;
+
+ //! Number of list elements
+ std::size_t my_size;
+
+ static intrusive_list_node& node ( T& item ) { return List::node(item); }
+
+ static T& item ( intrusive_list_node* node ) { return List::item(node); }
+
+ static const T& item( const intrusive_list_node* node ) { return List::item(node); }
+
+ template <typename DereferenceType>
+ class iterator_impl {
+ static_assert(std::is_same<DereferenceType, T>::value ||
+ std::is_same<DereferenceType, const T>::value,
+ "Incorrect DereferenceType in iterator_impl");
+
+ using pointer_type = typename std::conditional<std::is_same<DereferenceType, T>::value,
+ intrusive_list_node*,
+ const intrusive_list_node*>::type;
+
+ public:
+ iterator_impl() : my_pos(nullptr) {}
+
+ iterator_impl( pointer_type pos ) : my_pos(pos) {}
+
+ iterator_impl& operator++() {
+ my_pos = my_pos->my_next_node;
+ return *this;
+ }
+
+ iterator_impl operator++( int ) {
+ iterator_impl it(*this);
+ ++*this;
+ return it;
+ }
+
+ iterator_impl& operator--() {
+ my_pos = my_pos->my_prev_node;
+ return *this;
+ }
+
+ iterator_impl operator--( int ) {
+ iterator_impl it(*this);
+ --*this;
+ return it;
+ }
+
+ bool operator==( const iterator_impl& rhs ) const {
+ return my_pos == rhs.my_pos;
+ }
+
+ bool operator!=( const iterator_impl& rhs ) const {
+ return my_pos != rhs.my_pos;
+ }
+
+ DereferenceType& operator*() const {
+ return intrusive_list_base::item(my_pos);
+ }
+
+ DereferenceType* operator->() const {
+ return &intrusive_list_base::item(my_pos);
+ }
+ private:
+ // Node the iterator points to at the moment
+ pointer_type my_pos;
+ }; // class iterator_impl
+
+ void assert_ok () const {
+ __TBB_ASSERT( (my_head.my_prev_node == &my_head && !my_size) ||
+ (my_head.my_next_node != &my_head && my_size >0), "intrusive_list_base corrupted" );
+#if TBB_USE_ASSERT >= 2
+ std::size_t i = 0;
+ for ( intrusive_list_node *n = my_head.my_next_node; n != &my_head; n = n->my_next_node )
+ ++i;
+ __TBB_ASSERT( my_size == i, "Wrong size" );
+#endif /* TBB_USE_ASSERT >= 2 */
+ }
+
+public:
+ using iterator = iterator_impl<T>;
+ using const_iterator = iterator_impl<const T>;
+
+ intrusive_list_base () : my_size(0) {
+ my_head.my_prev_node = &my_head;
+ my_head.my_next_node = &my_head;
+ }
+
+ bool empty () const { return my_head.my_next_node == &my_head; }
+
+ std::size_t size () const { return my_size; }
+
+ iterator begin () { return iterator(my_head.my_next_node); }
+
+ iterator end () { return iterator(&my_head); }
+
+ const_iterator begin () const { return const_iterator(my_head.my_next_node); }
+
+ const_iterator end () const { return const_iterator(&my_head); }
+
+ void push_front ( T& val ) {
+ __TBB_ASSERT( node(val).my_prev_node == &node(val) && node(val).my_next_node == &node(val),
+ "Object with intrusive list node can be part of only one intrusive list simultaneously" );
+ // An object can be part of only one intrusive list at the given moment via the given node member
+ node(val).my_prev_node = &my_head;
+ node(val).my_next_node = my_head.my_next_node;
+ my_head.my_next_node->my_prev_node = &node(val);
+ my_head.my_next_node = &node(val);
+ ++my_size;
+ assert_ok();
+ }
+
+ void remove( T& val ) {
+ __TBB_ASSERT( node(val).my_prev_node != &node(val) && node(val).my_next_node != &node(val), "Element to remove is not in the list" );
+ __TBB_ASSERT( node(val).my_prev_node->my_next_node == &node(val) && node(val).my_next_node->my_prev_node == &node(val), "Element to remove is not in the list" );
+ --my_size;
+ node(val).my_next_node->my_prev_node = node(val).my_prev_node;
+ node(val).my_prev_node->my_next_node = node(val).my_next_node;
+#if TBB_USE_ASSERT
+ node(val).my_prev_node = node(val).my_next_node = &node(val);
+#endif
+ assert_ok();
+ }
+
+ iterator erase ( iterator it ) {
+ T& val = *it;
+ ++it;
+ remove( val );
+ return it;
+ }
+
+}; // intrusive_list_base
+
+#if __TBB_TODO
+// With standard compliant compilers memptr_intrusive_list could be named simply intrusive_list,
+// and inheritance based intrusive_list version would become its partial specialization.
+// Here are the corresponding declarations:
+
+struct dummy_intrusive_list_item { intrusive_list_node my_node; };
+
+template <class T, class U = dummy_intrusive_list_item, intrusive_list_node U::*NodePtr = &dummy_intrusive_list_item::my_node>
+class intrusive_list : public intrusive_list_base<intrusive_list<T, U, NodePtr>, T>;
+
+template <class T>
+class intrusive_list<T, dummy_intrusive_list_item, &dummy_intrusive_list_item::my_node>
+ : public intrusive_list_base<intrusive_list<T>, T>;
+
+#endif /* __TBB_TODO */
+
+//! Double linked list of items of type T containing a member of type intrusive_list_node.
+/** NodePtr is a member pointer to the node data field. Class U is either T or
+ a base class of T containing the node member. Default values exist for the sake
+ of a partial specialization that handles the inheritance case.
+
+ The list does not have ownership of its items. Its purpose is to avoid dynamic
+ memory allocation when forming lists of existing objects.
+
+ The class is not thread safe. **/
+template <class T, class U, intrusive_list_node U::*NodePtr>
+class memptr_intrusive_list : public intrusive_list_base<memptr_intrusive_list<T, U, NodePtr>, T>
+{
+ friend class intrusive_list_base<memptr_intrusive_list<T, U, NodePtr>, T>;
+
+ static intrusive_list_node& node ( T& val ) { return val.*NodePtr; }
+
+ static T& item ( intrusive_list_node* node ) {
+ // Cannot use __TBB_offsetof (and consequently __TBB_get_object_ref) macro
+ // with *NodePtr argument because gcc refuses to interpret pasted "->" and "*"
+ // as member pointer dereferencing operator, and explicit usage of ## in
+ // __TBB_offsetof implementation breaks operations with normal member names.
+ return *reinterpret_cast<T*>((char*)node - ((ptrdiff_t)&(reinterpret_cast<T*>(0x1000)->*NodePtr) - 0x1000));
+ }
+
+ static const T& item( const intrusive_list_node* node ) {
+ return item(const_cast<intrusive_list_node*>(node));
+ }
+
+}; // intrusive_list<T, U, NodePtr>
+
+//! Double linked list of items of type T that is derived from intrusive_list_node class.
+/** The list does not have ownership of its items. Its purpose is to avoid dynamic
+ memory allocation when forming lists of existing objects.
+
+ The class is not thread safe. **/
+template <class T>
+class intrusive_list : public intrusive_list_base<intrusive_list<T>, T>
+{
+ friend class intrusive_list_base<intrusive_list<T>, T>;
+
+ static intrusive_list_node& node ( T& val ) { return val; }
+
+ static T& item ( intrusive_list_node* node ) { return *static_cast<T*>(node); }
+
+ static const T& item( const intrusive_list_node* node ) { return *static_cast<const T*>(node); }
+}; // intrusive_list<T>
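+
+/* Editorial note (not part of the original sources): an illustrative sketch of the two
+   list flavors defined above; the item types "job" and "worker" are hypothetical.
+
+       struct job : public intrusive_list_node {     // inheritance-based flavor
+           int id = 0;
+       };
+       intrusive_list<job> ready_jobs;
+       job j1, j2;
+       ready_jobs.push_front(j1);                    // no dynamic allocation happens
+       ready_jobs.push_front(j2);
+       for (job& j : ready_jobs) { (void)j.id; }     // iterate the linked items
+       ready_jobs.remove(j1);
+
+       struct worker {                               // member-pointer flavor
+           intrusive_list_node my_node;
+           int index = 0;
+       };
+       memptr_intrusive_list<worker, worker, &worker::my_node> idle_workers;
+*/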
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
+
+#endif /* _TBB_intrusive_list_H */
diff --git a/contrib/libs/tbb/src/tbb/itt_notify.cpp b/contrib/libs/tbb/src/tbb/itt_notify.cpp
new file mode 100644
index 0000000000..0e60579a62
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/itt_notify.cpp
@@ -0,0 +1,69 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#if __TBB_USE_ITT_NOTIFY
+
+#if _WIN32||_WIN64
+ #ifndef UNICODE
+ #define UNICODE
+ #endif
+#else
+ #pragma weak dlopen
+ #pragma weak dlsym
+ #pragma weak dlerror
+#endif /* WIN */
+
+#if __TBB_BUILD
+
+extern "C" void ITT_DoOneTimeInitialization();
+#define __itt_init_ittlib_name(x,y) (ITT_DoOneTimeInitialization(), true)
+
+#elif __TBBMALLOC_BUILD
+
+extern "C" void MallocInitializeITT();
+#define __itt_init_ittlib_name(x,y) (MallocInitializeITT(), true)
+
+#else
+#error This file is expected to be used for either TBB or TBB allocator build.
+#endif // __TBB_BUILD
+
+#include "tools_api/ittnotify_static.c"
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+/** This extra proxy method is necessary since __itt_init_lib is declared as static **/
+int __TBB_load_ittnotify() {
+#if !(_WIN32||_WIN64)
+ // tool_api crashes without dlopen, check that it's present. Common case
+ // for lack of dlopen is static binaries, i.e. ones built with -static.
+ if (dlopen == NULL)
+ return 0;
+#endif
+ return __itt_init_ittlib(NULL, // groups for:
+ (__itt_group_id)(__itt_group_sync // prepare/cancel/acquired/releasing
+ | __itt_group_thread // name threads
+ | __itt_group_stitch // stack stitching
+ | __itt_group_structure
+ ));
+}
+
+} //namespace r1
+} //namespace detail
+} // namespace tbb
+
+#endif /* __TBB_USE_ITT_NOTIFY */
diff --git a/contrib/libs/tbb/src/tbb/itt_notify.h b/contrib/libs/tbb/src/tbb/itt_notify.h
new file mode 100644
index 0000000000..9978bcd7cb
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/itt_notify.h
@@ -0,0 +1,114 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef _TBB_ITT_NOTIFY
+#define _TBB_ITT_NOTIFY
+
+#include "oneapi/tbb/detail/_config.h"
+
+#if __TBB_USE_ITT_NOTIFY
+
+#if _WIN32||_WIN64
+ #ifndef UNICODE
+ #define UNICODE
+ #endif
+#endif /* WIN */
+
+#ifndef INTEL_ITTNOTIFY_API_PRIVATE
+#define INTEL_ITTNOTIFY_API_PRIVATE
+#endif
+
+#include "tools_api/ittnotify.h"
+#include "tools_api/legacy/ittnotify.h"
+extern "C" void __itt_fini_ittlib(void);
+
+#if _WIN32||_WIN64
+ #undef _T
+#endif /* WIN */
+
+#endif /* __TBB_USE_ITT_NOTIFY */
+
+#if !ITT_CALLER_NULL
+#define ITT_CALLER_NULL ((__itt_caller)0)
+#endif
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+//! Unicode support
+#if (_WIN32||_WIN64) && !__MINGW32__
+ //! Unicode character type. Always wchar_t on Windows.
+ /** We do not use typedefs from the Windows TCHAR family to keep consistency with the TBB coding style. **/
+ using tchar = wchar_t;
+ //! Standard Windows macro to markup the string literals.
+ #define _T(string_literal) L ## string_literal
+#else /* !WIN */
+ using tchar = char;
+ //! Standard Windows style macro to markup the string literals.
+ #define _T(string_literal) string_literal
+#endif /* !WIN */
+
+//! Display names of internal synchronization types
+extern const tchar
+ *SyncType_Scheduler;
+//! Display names of internal synchronization components/scenarios
+extern const tchar
+ *SyncObj_ContextsList
+ ;
+
+#if __TBB_USE_ITT_NOTIFY
+// const_cast<void*>() is necessary to cast off volatility
+#define ITT_NOTIFY(name,obj) __itt_##name(const_cast<void*>(static_cast<volatile void*>(obj)))
+#define ITT_THREAD_SET_NAME(name) __itt_thread_set_name(name)
+#define ITT_FINI_ITTLIB() __itt_fini_ittlib()
+#define ITT_SYNC_CREATE(obj, type, name) __itt_sync_create((void*)(obj), type, name, 2)
+#define ITT_STACK_CREATE(obj) obj = __itt_stack_caller_create()
+#define ITT_STACK_DESTROY(obj) (obj!=nullptr) ? __itt_stack_caller_destroy(static_cast<__itt_caller>(obj)) : ((void)0)
+#define ITT_CALLEE_ENTER(cond, t, obj) if(cond) {\
+ __itt_stack_callee_enter(static_cast<__itt_caller>(obj));\
+ __itt_sync_acquired(t);\
+ }
+#define ITT_CALLEE_LEAVE(cond, obj) (cond) ? __itt_stack_callee_leave(static_cast<__itt_caller>(obj)) : ((void)0)
+
+#define ITT_TASK_GROUP(obj,name,parent) r1::itt_make_task_group(d1::ITT_DOMAIN_MAIN,(void*)(obj),ALGORITHM,(void*)(parent),(parent!=nullptr) ? ALGORITHM : FLOW_NULL,name)
+#define ITT_TASK_BEGIN(obj,name,id) r1::itt_task_begin(d1::ITT_DOMAIN_MAIN,(void*)(id),ALGORITHM,(void*)(obj),ALGORITHM,name)
+#define ITT_TASK_END r1::itt_task_end(d1::ITT_DOMAIN_MAIN)
+
+
+#else /* !__TBB_USE_ITT_NOTIFY */
+
+#define ITT_NOTIFY(name,obj) ((void)0)
+#define ITT_THREAD_SET_NAME(name) ((void)0)
+#define ITT_FINI_ITTLIB() ((void)0)
+#define ITT_SYNC_CREATE(obj, type, name) ((void)0)
+#define ITT_STACK_CREATE(obj) ((void)0)
+#define ITT_STACK_DESTROY(obj) ((void)0)
+#define ITT_CALLEE_ENTER(cond, t, obj) ((void)0)
+#define ITT_CALLEE_LEAVE(cond, obj) ((void)0)
+#define ITT_TASK_GROUP(type,name,parent) ((void)0)
+#define ITT_TASK_BEGIN(type,name,id) ((void)0)
+#define ITT_TASK_END ((void)0)
+
+#endif /* !__TBB_USE_ITT_NOTIFY */
+
+int __TBB_load_ittnotify();
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
+
+#endif /* _TBB_ITT_NOTIFY */
diff --git a/contrib/libs/tbb/src/tbb/mailbox.h b/contrib/libs/tbb/src/tbb/mailbox.h
new file mode 100644
index 0000000000..2f49e9b35e
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/mailbox.h
@@ -0,0 +1,249 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef _TBB_mailbox_H
+#define _TBB_mailbox_H
+
+#include "oneapi/tbb/cache_aligned_allocator.h"
+#include "oneapi/tbb/detail/_small_object_pool.h"
+
+#include "arena_slot.h"
+#include "scheduler_common.h"
+
+#include <atomic>
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+struct task_proxy : public d1::task {
+ static const intptr_t pool_bit = 1<<0;
+ static const intptr_t mailbox_bit = 1<<1;
+ static const intptr_t location_mask = pool_bit | mailbox_bit;
+ /* All but two low-order bits represent a (task*).
+ Two low-order bits mean:
+ 1 = proxy is/was/will be in task pool
+ 2 = proxy is/was/will be in mailbox */
+ std::atomic<intptr_t> task_and_tag;
+
+ //! Pointer to next task_proxy in a mailbox
+ std::atomic<task_proxy*> next_in_mailbox;
+
+ //! Mailbox to which this was mailed.
+ mail_outbox* outbox;
+
+ //! Task affinity id which is referenced
+ d1::slot_id slot;
+
+ d1::small_object_allocator allocator;
+
+ //! True if the proxy is stored both in its sender's pool and in the destination mailbox.
+ static bool is_shared ( intptr_t tat ) {
+ return (tat & location_mask) == location_mask;
+ }
+
+ //! Returns a pointer to the encapsulated task or nullptr.
+ static task* task_ptr ( intptr_t tat ) {
+ return (task*)(tat & ~location_mask);
+ }
+
+ //! Returns a pointer to the encapsulated task or nullptr, and frees proxy if necessary.
+ template<intptr_t from_bit>
+ inline task* extract_task () {
+ // __TBB_ASSERT( prefix().extra_state == es_task_proxy, "Normal task misinterpreted as a proxy?" );
+ intptr_t tat = task_and_tag.load(std::memory_order_acquire);
+ __TBB_ASSERT( tat == from_bit || (is_shared(tat) && task_ptr(tat)),
+ "Proxy's tag cannot specify both locations if the proxy "
+ "was retrieved from one of its original locations" );
+ if ( tat != from_bit ) {
+ const intptr_t cleaner_bit = location_mask & ~from_bit;
+ // Attempt to transition the proxy to the "empty" state with
+ // cleaner_bit specifying entity responsible for its eventual freeing.
+ // Explicit cast to void* is to work around a seeming ICC 11.1 bug.
+ if ( task_and_tag.compare_exchange_strong(tat, cleaner_bit) ) {
+ // Successfully grabbed the task, and left new owner with the job of freeing the proxy
+ return task_ptr(tat);
+ }
+ }
+ // Proxied task has already been claimed from another proxy location.
+ __TBB_ASSERT( task_and_tag.load(std::memory_order_relaxed) == from_bit, "Empty proxy cannot contain non-zero task pointer" );
+ return nullptr;
+ }
+
+ virtual task* execute(d1::execution_data&) {
+ __TBB_ASSERT_RELEASE(false, nullptr);
+ return nullptr;
+ }
+ virtual task* cancel(d1::execution_data&) {
+ __TBB_ASSERT_RELEASE(false, nullptr);
+ return nullptr;
+ }
+}; // struct task_proxy
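+
+/* Editorial note (not part of the original sources): an illustrative sketch of the
+   pointer tagging used by task_and_tag. A task pointer is at least 4-byte aligned,
+   so its two low-order bits are free to record which locations still reference the proxy.
+
+       d1::task* t = get_some_task();                              // hypothetical helper
+       intptr_t tat = intptr_t(t) | task_proxy::pool_bit
+                                  | task_proxy::mailbox_bit;       // held by both locations
+       bool shared = task_proxy::is_shared(tat);                   // true
+       d1::task* original = task_proxy::task_ptr(tat);             // tag bits stripped
+*/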
+
+//! Internal representation of mail_outbox, without padding.
+class unpadded_mail_outbox {
+protected:
+ typedef std::atomic<task_proxy*> atomic_proxy_ptr;
+
+ //! Pointer to first task_proxy in mailbox, or nullptr if box is empty.
+ atomic_proxy_ptr my_first;
+
+ //! Pointer to pointer that will point to next item in the queue. Never nullptr.
+ std::atomic<atomic_proxy_ptr*> my_last;
+
+ //! Owner of mailbox is not executing a task, and has drained its own task pool.
+ std::atomic<bool> my_is_idle;
+};
+
+// TODO: - consider moving to arena slot
+//! Class representing where mail is put.
+/** Padded to occupy a cache line. */
+class mail_outbox : padded<unpadded_mail_outbox> {
+
+ task_proxy* internal_pop( isolation_type isolation ) {
+ task_proxy* curr = my_first.load(std::memory_order_acquire);
+ if ( !curr )
+ return nullptr;
+ atomic_proxy_ptr* prev_ptr = &my_first;
+ if ( isolation != no_isolation ) {
+ while ( task_accessor::isolation(*curr) != isolation ) {
+ prev_ptr = &curr->next_in_mailbox;
+ // The next_in_mailbox should be read with acquire to guarantee (*curr) consistency.
+ curr = curr->next_in_mailbox.load(std::memory_order_acquire);
+ if ( !curr )
+ return nullptr;
+ }
+ }
+ // There is a first item in the mailbox. See if there is a second.
+ // The next_in_mailbox should be read with acquire to guarantee (*second) consistency.
+ if ( task_proxy* second = curr->next_in_mailbox.load(std::memory_order_acquire) ) {
+ // There are at least two items, so first item can be popped easily.
+ prev_ptr->store(second, std::memory_order_relaxed);
+ } else {
+ // There is only one item. Some care is required to pop it.
+
+ prev_ptr->store(nullptr, std::memory_order_relaxed);
+ atomic_proxy_ptr* expected = &curr->next_in_mailbox;
+ if ( my_last.compare_exchange_strong( expected, prev_ptr ) ) {
+ // Successfully transitioned mailbox from having one item to having none.
+ __TBB_ASSERT( !curr->next_in_mailbox.load(std::memory_order_relaxed), nullptr);
+ } else {
+ // Some other thread updated my_last but has not filled in curr->next_in_mailbox yet.
+ // Wait until the first item points to the second item.
+ atomic_backoff backoff;
+ // The next_in_mailbox should be read with acquire to guarantee (*second) consistency.
+ while ( !(second = curr->next_in_mailbox.load(std::memory_order_acquire)) ) backoff.pause();
+ prev_ptr->store( second, std::memory_order_relaxed);
+ }
+ }
+ assert_pointer_valid(curr);
+ return curr;
+ }
+public:
+ friend class mail_inbox;
+
+ //! Push task_proxy onto the mailbox queue of another thread.
+ /** Implementation is wait-free. */
+ void push( task_proxy* t ) {
+ assert_pointer_valid(t);
+ t->next_in_mailbox.store(nullptr, std::memory_order_relaxed);
+ atomic_proxy_ptr* const link = my_last.exchange(&t->next_in_mailbox);
+ // Logically, the release fence is not required because the exchange above provides the
+ // release-acquire semantic that guarantees that (*t) will be consistent when another thread
+ // loads the link atomic. However, the C++11 memory model guarantees consistency of (*t) only
+ // when the same atomic is used for synchronization.
+ link->store(t, std::memory_order_release);
+ }
+
+ //! Return true if mailbox is empty
+ bool empty() {
+ return my_first.load(std::memory_order_relaxed) == nullptr;
+ }
+
+ //! Construct *this as a mailbox from zeroed memory.
+ /** Raise assertion if *this is not previously zeroed, or sizeof(*this) is wrong.
+ This method is provided instead of a full constructor since we know the object
+ will be constructed in zeroed memory. */
+ void construct() {
+ __TBB_ASSERT( sizeof(*this)==max_nfs_size, nullptr );
+ __TBB_ASSERT( !my_first.load(std::memory_order_relaxed), nullptr );
+ __TBB_ASSERT( !my_last.load(std::memory_order_relaxed), nullptr );
+ __TBB_ASSERT( !my_is_idle.load(std::memory_order_relaxed), nullptr );
+ my_last = &my_first;
+ suppress_unused_warning(pad);
+ }
+
+ //! Drain the mailbox
+ intptr_t drain() {
+ intptr_t k = 0;
+ // No fences here because other threads have already quit.
+ for( ; task_proxy* t = my_first; ++k ) {
+ my_first.store(t->next_in_mailbox, std::memory_order_relaxed);
+ // cache_aligned_deallocate((char*)t - task_prefix_reservation_size);
+ }
+ return k;
+ }
+
+ //! True if thread that owns this mailbox is looking for work.
+ bool recipient_is_idle() {
+ return my_is_idle.load(std::memory_order_relaxed);
+ }
+}; // class mail_outbox
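+
+/* Editorial note (not part of the original sources): the push() above follows the classic
+   wait-free multi-producer enqueue: atomically swing the tail to the new node's link first,
+   then publish the node through the previous link. A minimal generic sketch of the pattern:
+
+       struct node { std::atomic<node*> next{nullptr}; };
+
+       struct mpsc_queue {
+           std::atomic<node*> first{nullptr};
+           std::atomic<std::atomic<node*>*> last{&first};
+
+           void push(node* n) {                       // wait-free for every producer
+               n->next.store(nullptr, std::memory_order_relaxed);
+               std::atomic<node*>* prev_link = last.exchange(&n->next);
+               prev_link->store(n, std::memory_order_release);   // publish the node
+           }
+       };
+*/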
+
+//! Class representing source of mail.
+class mail_inbox {
+ //! Corresponding sink where mail that we receive will be put.
+ mail_outbox* my_putter;
+public:
+ //! Construct unattached inbox
+ mail_inbox() : my_putter(nullptr) {}
+
+ //! Attach inbox to a corresponding outbox.
+ void attach( mail_outbox& putter ) {
+ my_putter = &putter;
+ }
+ //! Detach inbox from its outbox
+ void detach() {
+ __TBB_ASSERT(my_putter,"not attached");
+ my_putter = nullptr;
+ }
+ //! Get next piece of mail, or nullptr if mailbox is empty.
+ task_proxy* pop( isolation_type isolation ) {
+ return my_putter->internal_pop( isolation );
+ }
+ //! Return true if mailbox is empty
+ bool empty() {
+ return my_putter->empty();
+ }
+ //! Indicate whether thread that reads this mailbox is idle.
+ /** Raises assertion failure if mailbox is redundantly marked as not idle. */
+ void set_is_idle( bool value ) {
+ if( my_putter ) {
+ __TBB_ASSERT( my_putter->my_is_idle.load(std::memory_order_relaxed) || value, "attempt to redundantly mark mailbox as not idle" );
+ my_putter->my_is_idle.store(value, std::memory_order_relaxed);
+ }
+ }
+ //! Return true if the idle state of the mailbox owner matches the given value (an unattached inbox matches any state).
+ bool is_idle_state ( bool value ) const {
+ return !my_putter || my_putter->my_is_idle.load(std::memory_order_relaxed) == value;
+ }
+}; // class mail_inbox
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
+
+#endif /* _TBB_mailbox_H */
diff --git a/contrib/libs/tbb/src/tbb/main.cpp b/contrib/libs/tbb/src/tbb/main.cpp
new file mode 100644
index 0000000000..ec6c98d682
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/main.cpp
@@ -0,0 +1,171 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "oneapi/tbb/detail/_config.h"
+
+#include "main.h"
+#include "governor.h"
+#include "environment.h"
+#include "market.h"
+#include "misc.h"
+#include "itt_notify.h"
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+//------------------------------------------------------------------------
+// Begin shared data layout.
+// The following global data items are mostly read-only after initialization.
+//------------------------------------------------------------------------
+
+//------------------------------------------------------------------------
+// governor data
+basic_tls<thread_data*> governor::theTLS;
+unsigned governor::DefaultNumberOfThreads;
+size_t governor::DefaultPageSize;
+rml::tbb_factory governor::theRMLServerFactory;
+bool governor::UsePrivateRML;
+bool governor::is_rethrow_broken;
+
+//------------------------------------------------------------------------
+// market data
+market* market::theMarket;
+market::global_market_mutex_type market::theMarketMutex;
+
+//------------------------------------------------------------------------
+// context propagation data
+context_state_propagation_mutex_type the_context_state_propagation_mutex;
+std::atomic<uintptr_t> the_context_state_propagation_epoch{};
+
+//------------------------------------------------------------------------
+// One time initialization data
+
+//! Counter of references to global shared resources such as TLS.
+std::atomic<int> __TBB_InitOnce::count{};
+
+std::atomic_flag __TBB_InitOnce::InitializationLock = ATOMIC_FLAG_INIT;
+
+//! Flag that is set to true after one-time initializations are done.
+std::atomic<bool> __TBB_InitOnce::InitializationDone{};
+
+#if __TBB_USE_ITT_NOTIFY
+//! Defined in profiling.cpp
+extern bool ITT_Present;
+void ITT_DoUnsafeOneTimeInitialization();
+#endif
+
+#if !(_WIN32||_WIN64) || __TBB_SOURCE_DIRECTLY_INCLUDED
+static __TBB_InitOnce __TBB_InitOnceHiddenInstance;
+#endif
+
+#if TBB_USE_ASSERT
+std::atomic<int> the_observer_proxy_count;
+
+struct check_observer_proxy_count {
+ ~check_observer_proxy_count() {
+ if (the_observer_proxy_count != 0) {
+ runtime_warning("Leaked %ld observer_proxy objects\n", long(the_observer_proxy_count));
+ }
+ }
+};
+// The proxy count checker shall be defined after __TBB_InitOnceHiddenInstance to check the count
+// after auto termination.
+static check_observer_proxy_count the_check_observer_proxy_count;
+#endif /* TBB_USE_ASSERT */
+
+//------------------------------------------------------------------------
+// __TBB_InitOnce
+//------------------------------------------------------------------------
+
+void __TBB_InitOnce::add_ref() {
+ if( ++count==1 )
+ governor::acquire_resources();
+}
+
+void __TBB_InitOnce::remove_ref() {
+ int k = --count;
+ __TBB_ASSERT(k>=0,"removed __TBB_InitOnce ref that was not added?");
+ if( k==0 ) {
+ governor::release_resources();
+ ITT_FINI_ITTLIB();
+ }
+}
+
+//------------------------------------------------------------------------
+// One-time Initializations
+//------------------------------------------------------------------------
+
+//! Defined in cache_aligned_allocator.cpp
+void initialize_cache_aligned_allocator();
+
+//! Performs thread-safe lazy one-time general TBB initialization.
+void DoOneTimeInitialization() {
+ __TBB_InitOnce::lock();
+ // No fence required for load of InitializationDone, because we are inside a critical section.
+ if( !__TBB_InitOnce::InitializationDone ) {
+ __TBB_InitOnce::add_ref();
+ if( GetBoolEnvironmentVariable("TBB_VERSION") )
+ PrintVersion();
+ bool itt_present = false;
+#if __TBB_USE_ITT_NOTIFY
+ ITT_DoUnsafeOneTimeInitialization();
+ itt_present = ITT_Present;
+#endif /* __TBB_USE_ITT_NOTIFY */
+ initialize_cache_aligned_allocator();
+ governor::initialize_rml_factory();
+ // Force processor groups support detection
+ governor::default_num_threads();
+ // Force OS regular page size detection
+ governor::default_page_size();
+ PrintExtraVersionInfo( "TOOLS SUPPORT", itt_present ? "enabled" : "disabled" );
+ __TBB_InitOnce::InitializationDone = true;
+ }
+ __TBB_InitOnce::unlock();
+}
+
+#if (_WIN32||_WIN64) && !__TBB_SOURCE_DIRECTLY_INCLUDED
+//! Windows "DllMain" that handles startup and shutdown of dynamic library.
+extern "C" bool WINAPI DllMain( HANDLE /*hinstDLL*/, DWORD reason, LPVOID lpvReserved ) {
+ switch( reason ) {
+ case DLL_PROCESS_ATTACH:
+ __TBB_InitOnce::add_ref();
+ break;
+ case DLL_PROCESS_DETACH:
+ // Since DLL_THREAD_DETACH is not called for the main thread, call auto-termination
+ // here as well - but not during process shutdown (due to risk of a deadlock).
+ if ( lpvReserved==NULL ) { // library unload
+ governor::terminate_external_thread();
+ }
+ __TBB_InitOnce::remove_ref();
+ // It is assumed that InitializationDone is not set after DLL_PROCESS_DETACH,
+ // and thus no race on InitializationDone is possible.
+ if ( __TBB_InitOnce::initialization_done() ) {
+ // Remove reference that we added in DoOneTimeInitialization.
+ __TBB_InitOnce::remove_ref();
+ }
+ break;
+ case DLL_THREAD_DETACH:
+ governor::terminate_external_thread();
+ break;
+ }
+ return true;
+}
+#endif /* (_WIN32||_WIN64) && !__TBB_SOURCE_DIRECTLY_INCLUDED */
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
diff --git a/contrib/libs/tbb/src/tbb/main.h b/contrib/libs/tbb/src/tbb/main.h
new file mode 100644
index 0000000000..c6f54bb47b
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/main.h
@@ -0,0 +1,99 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef _TBB_main_H
+#define _TBB_main_H
+
+#include "governor.h"
+
+#include <atomic>
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+void DoOneTimeInitialization();
+
+//------------------------------------------------------------------------
+// __TBB_InitOnce
+//------------------------------------------------------------------------
+
+// TODO (TBB_REVAMP_TODO): consider better names
+//! Class that supports TBB initialization.
+/** It handles acquisition and release of global resources (e.g. TLS) during startup and shutdown,
+ as well as synchronization for DoOneTimeInitialization. */
+class __TBB_InitOnce {
+ friend void DoOneTimeInitialization();
+ friend void ITT_DoUnsafeOneTimeInitialization();
+
+ static std::atomic<int> count;
+
+ //! Platform specific code to acquire resources.
+ static void acquire_resources();
+
+ //! Platform specific code to release resources.
+ static void release_resources();
+
+ //! Specifies whether the one-time initialization has been done.
+ static std::atomic<bool> InitializationDone;
+
+ //! Global initialization lock
+ /** Scenarios are possible when tools interop has to be initialized before the
+ TBB itself. This imposes a requirement that the global initialization lock
+ has to support valid static initialization, and does not issue any tool
+ notifications in any build mode. **/
+ static std::atomic_flag InitializationLock;
+
+public:
+ static void lock() {
+ tbb::detail::atomic_backoff backoff;
+ while( InitializationLock.test_and_set() ) backoff.pause();
+ }
+
+ static void unlock() { InitializationLock.clear(std::memory_order_release); }
+
+ static bool initialization_done() { return InitializationDone.load(std::memory_order_acquire); }
+
+ //! Add initial reference to resources.
+ /** We assume that dynamic loading of the library prevents any other threads
+ from entering the library until this constructor has finished running. **/
+ __TBB_InitOnce() { add_ref(); }
+
+ //! Remove the initial reference to resources.
+ /** This is not necessarily the last reference if other threads are still running. **/
+ ~__TBB_InitOnce() {
+ governor::terminate_external_thread(); // TLS dtor not called for the main thread
+ remove_ref();
+ // We assume that InitializationDone is not set after file-scope destructors
+ // start running, and thus no race on InitializationDone is possible.
+ if ( initialization_done() ) {
+ // Remove an extra reference that was added in DoOneTimeInitialization.
+ remove_ref();
+ }
+ }
+ //! Add reference to resources. If first reference added, acquire the resources.
+ static void add_ref();
+
+ //! Remove reference to resources. If last reference removed, release the resources.
+ static void remove_ref();
+
+}; // class __TBB_InitOnce
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
+
+#endif /* _TBB_main_H */
diff --git a/contrib/libs/tbb/src/tbb/market.cpp b/contrib/libs/tbb/src/tbb/market.cpp
new file mode 100644
index 0000000000..9259eaf588
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/market.cpp
@@ -0,0 +1,640 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "oneapi/tbb/global_control.h" // global_control::active_value
+
+#include "market.h"
+#include "main.h"
+#include "governor.h"
+#include "arena.h"
+#include "thread_data.h"
+#include "itt_notify.h"
+
+#include <cstring> // std::memset()
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+/** This method must be invoked under my_arenas_list_mutex. **/
+arena* market::select_next_arena( arena* hint ) {
+ unsigned next_arena_priority_level = num_priority_levels;
+ if ( hint )
+ next_arena_priority_level = hint->my_priority_level;
+ for ( unsigned idx = 0; idx < next_arena_priority_level; ++idx ) {
+ if ( !my_arenas[idx].empty() )
+ return &*my_arenas[idx].begin();
+ }
+    // Don't change the hint if no arena with a higher priority is found.
+ return hint;
+}
+
+void market::insert_arena_into_list ( arena& a ) {
+ __TBB_ASSERT( a.my_priority_level < num_priority_levels, nullptr );
+ my_arenas[a.my_priority_level].push_front( a );
+ __TBB_ASSERT( !my_next_arena || my_next_arena->my_priority_level < num_priority_levels, nullptr );
+ my_next_arena = select_next_arena( my_next_arena );
+}
+
+void market::remove_arena_from_list ( arena& a ) {
+ __TBB_ASSERT( a.my_priority_level < num_priority_levels, nullptr );
+ my_arenas[a.my_priority_level].remove( a );
+ if ( my_next_arena == &a )
+ my_next_arena = nullptr;
+ my_next_arena = select_next_arena( my_next_arena );
+}
+
+//------------------------------------------------------------------------
+// market
+//------------------------------------------------------------------------
+
+market::market ( unsigned workers_soft_limit, unsigned workers_hard_limit, std::size_t stack_size )
+ : my_num_workers_hard_limit(workers_hard_limit)
+ , my_num_workers_soft_limit(workers_soft_limit)
+ , my_next_arena(nullptr)
+ , my_ref_count(1)
+ , my_stack_size(stack_size)
+ , my_workers_soft_limit_to_report(workers_soft_limit)
+{
+    // Once created, the RML server starts initializing workers, which need the
+    // global market instance to obtain the worker stack size.
+ my_server = governor::create_rml_server( *this );
+ __TBB_ASSERT( my_server, "Failed to create RML server" );
+}
+
+static unsigned calc_workers_soft_limit(unsigned workers_soft_limit, unsigned workers_hard_limit) {
+ if( int soft_limit = market::app_parallelism_limit() )
+ workers_soft_limit = soft_limit-1;
+ else // if user set no limits (yet), use market's parameter
+ workers_soft_limit = max( governor::default_num_threads() - 1, workers_soft_limit );
+ if( workers_soft_limit >= workers_hard_limit )
+ workers_soft_limit = workers_hard_limit-1;
+ return workers_soft_limit;
+}
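+
+// Editorial note (not part of the original sources): a worked example of the computation
+// above, assuming app_parallelism_limit() reflects a user-set limit via global_control.
+// On a machine where governor::default_num_threads() == 8 and workers_hard_limit == 256:
+//  - no user limit, workers_soft_limit == 0  ->  max(8-1, 0) == 7 workers;
+//  - user limit of 4                         ->  4-1 == 3 workers;
+// in both cases the result stays below workers_hard_limit, so no clamping occurs.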
+
+bool market::add_ref_unsafe( global_market_mutex_type::scoped_lock& lock, bool is_public, unsigned workers_requested, std::size_t stack_size ) {
+ market *m = theMarket;
+ if( m ) {
+ ++m->my_ref_count;
+ const unsigned old_public_count = is_public ? m->my_public_ref_count++ : /*any non-zero value*/1;
+ lock.release();
+ if( old_public_count==0 )
+ set_active_num_workers( calc_workers_soft_limit(workers_requested, m->my_num_workers_hard_limit) );
+
+ // do not warn if default number of workers is requested
+ if( workers_requested != governor::default_num_threads()-1 ) {
+ __TBB_ASSERT( skip_soft_limit_warning > workers_requested,
+ "skip_soft_limit_warning must be larger than any valid workers_requested" );
+ unsigned soft_limit_to_report = m->my_workers_soft_limit_to_report.load(std::memory_order_relaxed);
+ if( soft_limit_to_report < workers_requested ) {
+ runtime_warning( "The number of workers is currently limited to %u. "
+ "The request for %u workers is ignored. Further requests for more workers "
+ "will be silently ignored until the limit changes.\n",
+ soft_limit_to_report, workers_requested );
+ // The race is possible when multiple threads report warnings.
+ // We are OK with that, as there are just multiple warnings.
+ unsigned expected_limit = soft_limit_to_report;
+ m->my_workers_soft_limit_to_report.compare_exchange_strong(expected_limit, skip_soft_limit_warning);
+ }
+
+ }
+ if( m->my_stack_size < stack_size )
+ runtime_warning( "Thread stack size has been already set to %u. "
+ "The request for larger stack (%u) cannot be satisfied.\n", m->my_stack_size, stack_size );
+ return true;
+ }
+ return false;
+}
+
+market& market::global_market(bool is_public, unsigned workers_requested, std::size_t stack_size) {
+ global_market_mutex_type::scoped_lock lock( theMarketMutex );
+ if( !market::add_ref_unsafe(lock, is_public, workers_requested, stack_size) ) {
+ // TODO: A lot is done under theMarketMutex locked. Can anything be moved out?
+ if( stack_size == 0 )
+ stack_size = global_control::active_value(global_control::thread_stack_size);
+        // Expect that 4P is suitable for most applications.
+        // Limit to 2P for large thread counts.
+ // TODO: ask RML for max concurrency and possibly correct hard_limit
+ const unsigned factor = governor::default_num_threads()<=128? 4 : 2;
+ // The requested number of threads is intentionally not considered in
+ // computation of the hard limit, in order to separate responsibilities
+ // and avoid complicated interactions between global_control and task_scheduler_init.
+        // The market guarantees that at least 256 threads can be created.
+ const unsigned workers_hard_limit = max(max(factor*governor::default_num_threads(), 256u), app_parallelism_limit());
+ const unsigned workers_soft_limit = calc_workers_soft_limit(workers_requested, workers_hard_limit);
+ // Create the global market instance
+ std::size_t size = sizeof(market);
+ __TBB_ASSERT( __TBB_offsetof(market, my_workers) + sizeof(thread_data*) == sizeof(market),
+ "my_workers must be the last data field of the market class");
+ size += sizeof(thread_data*) * (workers_hard_limit - 1);
+ __TBB_InitOnce::add_ref();
+ void* storage = cache_aligned_allocate(size);
+ std::memset( storage, 0, size );
+ // Initialize and publish global market
+ market* m = new (storage) market( workers_soft_limit, workers_hard_limit, stack_size );
+ if( is_public )
+ m->my_public_ref_count.store(1, std::memory_order_relaxed);
+#if __TBB_SUPPORTS_WORKERS_WAITING_IN_TERMINATE
+ if (market::is_lifetime_control_present()) {
+ ++m->my_public_ref_count;
+ ++m->my_ref_count;
+ }
+#endif // __TBB_SUPPORTS_WORKERS_WAITING_IN_TERMINATE
+ theMarket = m;
+ // This check relies on the fact that for shared RML default_concurrency==max_concurrency
+ if ( !governor::UsePrivateRML && m->my_server->default_concurrency() < workers_soft_limit )
+ runtime_warning( "RML might limit the number of workers to %u while %u is requested.\n"
+ , m->my_server->default_concurrency(), workers_soft_limit );
+ }
+ return *theMarket;
+}
+
+void market::destroy () {
+ this->market::~market(); // qualified to suppress warning
+ cache_aligned_deallocate( this );
+ __TBB_InitOnce::remove_ref();
+}
+
+bool market::release ( bool is_public, bool blocking_terminate ) {
+ market::enforce([this] { return theMarket == this; }, "Global market instance was destroyed prematurely?");
+ bool do_release = false;
+ {
+ global_market_mutex_type::scoped_lock lock( theMarketMutex );
+ if ( blocking_terminate ) {
+ __TBB_ASSERT( is_public, "Only an object with a public reference can request the blocking terminate" );
+ while ( my_public_ref_count.load(std::memory_order_relaxed) == 1 &&
+ my_ref_count.load(std::memory_order_relaxed) > 1 ) {
+ lock.release();
+                // To guarantee that request_close_connection() is called by the last external thread, we need to wait until all
+                // references are released. Re-read my_public_ref_count to limit the wait if new external threads are created.
+                // Theoretically, new private references to the market can be added during the wait, making it potentially
+ // endless.
+ // TODO: revise why the weak scheduler needs market's pointer and try to remove this wait.
+ // Note that the market should know about its schedulers for cancellation/exception/priority propagation,
+ // see e.g. task_group_context::cancel_group_execution()
+ while ( my_public_ref_count.load(std::memory_order_acquire) == 1 &&
+ my_ref_count.load(std::memory_order_acquire) > 1 ) {
+ yield();
+ }
+ lock.acquire( theMarketMutex );
+ }
+ }
+ if ( is_public ) {
+ __TBB_ASSERT( theMarket == this, "Global market instance was destroyed prematurely?" );
+ __TBB_ASSERT( my_public_ref_count.load(std::memory_order_relaxed), NULL );
+ --my_public_ref_count;
+ }
+ if ( --my_ref_count == 0 ) {
+ __TBB_ASSERT( !my_public_ref_count.load(std::memory_order_relaxed), NULL );
+ do_release = true;
+ theMarket = NULL;
+ }
+ }
+ if( do_release ) {
+ __TBB_ASSERT( !my_public_ref_count.load(std::memory_order_relaxed),
+ "No public references remain if we remove the market." );
+ // inform RML that blocking termination is required
+ my_join_workers = blocking_terminate;
+ my_server->request_close_connection();
+ return blocking_terminate;
+ }
+ return false;
+}
+
+int market::update_workers_request() {
+ int old_request = my_num_workers_requested;
+ my_num_workers_requested = min(my_total_demand.load(std::memory_order_relaxed),
+ (int)my_num_workers_soft_limit.load(std::memory_order_relaxed));
+#if __TBB_ENQUEUE_ENFORCED_CONCURRENCY
+ if (my_mandatory_num_requested > 0) {
+ __TBB_ASSERT(my_num_workers_soft_limit.load(std::memory_order_relaxed) == 0, NULL);
+ my_num_workers_requested = 1;
+ }
+#endif
+ update_allotment(my_num_workers_requested);
+ return my_num_workers_requested - old_request;
+}
+
+void market::set_active_num_workers ( unsigned soft_limit ) {
+ market *m;
+
+ {
+ global_market_mutex_type::scoped_lock lock( theMarketMutex );
+ if ( !theMarket )
+ return; // actual value will be used at market creation
+ m = theMarket;
+ if (m->my_num_workers_soft_limit.load(std::memory_order_relaxed) == soft_limit)
+ return;
+ ++m->my_ref_count;
+ }
+    // We now hold a reference to the market (my_ref_count), so it can be used safely.
+
+ int delta = 0;
+ {
+ arenas_list_mutex_type::scoped_lock lock( m->my_arenas_list_mutex );
+ __TBB_ASSERT(soft_limit <= m->my_num_workers_hard_limit, NULL);
+
+#if __TBB_ENQUEUE_ENFORCED_CONCURRENCY
+ arena_list_type* arenas = m->my_arenas;
+
+ if (m->my_num_workers_soft_limit.load(std::memory_order_relaxed) == 0 &&
+ m->my_mandatory_num_requested > 0)
+ {
+ for (unsigned level = 0; level < num_priority_levels; ++level )
+ for (arena_list_type::iterator it = arenas[level].begin(); it != arenas[level].end(); ++it)
+ if (it->my_global_concurrency_mode.load(std::memory_order_relaxed))
+ m->disable_mandatory_concurrency_impl(&*it);
+ }
+ __TBB_ASSERT(m->my_mandatory_num_requested == 0, NULL);
+#endif
+
+ m->my_num_workers_soft_limit.store(soft_limit, std::memory_order_release);
+ // report only once after new soft limit value is set
+ m->my_workers_soft_limit_to_report.store(soft_limit, std::memory_order_relaxed);
+
+#if __TBB_ENQUEUE_ENFORCED_CONCURRENCY
+ if (m->my_num_workers_soft_limit.load(std::memory_order_relaxed) == 0) {
+ for (unsigned level = 0; level < num_priority_levels; ++level )
+ for (arena_list_type::iterator it = arenas[level].begin(); it != arenas[level].end(); ++it)
+ if (it->has_enqueued_tasks())
+ m->enable_mandatory_concurrency_impl(&*it);
+ }
+#endif
+
+ delta = m->update_workers_request();
+ }
+ // adjust_job_count_estimate must be called outside of any locks
+ if( delta!=0 )
+ m->my_server->adjust_job_count_estimate( delta );
+ // release internal market reference to match ++m->my_ref_count above
+ m->release( /*is_public=*/false, /*blocking_terminate=*/false );
+}
+
+bool governor::does_client_join_workers (const rml::tbb_client &client) {
+ return ((const market&)client).must_join_workers();
+}
+
+arena* market::create_arena ( int num_slots, int num_reserved_slots, unsigned arena_priority_level,
+ std::size_t stack_size )
+{
+ __TBB_ASSERT( num_slots > 0, NULL );
+ __TBB_ASSERT( num_reserved_slots <= num_slots, NULL );
+ // Add public market reference for an external thread/task_arena (that adds an internal reference in exchange).
+ market &m = global_market( /*is_public=*/true, num_slots-num_reserved_slots, stack_size );
+ arena& a = arena::allocate_arena( m, num_slots, num_reserved_slots, arena_priority_level );
+ // Add newly created arena into the existing market's list.
+ arenas_list_mutex_type::scoped_lock lock(m.my_arenas_list_mutex);
+ m.insert_arena_into_list(a);
+ return &a;
+}
+
+/** This method must be invoked under my_arenas_list_mutex. **/
+void market::detach_arena ( arena& a ) {
+ market::enforce([this] { return theMarket == this; }, "Global market instance was destroyed prematurely?");
+ __TBB_ASSERT( !a.my_slots[0].is_occupied(), NULL );
+ if (a.my_global_concurrency_mode.load(std::memory_order_relaxed))
+ disable_mandatory_concurrency_impl(&a);
+
+ remove_arena_from_list(a);
+ if (a.my_aba_epoch == my_arenas_aba_epoch.load(std::memory_order_relaxed)) {
+ my_arenas_aba_epoch.store(my_arenas_aba_epoch.load(std::memory_order_relaxed) + 1, std::memory_order_relaxed);
+ }
+}
+
+void market::try_destroy_arena ( arena* a, uintptr_t aba_epoch, unsigned priority_level ) {
+ bool locked = true;
+ __TBB_ASSERT( a, NULL );
+    // We hold a reference to the market, so it cannot be destroyed at any moment here.
+ market::enforce([this] { return theMarket == this; }, NULL);
+ __TBB_ASSERT( my_ref_count!=0, NULL );
+ my_arenas_list_mutex.lock();
+ arena_list_type::iterator it = my_arenas[priority_level].begin();
+ for ( ; it != my_arenas[priority_level].end(); ++it ) {
+ if ( a == &*it ) {
+ if ( it->my_aba_epoch == aba_epoch ) {
+ // Arena is alive
+ if ( !a->my_num_workers_requested && !a->my_references.load(std::memory_order_relaxed) ) {
+ __TBB_ASSERT(
+ !a->my_num_workers_allotted.load(std::memory_order_relaxed) &&
+ (a->my_pool_state == arena::SNAPSHOT_EMPTY || !a->my_max_num_workers),
+ "Inconsistent arena state"
+ );
+ // Arena is abandoned. Destroy it.
+ detach_arena( *a );
+ my_arenas_list_mutex.unlock();
+ locked = false;
+ a->free_arena();
+ }
+ }
+ if (locked)
+ my_arenas_list_mutex.unlock();
+ return;
+ }
+ }
+ my_arenas_list_mutex.unlock();
+}
+
+/** This method must be invoked under my_arenas_list_mutex. **/
+arena* market::arena_in_need ( arena_list_type* arenas, arena* hint ) {
+    // TODO: make sure an arena with higher priority is returned only if it has available slots.
+ hint = select_next_arena( hint );
+ if ( !hint )
+ return nullptr;
+ arena_list_type::iterator it = hint;
+ unsigned curr_priority_level = hint->my_priority_level;
+ __TBB_ASSERT( it != arenas[curr_priority_level].end(), nullptr );
+ do {
+ arena& a = *it;
+ if ( ++it == arenas[curr_priority_level].end() ) {
+ do {
+ ++curr_priority_level %= num_priority_levels;
+ } while ( arenas[curr_priority_level].empty() );
+ it = arenas[curr_priority_level].begin();
+ }
+ if( a.num_workers_active() < a.my_num_workers_allotted.load(std::memory_order_relaxed) ) {
+ a.my_references += arena::ref_worker;
+ return &a;
+ }
+ } while ( it != hint );
+ return nullptr;
+}
+
+arena* market::arena_in_need(arena* prev) {
+ if (my_total_demand.load(std::memory_order_acquire) <= 0)
+ return nullptr;
+ arenas_list_mutex_type::scoped_lock lock(my_arenas_list_mutex, /*is_writer=*/false);
+ // TODO: introduce three state response: alive, not_alive, no_market_arenas
+ if ( is_arena_alive(prev) )
+ return arena_in_need(my_arenas, prev);
+ return arena_in_need(my_arenas, my_next_arena);
+}
+
+int market::update_allotment ( arena_list_type* arenas, int workers_demand, int max_workers ) {
+ __TBB_ASSERT( workers_demand > 0, nullptr );
+ max_workers = min(workers_demand, max_workers);
+ int unassigned_workers = max_workers;
+ int assigned = 0;
+ int carry = 0;
+ unsigned max_priority_level = num_priority_levels;
+ for (unsigned list_idx = 0; list_idx < num_priority_levels; ++list_idx ) {
+ int assigned_per_priority = min(my_priority_level_demand[list_idx], unassigned_workers);
+ unassigned_workers -= assigned_per_priority;
+ for (arena_list_type::iterator it = arenas[list_idx].begin(); it != arenas[list_idx].end(); ++it) {
+ arena& a = *it;
+ __TBB_ASSERT(a.my_num_workers_requested >= 0, nullptr);
+ __TBB_ASSERT(a.my_num_workers_requested <= int(a.my_max_num_workers)
+ || (a.my_max_num_workers == 0 && a.my_local_concurrency_requests > 0 && a.my_num_workers_requested == 1), nullptr);
+ if (a.my_num_workers_requested == 0) {
+ __TBB_ASSERT(!a.my_num_workers_allotted.load(std::memory_order_relaxed), nullptr);
+ continue;
+ }
+
+ if (max_priority_level == num_priority_levels) {
+ max_priority_level = list_idx;
+ }
+
+ int allotted = 0;
+#if __TBB_ENQUEUE_ENFORCED_CONCURRENCY
+ if (my_num_workers_soft_limit.load(std::memory_order_relaxed) == 0) {
+ __TBB_ASSERT(max_workers == 0 || max_workers == 1, nullptr);
+ allotted = a.my_global_concurrency_mode.load(std::memory_order_relaxed) &&
+ assigned < max_workers ? 1 : 0;
+ } else
+#endif
+ {
+ int tmp = a.my_num_workers_requested * assigned_per_priority + carry;
+ allotted = tmp / my_priority_level_demand[list_idx];
+ carry = tmp % my_priority_level_demand[list_idx];
+ __TBB_ASSERT(allotted <= a.my_num_workers_requested, nullptr);
+ __TBB_ASSERT(allotted <= int(a.my_num_slots - a.my_num_reserved_slots), nullptr);
+ }
+ a.my_num_workers_allotted.store(allotted, std::memory_order_relaxed);
+ a.my_is_top_priority.store(list_idx == max_priority_level, std::memory_order_relaxed);
+ assigned += allotted;
+ }
+ }
+ __TBB_ASSERT( 0 <= assigned && assigned <= max_workers, nullptr );
+ return assigned;
+}
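+
+/* Editorial note (not part of the original sources): a worked example of the proportional
+   split with carry above. Suppose one priority level has my_priority_level_demand == 7,
+   arena A requests 4 workers, arena B requests 3, and assigned_per_priority == 5:
+       A: tmp = 4*5 + 0 = 20;  allotted = 20/7 = 2;  carry = 6
+       B: tmp = 3*5 + 6 = 21;  allotted = 21/7 = 3;  carry = 0
+   The carry makes the integer shares add up to exactly assigned_per_priority (2 + 3 == 5),
+   while each arena still gets a share proportional to its request. */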
+
+/** This method must be invoked under my_arenas_list_mutex. **/
+bool market::is_arena_in_list( arena_list_type &arenas, arena *a ) {
+ __TBB_ASSERT( a, "Expected non-null pointer to arena." );
+ for ( arena_list_type::iterator it = arenas.begin(); it != arenas.end(); ++it )
+ if ( a == &*it )
+ return true;
+ return false;
+}
+
+/** This method must be invoked under my_arenas_list_mutex. **/
+bool market::is_arena_alive(arena* a) {
+ if ( !a )
+ return false;
+
+ // Still cannot access internals of the arena since the object itself might be destroyed.
+
+ for ( unsigned idx = 0; idx < num_priority_levels; ++idx ) {
+ if ( is_arena_in_list( my_arenas[idx], a ) )
+ return true;
+ }
+ return false;
+}
+
+#if __TBB_ENQUEUE_ENFORCED_CONCURRENCY
+void market::enable_mandatory_concurrency_impl ( arena *a ) {
+ __TBB_ASSERT(!a->my_global_concurrency_mode.load(std::memory_order_relaxed), NULL);
+ __TBB_ASSERT(my_num_workers_soft_limit.load(std::memory_order_relaxed) == 0, NULL);
+
+ a->my_global_concurrency_mode.store(true, std::memory_order_relaxed);
+ my_mandatory_num_requested++;
+}
+
+void market::enable_mandatory_concurrency ( arena *a ) {
+ int delta = 0;
+ {
+ arenas_list_mutex_type::scoped_lock lock(my_arenas_list_mutex);
+ if (my_num_workers_soft_limit.load(std::memory_order_relaxed) != 0 ||
+ a->my_global_concurrency_mode.load(std::memory_order_relaxed))
+ return;
+
+ enable_mandatory_concurrency_impl(a);
+ delta = update_workers_request();
+ }
+
+ if (delta != 0)
+ my_server->adjust_job_count_estimate(delta);
+}
+
+void market::disable_mandatory_concurrency_impl(arena* a) {
+ __TBB_ASSERT(a->my_global_concurrency_mode.load(std::memory_order_relaxed), NULL);
+ __TBB_ASSERT(my_mandatory_num_requested > 0, NULL);
+
+ a->my_global_concurrency_mode.store(false, std::memory_order_relaxed);
+ my_mandatory_num_requested--;
+}
+
+void market::mandatory_concurrency_disable ( arena *a ) {
+ int delta = 0;
+ {
+ arenas_list_mutex_type::scoped_lock lock(my_arenas_list_mutex);
+ if (!a->my_global_concurrency_mode.load(std::memory_order_relaxed))
+ return;
+        // There is a racy window in advertise_new_work between enabling mandatory concurrency and
+        // setting SNAPSHOT_FULL. It gives a spawn request a chance to disable mandatory concurrency.
+        // Therefore, we double-check that there are no enqueued tasks.
+ if (a->has_enqueued_tasks())
+ return;
+
+ __TBB_ASSERT(my_num_workers_soft_limit.load(std::memory_order_relaxed) == 0, NULL);
+ disable_mandatory_concurrency_impl(a);
+
+ delta = update_workers_request();
+ }
+ if (delta != 0)
+ my_server->adjust_job_count_estimate(delta);
+}
+#endif /* __TBB_ENQUEUE_ENFORCED_CONCURRENCY */
+
+void market::adjust_demand ( arena& a, int delta, bool mandatory ) {
+ if (!delta) {
+ return;
+ }
+ int target_epoch{};
+ {
+ arenas_list_mutex_type::scoped_lock lock(my_arenas_list_mutex);
+ __TBB_ASSERT(theMarket != nullptr, "market instance was destroyed prematurely?");
+#if __TBB_ENQUEUE_ENFORCED_CONCURRENCY
+ if (mandatory) {
+ __TBB_ASSERT(delta == 1 || delta == -1, nullptr);
+ // Count the number of mandatory requests and proceed only for 0->1 and 1->0 transitions.
+ a.my_local_concurrency_requests += delta;
+ if ((delta > 0 && a.my_local_concurrency_requests != 1) ||
+ (delta < 0 && a.my_local_concurrency_requests != 0))
+ {
+ return;
+ }
+ }
+#endif
+ a.my_total_num_workers_requested += delta;
+ int target_workers = 0;
+ // Cap target_workers into interval [0, a.my_max_num_workers]
+ if (a.my_total_num_workers_requested > 0) {
+#if __TBB_ENQUEUE_ENFORCED_CONCURRENCY
+            // At least one thread should be requested when mandatory concurrency is enabled.
+ int max_num_workers = int(a.my_max_num_workers);
+ if (a.my_local_concurrency_requests > 0 && max_num_workers == 0) {
+ max_num_workers = 1;
+ }
+#endif
+ target_workers = min(a.my_total_num_workers_requested, max_num_workers);
+ }
+
+ delta = target_workers - a.my_num_workers_requested;
+
+ if (delta == 0) {
+ return;
+ }
+
+ a.my_num_workers_requested += delta;
+ if (a.my_num_workers_requested == 0) {
+ a.my_num_workers_allotted.store(0, std::memory_order_relaxed);
+ }
+
+ int total_demand = my_total_demand.load(std::memory_order_relaxed) + delta;
+ my_total_demand.store(total_demand, std::memory_order_relaxed);
+ my_priority_level_demand[a.my_priority_level] += delta;
+ unsigned effective_soft_limit = my_num_workers_soft_limit.load(std::memory_order_relaxed);
+ if (my_mandatory_num_requested > 0) {
+ __TBB_ASSERT(effective_soft_limit == 0, NULL);
+ effective_soft_limit = 1;
+ }
+
+ update_allotment(effective_soft_limit);
+ if (delta > 0) {
+            // We can't overflow soft_limit, but remember the values requested by arenas in
+            // my_total_demand so that workers are not prematurely released to RML.
+ if (my_num_workers_requested + delta > (int)effective_soft_limit)
+ delta = effective_soft_limit - my_num_workers_requested;
+ }
+ else {
+ // the number of workers should not be decreased below my_total_demand
+ if (my_num_workers_requested + delta < total_demand)
+ delta = min(total_demand, (int)effective_soft_limit) - my_num_workers_requested;
+ }
+ my_num_workers_requested += delta;
+ __TBB_ASSERT(my_num_workers_requested <= (int)effective_soft_limit, NULL);
+
+ target_epoch = my_adjust_demand_target_epoch++;
+ }
+
+ spin_wait_until_eq(my_adjust_demand_current_epoch, target_epoch);
+ // Must be called outside of any locks
+ my_server->adjust_job_count_estimate( delta );
+ my_adjust_demand_current_epoch.store(target_epoch + 1, std::memory_order_release);
+}
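+
+/* Editorial note (not part of the original sources): the epoch pair above works like a
+   ticket lock that orders calls to adjust_job_count_estimate() without holding
+   my_arenas_list_mutex across them: each caller takes a ticket (target_epoch) while the
+   mutex is held, waits until my_adjust_demand_current_epoch reaches that ticket, performs
+   the RML call, and then passes the turn to the next ticket holder. This guarantees that
+   RML observes the deltas in the same order in which they were computed under the lock. */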
+
+void market::process( job& j ) {
+ thread_data& td = static_cast<thread_data&>(j);
+ // td.my_arena can be dead. Don't access it until arena_in_need is called
+ arena *a = td.my_arena;
+ for (int i = 0; i < 2; ++i) {
+ while ( (a = arena_in_need(a)) ) {
+ a->process(td);
+ }
+        // Workers leave the market because there is no arena in need. This can happen before
+        // adjust_job_count_estimate() decreases my_slack and RML puts this thread to sleep.
+        // It might result in a busy loop that checks for my_slack<0 and calls this method immediately.
+        // The yield refines this spinning.
+ if ( !i ) {
+ yield();
+ }
+ }
+}
+
+void market::cleanup( job& j) {
+ market::enforce([this] { return theMarket != this; }, NULL );
+ governor::auto_terminate(&j);
+}
+
+void market::acknowledge_close_connection() {
+ destroy();
+}
+
+::rml::job* market::create_one_job() {
+ unsigned short index = ++my_first_unused_worker_idx;
+ __TBB_ASSERT( index > 0, NULL );
+ ITT_THREAD_SET_NAME(_T("TBB Worker Thread"));
+ // index serves as a hint decreasing conflicts between workers when they migrate between arenas
+ thread_data* td = new(cache_aligned_allocate(sizeof(thread_data))) thread_data{ index, true };
+ __TBB_ASSERT( index <= my_num_workers_hard_limit, NULL );
+ __TBB_ASSERT( my_workers[index - 1] == nullptr, NULL );
+ my_workers[index - 1] = td;
+ return td;
+}
+
+void market::add_external_thread(thread_data& td) {
+ context_state_propagation_mutex_type::scoped_lock lock(the_context_state_propagation_mutex);
+ my_masters.push_front(td);
+}
+
+void market::remove_external_thread(thread_data& td) {
+ context_state_propagation_mutex_type::scoped_lock lock(the_context_state_propagation_mutex);
+ my_masters.remove(td);
+}
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
diff --git a/contrib/libs/tbb/src/tbb/market.h b/contrib/libs/tbb/src/tbb/market.h
new file mode 100644
index 0000000000..8443467447
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/market.h
@@ -0,0 +1,317 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef _TBB_market_H
+#define _TBB_market_H
+
+#include "scheduler_common.h"
+#include "concurrent_monitor.h"
+#include "intrusive_list.h"
+#include "rml_tbb.h"
+
+#include "oneapi/tbb/spin_rw_mutex.h"
+#include "oneapi/tbb/task_group.h"
+
+#include <atomic>
+
+#if defined(_MSC_VER) && defined(_Wp64)
+ // Workaround for overzealous compiler warnings in /Wp64 mode
+ #pragma warning (push)
+ #pragma warning (disable: 4244)
+#endif
+
+namespace tbb {
+namespace detail {
+
+#if __TBB_SUPPORTS_WORKERS_WAITING_IN_TERMINATE
+namespace d1 {
+class task_scheduler_handle;
+}
+#endif
+
+namespace r1 {
+
+class task_arena_base;
+class task_group_context;
+
+//------------------------------------------------------------------------
+// Class market
+//------------------------------------------------------------------------
+
+class market : no_copy, rml::tbb_client {
+ friend class arena;
+ friend class task_arena_base;
+ template<typename SchedulerTraits> friend class custom_scheduler;
+ friend class task_group_context;
+ friend class governor;
+#if __TBB_SUPPORTS_WORKERS_WAITING_IN_TERMINATE
+ friend class lifetime_control;
+#endif
+
+public:
+    //! Keys for the arena map array. The lower the value, the higher the priority of the arena list.
+ static constexpr unsigned num_priority_levels = 3;
+
+private:
+ friend void ITT_DoUnsafeOneTimeInitialization ();
+#if __TBB_SUPPORTS_WORKERS_WAITING_IN_TERMINATE
+ friend bool finalize_impl(d1::task_scheduler_handle& handle);
+#endif
+
+ typedef intrusive_list<arena> arena_list_type;
+ typedef intrusive_list<thread_data> thread_data_list_type;
+
+ //! Currently active global market
+ static market* theMarket;
+
+ typedef scheduler_mutex_type global_market_mutex_type;
+
+ //! Mutex guarding creation/destruction of theMarket, insertions/deletions in my_arenas, and cancellation propagation
+ static global_market_mutex_type theMarketMutex;
+
+ //! Lightweight mutex guarding accounting operations with arenas list
+ typedef spin_rw_mutex arenas_list_mutex_type;
+ // TODO: introduce fine-grained (per priority list) locking of arenas.
+ arenas_list_mutex_type my_arenas_list_mutex;
+
+ //! Pointer to the RML server object that services this TBB instance.
+ rml::tbb_server* my_server;
+
+ //! Waiting object for external and coroutine waiters.
+ extended_concurrent_monitor my_sleep_monitor;
+
+ //! Maximal number of workers allowed for use by the underlying resource manager
+ /** It can't be changed after market creation. **/
+ unsigned my_num_workers_hard_limit;
+
+ //! Current application-imposed limit on the number of workers (see set_active_num_workers())
+ /** It can't be more than my_num_workers_hard_limit. **/
+ std::atomic<unsigned> my_num_workers_soft_limit;
+
+ //! Number of workers currently requested from RML
+ int my_num_workers_requested;
+
+ //! The target serialization epoch for callers of adjust_job_count_estimate
+ int my_adjust_demand_target_epoch;
+
+ //! The current serialization epoch for callers of adjust_job_count_estimate
+ std::atomic<int> my_adjust_demand_current_epoch;
+
+ //! First unused index of worker
+    /** Used to assign indices to new workers coming from RML, and marks the busy part
+        of the my_workers array. **/
+ std::atomic<unsigned> my_first_unused_worker_idx;
+
+ //! Number of workers that were requested by all arenas on all priority levels
+ std::atomic<int> my_total_demand;
+
+ //! Number of workers that were requested by arenas per single priority list item
+ int my_priority_level_demand[num_priority_levels];
+
+#if __TBB_ENQUEUE_ENFORCED_CONCURRENCY
+ //! How many times mandatory concurrency was requested from the market
+ int my_mandatory_num_requested;
+#endif
+
+ //! Per priority list of registered arenas
+ arena_list_type my_arenas[num_priority_levels];
+
+    //! The first arena to be checked when an idle worker seeks an arena to enter
+ /** The check happens in round-robin fashion. **/
+ arena *my_next_arena;
+
+ //! ABA prevention marker to assign to newly created arenas
+ std::atomic<uintptr_t> my_arenas_aba_epoch;
+
+ //! Reference count controlling market object lifetime
+ std::atomic<unsigned> my_ref_count;
+
+ //! Count of external threads attached
+ std::atomic<unsigned> my_public_ref_count;
+
+ //! Stack size of worker threads
+ std::size_t my_stack_size;
+
+ //! Shutdown mode
+ bool my_join_workers;
+
+ //! The value indicating that the soft limit warning is unnecessary
+ static const unsigned skip_soft_limit_warning = ~0U;
+
+ //! Either workers soft limit to be reported via runtime_warning() or skip_soft_limit_warning
+ std::atomic<unsigned> my_workers_soft_limit_to_report;
+
+ //! Constructor
+ market ( unsigned workers_soft_limit, unsigned workers_hard_limit, std::size_t stack_size );
+
+ //! Destroys and deallocates market object created by market::create()
+ void destroy ();
+
+ //! Recalculates the number of workers requested from RML and updates the allotment.
+ int update_workers_request();
+
+ //! Recalculates the number of workers assigned to each arena in the list.
+ /** The actual number of workers servicing a particular arena may temporarily
+ deviate from the calculated value. **/
+ void update_allotment (unsigned effective_soft_limit) {
+ int total_demand = my_total_demand.load(std::memory_order_relaxed);
+ if (total_demand) {
+ update_allotment(my_arenas, total_demand, (int)effective_soft_limit);
+ }
+ }
+
+ //! Returns next arena that needs more workers, or NULL.
+ arena* arena_in_need(arena* prev);
+
+ template <typename Pred>
+ static void enforce (Pred pred, const char* msg) {
+ suppress_unused_warning(pred, msg);
+#if TBB_USE_ASSERT
+ global_market_mutex_type::scoped_lock lock(theMarketMutex);
+ __TBB_ASSERT(pred(), msg);
+#endif
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Helpers to unify code branches dependent on priority feature presence
+
+ arena* select_next_arena( arena* hint );
+
+ void insert_arena_into_list ( arena& a );
+
+ void remove_arena_from_list ( arena& a );
+
+ arena* arena_in_need ( arena_list_type* arenas, arena* hint );
+
+ int update_allotment ( arena_list_type* arenas, int total_demand, int max_workers );
+
+ bool is_arena_in_list( arena_list_type& arenas, arena* a );
+
+ bool is_arena_alive( arena* a );
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Implementation of rml::tbb_client interface methods
+
+ version_type version () const override { return 0; }
+
+ unsigned max_job_count () const override { return my_num_workers_hard_limit; }
+
+ std::size_t min_stack_size () const override { return worker_stack_size(); }
+
+ job* create_one_job () override;
+
+ void cleanup( job& j ) override;
+
+ void acknowledge_close_connection () override;
+
+ void process( job& j ) override;
+
+public:
+ //! Factory method creating new market object
+ static market& global_market( bool is_public, unsigned max_num_workers = 0, std::size_t stack_size = 0 );
+
+ //! Add reference to market if theMarket exists
+ static bool add_ref_unsafe( global_market_mutex_type::scoped_lock& lock, bool is_public, unsigned max_num_workers = 0, std::size_t stack_size = 0 );
+
+ //! Creates an arena object
+    /** If necessary, also creates the global market instance and boosts its ref count.
+        Each call to create_arena() must be matched by a call to arena::free_arena(). **/
+    static arena* create_arena ( int num_slots, int num_reserved_slots,
+                                 unsigned arena_priority_level, std::size_t stack_size );
+
+ //! Removes the arena from the market's list
+    void try_destroy_arena ( arena*, uintptr_t aba_epoch, unsigned priority_level );
+
+ //! Removes the arena from the market's list
+ void detach_arena ( arena& );
+
+ //! Decrements market's refcount and destroys it in the end
+ bool release ( bool is_public, bool blocking_terminate );
+
+ //! Return wait list
+ extended_concurrent_monitor& get_wait_list() { return my_sleep_monitor; }
+
+#if __TBB_ENQUEUE_ENFORCED_CONCURRENCY
+    //! Implementation of mandatory concurrency enabling
+ void enable_mandatory_concurrency_impl ( arena *a );
+
+ //! Inform the external thread that there is an arena with mandatory concurrency
+ void enable_mandatory_concurrency ( arena *a );
+
+    //! Implementation of mandatory concurrency disabling
+    void disable_mandatory_concurrency_impl(arena* a);
+
+    //! Inform the external thread that the arena is no longer interested in mandatory concurrency
+    void mandatory_concurrency_disable ( arena *a );
+#endif /* __TBB_ENQUEUE_ENFORCED_CONCURRENCY */
+
+    //! Request that the arena's demand for workers be adjusted.
+ /** Concurrent invocations are possible only on behalf of different arenas. **/
+ void adjust_demand ( arena&, int delta, bool mandatory );
+
+ //! Used when RML asks for join mode during workers termination.
+ bool must_join_workers () const { return my_join_workers; }
+
+ //! Returns the requested stack size of worker threads.
+ std::size_t worker_stack_size () const { return my_stack_size; }
+
+ //! Set number of active workers
+ static void set_active_num_workers( unsigned w );
+
+ //! Reports active parallelism level according to user's settings
+ static unsigned app_parallelism_limit();
+
+#if __TBB_SUPPORTS_WORKERS_WAITING_IN_TERMINATE
+ //! Reports if any active global lifetime references are present
+ static unsigned is_lifetime_control_present();
+#endif
+
+ //! Finds all contexts affected by the state change and propagates the new state to them.
+ /** The propagation is relayed to the market because tasks created by one
+ external thread can be passed to and executed by other external threads. This means
+ that context trees can span several arenas at once and thus state change
+ propagation cannot be generally localized to one arena only. **/
+ template <typename T>
+ bool propagate_task_group_state (std::atomic<T> d1::task_group_context::*mptr_state, d1::task_group_context& src, T new_state );
+
+ //! List of registered external threads
+ thread_data_list_type my_masters;
+
+ //! Array of pointers to the registered workers
+ /** Used by cancellation propagation mechanism.
+ Must be the last data member of the class market. **/
+ thread_data* my_workers[1];
+
+ static unsigned max_num_workers() {
+ global_market_mutex_type::scoped_lock lock( theMarketMutex );
+ return theMarket? theMarket->my_num_workers_hard_limit : 0;
+ }
+
+ void add_external_thread(thread_data& td);
+
+ void remove_external_thread(thread_data& td);
+}; // class market
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
+
+#if defined(_MSC_VER) && defined(_Wp64)
+ // Workaround for overzealous compiler warnings in /Wp64 mode
+ #pragma warning (pop)
+#endif // warning 4244 is back
+
+#endif /* _TBB_market_H */
diff --git a/contrib/libs/tbb/src/tbb/misc.cpp b/contrib/libs/tbb/src/tbb/misc.cpp
new file mode 100644
index 0000000000..0e1d33a596
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/misc.cpp
@@ -0,0 +1,137 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+// Source file for miscellaneous entities that are infrequently referenced by
+// an executing program.
+
+#include "oneapi/tbb/detail/_exception.h"
+#include "oneapi/tbb/detail/_machine.h"
+
+#include "oneapi/tbb/version.h"
+
+#include "misc.h"
+#include "governor.h"
+#include "assert_impl.h" // Out-of-line TBB assertion handling routines are instantiated here.
+
+#include <cstdio>
+#include <cstdlib>
+#include <stdexcept>
+#include <cstring>
+#include <cstdarg>
+
+#if _WIN32||_WIN64
+#include <windows.h>
+#endif
+
+#if !_WIN32
+#include <unistd.h> // sysconf(_SC_PAGESIZE)
+#endif
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+//------------------------------------------------------------------------
+// governor data
+//------------------------------------------------------------------------
+cpu_features_type governor::cpu_features;
+
+
+size_t DefaultSystemPageSize() {
+#if _WIN32
+ SYSTEM_INFO si;
+ GetSystemInfo(&si);
+ return si.dwPageSize;
+#else
+ return sysconf(_SC_PAGESIZE);
+#endif
+}
+
+/** The leading "\0" is here so that applying "strings" to the binary delivers a clean result. */
+static const char VersionString[] = "\0" TBB_VERSION_STRINGS;
+
+static bool PrintVersionFlag = false;
+
+void PrintVersion() {
+ PrintVersionFlag = true;
+ std::fputs(VersionString+1,stderr);
+}
+
+void PrintExtraVersionInfo( const char* category, const char* format, ... ) {
+ if( PrintVersionFlag ) {
+ char str[1024]; std::memset(str, 0, 1024);
+ va_list args; va_start(args, format);
+ // Note: correct vsnprintf definition obtained from tbb_assert_impl.h
+ std::vsnprintf( str, 1024-1, format, args);
+ va_end(args);
+ std::fprintf(stderr, "oneTBB: %s\t%s\n", category, str );
+ }
+}
+
+//! check for transaction support.
+#if _MSC_VER
+#include <intrin.h> // for __cpuid
+#endif
+
+#if __TBB_x86_32 || __TBB_x86_64
+void check_cpuid(int leaf, int sub_leaf, int registers[4]) {
+#if _MSC_VER
+ __cpuidex(registers, leaf, sub_leaf);
+#else
+ int reg_eax = 0;
+ int reg_ebx = 0;
+ int reg_ecx = 0;
+ int reg_edx = 0;
+#if __TBB_x86_32 && __PIC__
+    // On 32-bit systems with position-independent code, GCC cannot freely clobber the EBX
+    // register (it is reserved for the GOT pointer). We help it by backing EBX up and restoring it.
+ __asm__("mov %%ebx, %%esi\n\t"
+ "cpuid\n\t"
+ "xchg %%ebx, %%esi"
+ : "=a"(reg_eax), "=S"(reg_ebx), "=c"(reg_ecx), "=d"(reg_edx)
+ : "0"(leaf), "2"(sub_leaf) // read value from eax and ecx
+ );
+#else
+ __asm__("cpuid"
+ : "=a"(reg_eax), "=b"(reg_ebx), "=c"(reg_ecx), "=d"(reg_edx)
+ : "0"(leaf), "2"(sub_leaf) // read value from eax and ecx
+ );
+#endif
+ registers[0] = reg_eax;
+ registers[1] = reg_ebx;
+ registers[2] = reg_ecx;
+ registers[3] = reg_edx;
+#endif
+}
+#endif
+
+void detect_cpu_features(cpu_features_type& cpu_features) {
+ suppress_unused_warning(cpu_features);
+#if __TBB_x86_32 || __TBB_x86_64
+ const int rtm_ebx_mask = 1 << 11;
+ const int waitpkg_ecx_mask = 1 << 5;
+ int registers[4] = {0};
+
+ // Check RTM and WAITPKG
+ check_cpuid(7, 0, registers);
+ cpu_features.rtm_enabled = (registers[1] & rtm_ebx_mask) != 0;
+ cpu_features.waitpkg_enabled = (registers[2] & waitpkg_ecx_mask) != 0;
+#endif /* (__TBB_x86_32 || __TBB_x86_64) */
+}
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
diff --git a/contrib/libs/tbb/src/tbb/misc.h b/contrib/libs/tbb/src/tbb/misc.h
new file mode 100644
index 0000000000..6a3cf778a4
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/misc.h
@@ -0,0 +1,289 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef _TBB_tbb_misc_H
+#define _TBB_tbb_misc_H
+
+#include "oneapi/tbb/detail/_config.h"
+#include "oneapi/tbb/detail/_assert.h"
+#include "oneapi/tbb/detail/_utils.h"
+
+#if __TBB_ARENA_BINDING
+#include "oneapi/tbb/info.h"
+#endif /*__TBB_ARENA_BINDING*/
+
+#if __linux__ || __FreeBSD__
+#include <sys/param.h> // __FreeBSD_version
+#if __FreeBSD_version >= 701000
+#include <sys/cpuset.h>
+#endif
+#endif
+
+#include <atomic>
+
+// Does the operating system have a system call to pin a thread to a set of OS processors?
+#define __TBB_OS_AFFINITY_SYSCALL_PRESENT ((__linux__ && !__ANDROID__) || (__FreeBSD_version >= 701000))
+// On IBM* Blue Gene* CNK nodes, the affinity API has restrictions that prevent its usability for TBB,
+// and also sysconf(_SC_NPROCESSORS_ONLN) already takes process affinity into account.
+#define __TBB_USE_OS_AFFINITY_SYSCALL (__TBB_OS_AFFINITY_SYSCALL_PRESENT && !__bg__)
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+void runtime_warning(const char* format, ... );
+
+#if __TBB_ARENA_BINDING
+class task_arena;
+class task_scheduler_observer;
+#endif /*__TBB_ARENA_BINDING*/
+
+const std::size_t MByte = 1024*1024;
+
+#if __TBB_WIN8UI_SUPPORT && (_WIN32_WINNT < 0x0A00)
+// In Win8UI mode (Windows 8 Store* applications), TBB uses a thread creation API
+// that does not allow specifying the stack size.
+// Still, the thread stack size value, either explicit or default, is used by the scheduler.
+// So here we set the default value to match the platform's default of 1MB.
+const std::size_t ThreadStackSize = 1*MByte;
+#else
+const std::size_t ThreadStackSize = (sizeof(uintptr_t) <= 4 ? 2 : 4 )*MByte;
+#endif
+
+#ifndef __TBB_HardwareConcurrency
+
+//! Returns maximal parallelism level supported by the current OS configuration.
+int AvailableHwConcurrency();
+
+#else
+
+inline int AvailableHwConcurrency() {
+ int n = __TBB_HardwareConcurrency();
+ return n > 0 ? n : 1; // Fail safety strap
+}
+#endif /* __TBB_HardwareConcurrency */
+
+//! Returns OS regular memory page size
+size_t DefaultSystemPageSize();
+
+//! Returns number of processor groups in the current OS configuration.
+/** AvailableHwConcurrency must be called at least once before calling this method. **/
+int NumberOfProcessorGroups();
+
+#if _WIN32||_WIN64
+
+//! Retrieves index of processor group containing processor with the given index
+int FindProcessorGroupIndex ( int processorIndex );
+
+//! Affinitizes the thread to the specified processor group
+void MoveThreadIntoProcessorGroup( void* hThread, int groupIndex );
+
+#endif /* _WIN32||_WIN64 */
+
+//! Prints TBB version information on stderr
+void PrintVersion();
+
+//! Prints arbitrary extra TBB version information on stderr
+void PrintExtraVersionInfo( const char* category, const char* format, ... );
+
+//! A callback routine to print RML version information on stderr
+void PrintRMLVersionInfo( void* arg, const char* server_info );
+
+// For TBB compilation only; not to be used in public headers
+#if defined(min) || defined(max)
+#undef min
+#undef max
+#endif
+
+//! Utility template function returning the lesser of two values.
+/** Provided here to avoid including the not strictly safe <algorithm> header.\n
+    If the operands cause signed/unsigned or size mismatch warnings, it is the caller's
+    responsibility to do the appropriate cast before calling the function. **/
+template<typename T>
+T min ( const T& val1, const T& val2 ) {
+ return val1 < val2 ? val1 : val2;
+}
+
+//! Utility template function returning the greater of two values.
+/** Provided here to avoid including the not strictly safe <algorithm> header.\n
+    If the operands cause signed/unsigned or size mismatch warnings, it is the caller's
+    responsibility to do the appropriate cast before calling the function. **/
+template<typename T>
+T max ( const T& val1, const T& val2 ) {
+ return val1 < val2 ? val2 : val1;
+}
+
+//! Utility helper structure to ease overload resolution
+template<int > struct int_to_type {};
+
+//------------------------------------------------------------------------
+// FastRandom
+//------------------------------------------------------------------------
+
+//! A fast random number generator.
+/** Uses linear congruential method. */
+class FastRandom {
+private:
+ unsigned x, c;
+ static const unsigned a = 0x9e3779b1; // a big prime number
+public:
+ //! Get a random number.
+ unsigned short get() {
+ return get(x);
+ }
+ //! Get a random number for the given seed; update the seed for next use.
+ unsigned short get( unsigned& seed ) {
+ unsigned short r = (unsigned short)(seed>>16);
+ __TBB_ASSERT(c&1, "c must be odd for big rng period");
+ seed = seed*a+c;
+ return r;
+ }
+ //! Construct a random number generator.
+ FastRandom( void* unique_ptr ) { init(uintptr_t(unique_ptr)); }
+
+ template <typename T>
+ void init( T seed ) {
+ init(seed,int_to_type<sizeof(seed)>());
+ }
+ void init( uint64_t seed , int_to_type<8> ) {
+ init(uint32_t((seed>>32)+seed), int_to_type<4>());
+ }
+ void init( uint32_t seed, int_to_type<4> ) {
+ // threads use different seeds for unique sequences
+ c = (seed|1)*0xba5703f5; // c must be odd, shuffle by a prime number
+ x = c^(seed>>1); // also shuffle x for the first get() invocation
+ }
+};
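+
+/* Editorial sketch (not part of the original sources): typical usage seeds each generator
+   with a value unique to the caller, e.g. an object address, so that threads produce
+   different sequences; num_slots here is a hypothetical bound:
+
+       FastRandom random( this );                        // the address serves as the seed
+       unsigned victim = random.get() % num_slots;       // e.g. pick a random victim slot
+*/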
+
+//------------------------------------------------------------------------
+// Atomic extensions
+//------------------------------------------------------------------------
+
+//! Atomically replaces the value of dst with newValue while the current and new values satisfy the compare predicate
+/** Return value semantics are the same as for CAS. **/
+template<typename T1, class Pred>
+T1 atomic_update(std::atomic<T1>& dst, T1 newValue, Pred compare) {
+ T1 oldValue = dst.load(std::memory_order_acquire);
+ while ( compare(oldValue, newValue) ) {
+ if ( dst.compare_exchange_strong(oldValue, newValue) )
+ break;
+ }
+ return oldValue;
+}
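+
+/* Editorial sketch (not part of the original sources): with a "less than" predicate,
+   atomic_update() implements a lock-free "store maximum"; current_measurement() is a
+   hypothetical source of new values:
+
+       std::atomic<std::size_t> high_water_mark{0};
+       std::size_t sample = current_measurement();
+       // Replaces high_water_mark with sample only while high_water_mark < sample.
+       atomic_update( high_water_mark, sample,
+                      []( std::size_t current, std::size_t candidate ) { return current < candidate; } );
+*/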
+
+#if __TBB_USE_OS_AFFINITY_SYSCALL
+ #if __linux__
+ typedef cpu_set_t basic_mask_t;
+ #elif __FreeBSD_version >= 701000
+ typedef cpuset_t basic_mask_t;
+ #else
+ #error affinity_helper is not implemented in this OS
+ #endif
+ class affinity_helper : no_copy {
+ basic_mask_t* threadMask;
+ int is_changed;
+ public:
+ affinity_helper() : threadMask(NULL), is_changed(0) {}
+ ~affinity_helper();
+ void protect_affinity_mask( bool restore_process_mask );
+ void dismiss();
+ };
+ void destroy_process_mask();
+#else
+ class affinity_helper : no_copy {
+ public:
+ void protect_affinity_mask( bool ) {}
+ void dismiss() {}
+ };
+ inline void destroy_process_mask(){}
+#endif /* __TBB_USE_OS_AFFINITY_SYSCALL */
+
+struct cpu_features_type {
+ bool rtm_enabled{false};
+ bool waitpkg_enabled{false};
+};
+
+void detect_cpu_features(cpu_features_type& cpu_features);
+
+#if __TBB_ARENA_BINDING
+class binding_handler;
+
+binding_handler* construct_binding_handler(int slot_num, int numa_id, int core_type_id, int max_threads_per_core);
+void destroy_binding_handler(binding_handler* handler_ptr);
+void apply_affinity_mask(binding_handler* handler_ptr, int slot_num);
+void restore_affinity_mask(binding_handler* handler_ptr, int slot_num);
+
+#endif /*__TBB_ARENA_BINDING*/
+
+// RTM specific section
+// abort code for mutexes that detect a conflict with another thread.
+enum {
+ speculation_not_supported = 0x00,
+ speculation_transaction_aborted = 0x01,
+ speculation_can_retry = 0x02,
+ speculation_memadd_conflict = 0x04,
+ speculation_buffer_overflow = 0x08,
+ speculation_breakpoint_hit = 0x10,
+ speculation_nested_abort = 0x20,
+ speculation_xabort_mask = 0xFF000000,
+ speculation_xabort_shift = 24,
+    speculation_xabort_not_free = 0xFF, // This value (0xFF) comes from the Intel(R) 64 and IA-32 Architectures Optimization Reference Manual, 12.4.5, lock not free
+ speculation_successful_begin = 0xFFFFFFFF,
+ speculation_retry = speculation_transaction_aborted
+ | speculation_can_retry
+ | speculation_memadd_conflict
+};
+
+// We suppose that successful transactions are sequentially ordered and
+// do not require additional memory fences around them.
+// Technically, this can be achieved only if xbegin has implicit
+// acquire memory semantics and xend/xabort have release memory semantics at the compiler and hardware level.
+// See the article: https://arxiv.org/pdf/1710.04839.pdf
+static inline unsigned int begin_transaction() {
+#if __TBB_TSX_INTRINSICS_PRESENT
+ return _xbegin();
+#else
+ return speculation_not_supported; // return unsuccessful code
+#endif
+}
+
+static inline void end_transaction() {
+#if __TBB_TSX_INTRINSICS_PRESENT
+ _xend();
+#endif
+}
+
+static inline void abort_transaction() {
+#if __TBB_TSX_INTRINSICS_PRESENT
+ _xabort(speculation_xabort_not_free);
+#endif
+}
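+
+/* Editorial sketch (not part of the original sources): the helpers above support the usual
+   lock-elision loop, roughly as follows; is_locked() and acquire_real_lock() are hypothetical
+   placeholders for the real mutex internals:
+
+       for (;;) {
+           unsigned int status = begin_transaction();
+           if ( status == speculation_successful_begin ) {
+               if ( !is_locked() )
+                   break;               // run the critical section transactionally;
+                                        // end_transaction() later commits it
+               abort_transaction();     // lock is busy: abort; execution resumes at the
+                                        // _xbegin inside begin_transaction() with the abort status
+           } else if ( !(status & speculation_retry) ) {
+               acquire_real_lock();     // speculation is not worth retrying: fall back to the lock
+               break;
+           }
+       }
+*/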
+
+#if TBB_USE_ASSERT
+static inline unsigned char is_in_transaction() {
+#if __TBB_TSX_INTRINSICS_PRESENT
+ return _xtest();
+#else
+ return 0;
+#endif
+}
+#endif // TBB_USE_ASSERT
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
+
+#endif /* _TBB_tbb_misc_H */
diff --git a/contrib/libs/tbb/src/tbb/misc_ex.cpp b/contrib/libs/tbb/src/tbb/misc_ex.cpp
new file mode 100644
index 0000000000..177392bb65
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/misc_ex.cpp
@@ -0,0 +1,398 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+// Source file for miscellaneous entities that are infrequently referenced by
+// an executing program and whose implementation requires dynamic linking.
+
+#include "misc.h"
+
+#if !defined(__TBB_HardwareConcurrency)
+
+#include "dynamic_link.h"
+#include <stdio.h>
+#include <limits.h>
+
+#if _WIN32||_WIN64
+#include <windows.h>
+#if __TBB_WIN8UI_SUPPORT
+#include <thread>
+#endif
+#else
+#include <unistd.h>
+#if __linux__
+#include <sys/sysinfo.h>
+#include <cstring>
+#include <sched.h>
+#include <cerrno>
+#elif __sun
+#include <sys/sysinfo.h>
+#elif __FreeBSD__
+#include <cerrno>
+#include <cstring>
+#include <sys/param.h> // Required by <sys/cpuset.h>
+#include <sys/cpuset.h>
+#endif
+#endif
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+#if __TBB_USE_OS_AFFINITY_SYSCALL
+
+#if __linux__
+// Handlers for interoperation with libiomp
+static int (*libiomp_try_restoring_original_mask)();
+// Table for mapping to libiomp entry points
+static const dynamic_link_descriptor iompLinkTable[] = {
+ DLD_NOWEAK( kmp_set_thread_affinity_mask_initial, libiomp_try_restoring_original_mask )
+};
+#endif
+
+static void set_thread_affinity_mask( std::size_t maskSize, const basic_mask_t* threadMask ) {
+#if __linux__
+ if( sched_setaffinity( 0, maskSize, threadMask ) )
+#else /* FreeBSD */
+ if( cpuset_setaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) )
+#endif
+    // Here and below the error severity is lowered from the critical level
+    // because the failure may happen during TBB library unload, since we do not
+    // wait for workers to complete (current RML policy, to be fixed).
+ // handle_perror( errno, "setaffinity syscall" );
+ runtime_warning( "setaffinity syscall failed" );
+}
+
+static void get_thread_affinity_mask( std::size_t maskSize, basic_mask_t* threadMask ) {
+#if __linux__
+ if( sched_getaffinity( 0, maskSize, threadMask ) )
+#else /* FreeBSD */
+ if( cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) )
+#endif
+ runtime_warning( "getaffinity syscall failed" );
+}
+
+static basic_mask_t* process_mask;
+static int num_masks;
+
+void destroy_process_mask() {
+ if( process_mask ) {
+ delete [] process_mask;
+ }
+}
+
+#define curMaskSize sizeof(basic_mask_t) * num_masks
+affinity_helper::~affinity_helper() {
+ if( threadMask ) {
+ if( is_changed ) {
+ set_thread_affinity_mask( curMaskSize, threadMask );
+ }
+ delete [] threadMask;
+ }
+}
+void affinity_helper::protect_affinity_mask( bool restore_process_mask ) {
+ if( threadMask == NULL && num_masks ) { // TODO: assert num_masks validity?
+ threadMask = new basic_mask_t [num_masks];
+ std::memset( threadMask, 0, curMaskSize );
+ get_thread_affinity_mask( curMaskSize, threadMask );
+ if( restore_process_mask ) {
+ __TBB_ASSERT( process_mask, "A process mask is requested but not yet stored" );
+ is_changed = memcmp( process_mask, threadMask, curMaskSize );
+ if( is_changed )
+ set_thread_affinity_mask( curMaskSize, process_mask );
+ } else {
+ // Assume that the mask will be changed by the caller.
+ is_changed = 1;
+ }
+ }
+}
+void affinity_helper::dismiss() {
+ if( threadMask ) {
+ delete [] threadMask;
+ threadMask = NULL;
+ }
+ is_changed = 0;
+}
+#undef curMaskSize
+
+static std::atomic<do_once_state> hardware_concurrency_info;
+
+static int theNumProcs;
+
+static void initialize_hardware_concurrency_info () {
+ int err;
+ int availableProcs = 0;
+ int numMasks = 1;
+#if __linux__
+ int maxProcs = sysconf(_SC_NPROCESSORS_ONLN);
+ int pid = getpid();
+#else /* FreeBSD >= 7.1 */
+ int maxProcs = sysconf(_SC_NPROCESSORS_ONLN);
+#endif
+ basic_mask_t* processMask;
+ const std::size_t BasicMaskSize = sizeof(basic_mask_t);
+ for (;;) {
+ const int curMaskSize = BasicMaskSize * numMasks;
+ processMask = new basic_mask_t[numMasks];
+ std::memset( processMask, 0, curMaskSize );
+#if __linux__
+ err = sched_getaffinity( pid, curMaskSize, processMask );
+ if ( !err || errno != EINVAL || curMaskSize * CHAR_BIT >= 256 * 1024 )
+ break;
+#else /* FreeBSD >= 7.1 */
+ // CPU_LEVEL_WHICH - anonymous (current) mask, CPU_LEVEL_CPUSET - assigned mask
+ err = cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_PID, -1, curMaskSize, processMask );
+ if ( !err || errno != ERANGE || curMaskSize * CHAR_BIT >= 16 * 1024 )
+ break;
+#endif /* FreeBSD >= 7.1 */
+ delete[] processMask;
+ numMasks <<= 1;
+ }
+ if ( !err ) {
+ // We have found the mask size and captured the process affinity mask into processMask.
+ num_masks = numMasks; // do here because it's needed for affinity_helper to work
+#if __linux__
+ // For better coexistence with libiomp which might have changed the mask already,
+ // check for its presence and ask it to restore the mask.
+ dynamic_link_handle libhandle;
+ if ( dynamic_link( "libiomp5.so", iompLinkTable, 1, &libhandle, DYNAMIC_LINK_GLOBAL ) ) {
+ // We have found the symbol provided by libiomp5 for restoring original thread affinity.
+ affinity_helper affhelp;
+ affhelp.protect_affinity_mask( /*restore_process_mask=*/false );
+ if ( libiomp_try_restoring_original_mask()==0 ) {
+ // Now we have the right mask to capture, restored by libiomp.
+ const int curMaskSize = BasicMaskSize * numMasks;
+ std::memset( processMask, 0, curMaskSize );
+ get_thread_affinity_mask( curMaskSize, processMask );
+ } else
+ affhelp.dismiss(); // thread mask has not changed
+ dynamic_unlink( libhandle );
+ // Destructor of affinity_helper restores the thread mask (unless dismissed).
+ }
+#endif
+ for ( int m = 0; availableProcs < maxProcs && m < numMasks; ++m ) {
+ for ( std::size_t i = 0; (availableProcs < maxProcs) && (i < BasicMaskSize * CHAR_BIT); ++i ) {
+ if ( CPU_ISSET( i, processMask + m ) )
+ ++availableProcs;
+ }
+ }
+ process_mask = processMask;
+ }
+ else {
+ // Failed to get the process affinity mask; assume the whole machine can be used.
+ availableProcs = (maxProcs == INT_MAX) ? sysconf(_SC_NPROCESSORS_ONLN) : maxProcs;
+ delete[] processMask;
+ }
+ theNumProcs = availableProcs > 0 ? availableProcs : 1; // Fail safety strap
+ __TBB_ASSERT( theNumProcs <= sysconf(_SC_NPROCESSORS_ONLN), NULL );
+}
+
+int AvailableHwConcurrency() {
+ atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info );
+ return theNumProcs;
+}
+
+/* End of __TBB_USE_OS_AFFINITY_SYSCALL implementation */
+#elif __ANDROID__
+
+// Workaround for Android: read the number of available CPUs from sysfs, since the system calls are unreliable there.
+// Format of "present" file is: ([<int>-<int>|<int>],)+
+int AvailableHwConcurrency() {
+ FILE *fp = fopen("/sys/devices/system/cpu/present", "r");
+ if (fp == NULL) return 1;
+ int num_args, lower, upper, num_cpus=0;
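+ // Each entry is either a "lower-upper" range (two fields parsed) or a single CPU index (one field).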
+ while ((num_args = fscanf(fp, "%u-%u", &lower, &upper)) != EOF) {
+ switch(num_args) {
+ case 2: num_cpus += upper - lower + 1; break;
+ case 1: num_cpus += 1; break;
+ }
+ fscanf(fp, ",");
+ }
+ fclose(fp);
+ return (num_cpus > 0) ? num_cpus : 1;
+}
+
+#elif defined(_SC_NPROCESSORS_ONLN)
+
+int AvailableHwConcurrency() {
+ int n = sysconf(_SC_NPROCESSORS_ONLN);
+ return (n > 0) ? n : 1;
+}
+
+#elif _WIN32||_WIN64
+
+static std::atomic<do_once_state> hardware_concurrency_info;
+
+static const WORD TBB_ALL_PROCESSOR_GROUPS = 0xffff;
+
+// Statically allocate an array for processor group information.
+// Windows 7 supports at most 4 groups, but let's look ahead a little.
+static const WORD MaxProcessorGroups = 64;
+
+struct ProcessorGroupInfo {
+ DWORD_PTR mask; ///< Affinity mask covering the whole group
+ int numProcs; ///< Number of processors in the group
+ int numProcsRunningTotal; ///< Subtotal of processors in this and preceding groups
+
+ //! Total number of processor groups in the system
+ static int NumGroups;
+
+ //! Index of the group with a slot reserved for the first external thread
+ /** With support for multiple processor groups, the current implementation
+ defines "the first external thread" as the first thread to invoke
+ AvailableHwConcurrency().
+
+ TODO: Implement a dynamic scheme remapping workers depending on the pending
+ external threads affinity. **/
+ static int HoleIndex;
+};
+
+int ProcessorGroupInfo::NumGroups = 1;
+int ProcessorGroupInfo::HoleIndex = 0;
+
+ProcessorGroupInfo theProcessorGroups[MaxProcessorGroups];
+
+struct TBB_GROUP_AFFINITY {
+ DWORD_PTR Mask;
+ WORD Group;
+ WORD Reserved[3];
+};
+
+static DWORD (WINAPI *TBB_GetActiveProcessorCount)( WORD groupIndex ) = NULL;
+static WORD (WINAPI *TBB_GetActiveProcessorGroupCount)() = NULL;
+static BOOL (WINAPI *TBB_SetThreadGroupAffinity)( HANDLE hThread,
+ const TBB_GROUP_AFFINITY* newAff, TBB_GROUP_AFFINITY *prevAff );
+static BOOL (WINAPI *TBB_GetThreadGroupAffinity)( HANDLE hThread, TBB_GROUP_AFFINITY* );
+
+static const dynamic_link_descriptor ProcessorGroupsApiLinkTable[] = {
+ DLD(GetActiveProcessorCount, TBB_GetActiveProcessorCount)
+ , DLD(GetActiveProcessorGroupCount, TBB_GetActiveProcessorGroupCount)
+ , DLD(SetThreadGroupAffinity, TBB_SetThreadGroupAffinity)
+ , DLD(GetThreadGroupAffinity, TBB_GetThreadGroupAffinity)
+};
+
+static void initialize_hardware_concurrency_info () {
+#if __TBB_WIN8UI_SUPPORT
+ // Processor group information is unavailable for such applications,
+ // so set up the processor count for a single group.
+ theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = std::thread::hardware_concurrency();
+#else /* __TBB_WIN8UI_SUPPORT */
+ dynamic_link( "Kernel32.dll", ProcessorGroupsApiLinkTable,
+ sizeof(ProcessorGroupsApiLinkTable)/sizeof(dynamic_link_descriptor) );
+ SYSTEM_INFO si;
+ GetNativeSystemInfo(&si);
+ DWORD_PTR pam, sam, m = 1;
+ GetProcessAffinityMask( GetCurrentProcess(), &pam, &sam );
+ int nproc = 0;
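+ // Count the bits set in the process affinity mask.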
+ for ( std::size_t i = 0; i < sizeof(DWORD_PTR) * CHAR_BIT; ++i, m <<= 1 ) {
+ if ( pam & m )
+ ++nproc;
+ }
+ __TBB_ASSERT( nproc <= (int)si.dwNumberOfProcessors, NULL );
+ // By default, set up the processor count for a single processor group.
+ theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = nproc;
+ // Set up processor groups if the process affinity mask is unrestricted and more than one group may be present.
+ if ( nproc == (int)si.dwNumberOfProcessors && TBB_GetActiveProcessorCount ) {
+ // The process does not have restricting affinity mask and multiple processor groups are possible
+ ProcessorGroupInfo::NumGroups = (int)TBB_GetActiveProcessorGroupCount();
+ __TBB_ASSERT( ProcessorGroupInfo::NumGroups <= MaxProcessorGroups, NULL );
+ // Fail-safety bootstrap: release builds limit the available concurrency
+ // level, while debug builds assert.
+ if ( ProcessorGroupInfo::NumGroups > MaxProcessorGroups )
+ ProcessorGroupInfo::NumGroups = MaxProcessorGroups;
+ if ( ProcessorGroupInfo::NumGroups > 1 ) {
+ TBB_GROUP_AFFINITY ga;
+ if ( TBB_GetThreadGroupAffinity( GetCurrentThread(), &ga ) )
+ ProcessorGroupInfo::HoleIndex = ga.Group;
+ int nprocs = 0;
+ for ( WORD i = 0; i < ProcessorGroupInfo::NumGroups; ++i ) {
+ ProcessorGroupInfo &pgi = theProcessorGroups[i];
+ pgi.numProcs = (int)TBB_GetActiveProcessorCount(i);
+ __TBB_ASSERT( pgi.numProcs <= (int)sizeof(DWORD_PTR) * CHAR_BIT, NULL );
+ pgi.mask = pgi.numProcs == sizeof(DWORD_PTR) * CHAR_BIT ? ~(DWORD_PTR)0 : (DWORD_PTR(1) << pgi.numProcs) - 1;
+ pgi.numProcsRunningTotal = nprocs += pgi.numProcs;
+ }
+ __TBB_ASSERT( nprocs == (int)TBB_GetActiveProcessorCount( TBB_ALL_PROCESSOR_GROUPS ), NULL );
+ }
+ }
+#endif /* __TBB_WIN8UI_SUPPORT */
+
+ PrintExtraVersionInfo("Processor groups", "%d", ProcessorGroupInfo::NumGroups);
+ if (ProcessorGroupInfo::NumGroups>1)
+ for (int i=0; i<ProcessorGroupInfo::NumGroups; ++i)
+ PrintExtraVersionInfo( "----- Group", "%d: size %d", i, theProcessorGroups[i].numProcs);
+}
+
+int NumberOfProcessorGroups() {
+ __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "NumberOfProcessorGroups is used before AvailableHwConcurrency" );
+ return ProcessorGroupInfo::NumGroups;
+}
+
+// Offset for the slot reserved for the first external thread
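+// HoleAdjusted adds one to procIdx for groups at or after the group holding the reserved slot.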
+#define HoleAdjusted(procIdx, grpIdx) (procIdx + (holeIdx <= grpIdx))
+
+int FindProcessorGroupIndex ( int procIdx ) {
+ // In case of oversubscription, spread extra workers in a round-robin manner.
+ int holeIdx;
+ const int numProcs = theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal;
+ if ( procIdx >= numProcs - 1 ) {
+ holeIdx = INT_MAX;
+ procIdx = (procIdx - numProcs + 1) % numProcs;
+ }
+ else
+ holeIdx = ProcessorGroupInfo::HoleIndex;
+ __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "FindProcessorGroupIndex is used before AvailableHwConcurrency" );
+ // Approximate the likely group index assuming all groups are of the same size
+ int i = procIdx / theProcessorGroups[0].numProcs;
+ // Make sure the approximation is a valid group index
+ if (i >= ProcessorGroupInfo::NumGroups) i = ProcessorGroupInfo::NumGroups-1;
+ // Now adjust the approximation up or down
+ if ( theProcessorGroups[i].numProcsRunningTotal > HoleAdjusted(procIdx, i) ) {
+ while ( theProcessorGroups[i].numProcsRunningTotal - theProcessorGroups[i].numProcs > HoleAdjusted(procIdx, i) ) {
+ __TBB_ASSERT( i > 0, NULL );
+ --i;
+ }
+ }
+ else {
+ do {
+ ++i;
+ } while ( theProcessorGroups[i].numProcsRunningTotal <= HoleAdjusted(procIdx, i) );
+ }
+ __TBB_ASSERT( i < ProcessorGroupInfo::NumGroups, NULL );
+ return i;
+}
+
+void MoveThreadIntoProcessorGroup( void* hThread, int groupIndex ) {
+ __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "MoveThreadIntoProcessorGroup is used before AvailableHwConcurrency" );
+ if ( !TBB_SetThreadGroupAffinity )
+ return;
+ TBB_GROUP_AFFINITY ga = { theProcessorGroups[groupIndex].mask, (WORD)groupIndex, {0,0,0} };
+ TBB_SetThreadGroupAffinity( hThread, &ga, NULL );
+}
+
+int AvailableHwConcurrency() {
+ atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info );
+ return theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal;
+}
+
+/* End of _WIN32||_WIN64 implementation */
+#else
+ #error AvailableHwConcurrency is not implemented for this OS
+#endif
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
+
+#endif /* !__TBB_HardwareConcurrency */
diff --git a/contrib/libs/tbb/src/tbb/observer_proxy.cpp b/contrib/libs/tbb/src/tbb/observer_proxy.cpp
new file mode 100644
index 0000000000..4f7c07c266
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/observer_proxy.cpp
@@ -0,0 +1,322 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "oneapi/tbb/detail/_config.h"
+#include "oneapi/tbb/detail/_utils.h"
+
+#include "observer_proxy.h"
+#include "arena.h"
+#include "main.h"
+#include "thread_data.h"
+
+#include <atomic>
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+#if TBB_USE_ASSERT
+extern std::atomic<int> the_observer_proxy_count;
+#endif /* TBB_USE_ASSERT */
+
+observer_proxy::observer_proxy( d1::task_scheduler_observer& tso )
+ : my_ref_count(1), my_list(NULL), my_next(NULL), my_prev(NULL), my_observer(&tso)
+{
+#if TBB_USE_ASSERT
+ ++the_observer_proxy_count;
+#endif /* TBB_USE_ASSERT */
+}
+
+observer_proxy::~observer_proxy() {
+ __TBB_ASSERT( !my_ref_count, "Attempt to destroy proxy still in use" );
+ poison_value(my_ref_count);
+ poison_pointer(my_prev);
+ poison_pointer(my_next);
+#if TBB_USE_ASSERT
+ --the_observer_proxy_count;
+#endif /* TBB_USE_ASSERT */
+}
+
+void observer_list::clear() {
+ // Though the method works fine on an empty list, we require the caller
+ // to check for emptiness before invoking it to avoid extra overhead.
+ __TBB_ASSERT( !empty(), NULL );
+ {
+ scoped_lock lock(mutex(), /*is_writer=*/true);
+ observer_proxy *next = my_head.load(std::memory_order_relaxed);
+ while ( observer_proxy *p = next ) {
+ next = p->my_next;
+ // Both proxy p and observer p->my_observer (if non-null) are guaranteed
+ // to be alive while the list is locked.
+ d1::task_scheduler_observer *obs = p->my_observer;
+ // Make sure that possible concurrent observer destruction does not
+ // conflict with the proxy list cleanup.
+ if (!obs || !(p = obs->my_proxy.exchange(nullptr))) {
+ continue;
+ }
+ // Accessing 'obs' after detaching obs->my_proxy would race with observer destruction.
+ __TBB_ASSERT(!next || p == next->my_prev, nullptr);
+ __TBB_ASSERT(is_alive(p->my_ref_count), "Observer's proxy died prematurely");
+ __TBB_ASSERT(p->my_ref_count.load(std::memory_order_relaxed) == 1, "Reference for observer is missing");
+ poison_pointer(p->my_observer);
+ remove(p);
+ --p->my_ref_count;
+ delete p;
+ }
+ }
+
+ // If observe(false) is called concurrently with the destruction of the arena,
+ // need to wait until all proxies are removed.
+ for (atomic_backoff backoff; ; backoff.pause()) {
+ scoped_lock lock(mutex(), /*is_writer=*/false);
+ if (my_head.load(std::memory_order_relaxed) == nullptr) {
+ break;
+ }
+ }
+
+ __TBB_ASSERT(my_head.load(std::memory_order_relaxed) == nullptr && my_tail.load(std::memory_order_relaxed) == nullptr, nullptr);
+}
+
+void observer_list::insert( observer_proxy* p ) {
+ scoped_lock lock(mutex(), /*is_writer=*/true);
+ if (my_head.load(std::memory_order_relaxed)) {
+ p->my_prev = my_tail.load(std::memory_order_relaxed);
+ my_tail.load(std::memory_order_relaxed)->my_next = p;
+ } else {
+ my_head.store(p, std::memory_order_relaxed);
+ }
+ my_tail.store(p, std::memory_order_relaxed);
+}
+
+void observer_list::remove(observer_proxy* p) {
+ __TBB_ASSERT(my_head.load(std::memory_order_relaxed), "Attempt to remove an item from an empty list");
+ __TBB_ASSERT(!my_tail.load(std::memory_order_relaxed)->my_next, "Last item's my_next must be NULL");
+ if (p == my_tail.load(std::memory_order_relaxed)) {
+ __TBB_ASSERT(!p->my_next, nullptr);
+ my_tail.store(p->my_prev, std::memory_order_relaxed);
+ } else {
+ __TBB_ASSERT(p->my_next, nullptr);
+ p->my_next->my_prev = p->my_prev;
+ }
+ if (p == my_head.load(std::memory_order_relaxed)) {
+ __TBB_ASSERT(!p->my_prev, nullptr);
+ my_head.store(p->my_next, std::memory_order_relaxed);
+ } else {
+ __TBB_ASSERT(p->my_prev, nullptr);
+ p->my_prev->my_next = p->my_next;
+ }
+ __TBB_ASSERT((my_head.load(std::memory_order_relaxed) && my_tail.load(std::memory_order_relaxed)) ||
+ (!my_head.load(std::memory_order_relaxed) && !my_tail.load(std::memory_order_relaxed)), nullptr);
+}
+
+void observer_list::remove_ref(observer_proxy* p) {
+ std::uintptr_t r = p->my_ref_count.load(std::memory_order_acquire);
+ __TBB_ASSERT(is_alive(r), nullptr);
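+ // On failure, compare_exchange_strong reloads r, so the loop retries with the current value.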
+ while (r > 1) {
+ if (p->my_ref_count.compare_exchange_strong(r, r - 1)) {
+ return;
+ }
+ }
+ __TBB_ASSERT(r == 1, nullptr);
+ // Reference count might go to zero
+ {
+ // Use lock to avoid resurrection by a thread concurrently walking the list
+ observer_list::scoped_lock lock(mutex(), /*is_writer=*/true);
+ r = --p->my_ref_count;
+ if (!r) {
+ remove(p);
+ }
+ }
+ __TBB_ASSERT(r || !p->my_ref_count, nullptr);
+ if (!r) {
+ delete p;
+ }
+}
+
+void observer_list::do_notify_entry_observers(observer_proxy*& last, bool worker) {
+ // Pointer p marches through the list from last (exclusive) to the end.
+ observer_proxy* p = last, * prev = p;
+ for (;;) {
+ d1::task_scheduler_observer* tso = nullptr;
+ // Hold lock on list only long enough to advance to the next proxy in the list.
+ {
+ scoped_lock lock(mutex(), /*is_writer=*/false);
+ do {
+ if (p) {
+ // We were already processing the list.
+ if (observer_proxy* q = p->my_next) {
+ if (p == prev) {
+ remove_ref_fast(prev); // sets prev to NULL if successful
+ }
+ p = q;
+ } else {
+ // Reached the end of the list.
+ if (p == prev) {
+ // Keep the reference as we store the 'last' pointer in scheduler
+ __TBB_ASSERT(int(p->my_ref_count.load(std::memory_order_relaxed)) >= 1 + (p->my_observer ? 1 : 0), nullptr);
+ } else {
+ // The last few proxies were empty
+ __TBB_ASSERT(int(p->my_ref_count.load(std::memory_order_relaxed)), nullptr);
+ ++p->my_ref_count;
+ if (prev) {
+ lock.release();
+ remove_ref(prev);
+ }
+ }
+ last = p;
+ return;
+ }
+ } else {
+ // Starting pass through the list
+ p = my_head.load(std::memory_order_relaxed);
+ if (!p) {
+ return;
+ }
+ }
+ tso = p->my_observer;
+ } while (!tso);
+ ++p->my_ref_count;
+ ++tso->my_busy_count;
+ }
+ __TBB_ASSERT(!prev || p != prev, nullptr);
+ // Release the proxy pinned before p
+ if (prev) {
+ remove_ref(prev);
+ }
+ // Do not hold any locks on the list while calling user's code.
+ // Do not intercept any exceptions that may escape the callback so that
+ // they are either handled by the TBB scheduler or passed to the debugger.
+ tso->on_scheduler_entry(worker);
+ __TBB_ASSERT(p->my_ref_count.load(std::memory_order_relaxed), nullptr);
+ intptr_t bc = --tso->my_busy_count;
+ __TBB_ASSERT_EX(bc >= 0, "my_busy_count underflowed");
+ prev = p;
+ }
+}
+
+void observer_list::do_notify_exit_observers(observer_proxy* last, bool worker) {
+ // Pointer p marches through the list from the beginning to last (inclusive).
+ observer_proxy* p = nullptr, * prev = nullptr;
+ for (;;) {
+ d1::task_scheduler_observer* tso = nullptr;
+ // Hold lock on list only long enough to advance to the next proxy in the list.
+ {
+ scoped_lock lock(mutex(), /*is_writer=*/false);
+ do {
+ if (p) {
+ // We were already processing the list.
+ if (p != last) {
+ __TBB_ASSERT(p->my_next, "List items before 'last' must have valid my_next pointer");
+ if (p == prev)
+ remove_ref_fast(prev); // sets prev to NULL if successful
+ p = p->my_next;
+ } else {
+ // remove the reference from the last item
+ remove_ref_fast(p);
+ if (p) {
+ lock.release();
+ if (p != prev && prev) {
+ remove_ref(prev);
+ }
+ remove_ref(p);
+ }
+ return;
+ }
+ } else {
+ // Starting pass through the list
+ p = my_head.load(std::memory_order_relaxed);
+ __TBB_ASSERT(p, "Nonzero 'last' must guarantee that the global list is non-empty");
+ }
+ tso = p->my_observer;
+ } while (!tso);
+ // The item is already refcounted
+ if (p != last) // the last is already referenced since entry notification
+ ++p->my_ref_count;
+ ++tso->my_busy_count;
+ }
+ __TBB_ASSERT(!prev || p != prev, nullptr);
+ if (prev)
+ remove_ref(prev);
+ // Do not hold any locks on the list while calling user's code.
+ // Do not intercept any exceptions that may escape the callback so that
+ // they are either handled by the TBB scheduler or passed to the debugger.
+ tso->on_scheduler_exit(worker);
+ __TBB_ASSERT(p->my_ref_count || p == last, nullptr);
+ intptr_t bc = --tso->my_busy_count;
+ __TBB_ASSERT_EX(bc >= 0, "my_busy_count underflowed");
+ prev = p;
+ }
+}
+
+void __TBB_EXPORTED_FUNC observe(d1::task_scheduler_observer &tso, bool enable) {
+ if( enable ) {
+ if( !tso.my_proxy.load(std::memory_order_relaxed) ) {
+ observer_proxy* p = new observer_proxy(tso);
+ tso.my_proxy.store(p, std::memory_order_relaxed);
+ tso.my_busy_count.store(0, std::memory_order_relaxed);
+
+ thread_data* td = governor::get_thread_data_if_initialized();
+ if (p->my_observer->my_task_arena == nullptr) {
+ if (!(td && td->my_arena)) {
+ td = governor::get_thread_data();
+ }
+ __TBB_ASSERT(__TBB_InitOnce::initialization_done(), nullptr);
+ __TBB_ASSERT(td && td->my_arena, nullptr);
+ p->my_list = &td->my_arena->my_observers;
+ } else {
+ d1::task_arena* ta = p->my_observer->my_task_arena;
+ arena* a = ta->my_arena.load(std::memory_order_acquire);
+ if (a == nullptr) { // Avoid recursion during arena initialization
+ ta->initialize();
+ a = ta->my_arena.load(std::memory_order_relaxed);
+ }
+ __TBB_ASSERT(a != nullptr, nullptr);
+ p->my_list = &a->my_observers;
+ }
+ p->my_list->insert(p);
+ // Notify newly activated observer and other pending ones if it belongs to current arena
+ if (td && td->my_arena && &td->my_arena->my_observers == p->my_list) {
+ p->my_list->notify_entry_observers(td->my_last_observer, td->my_is_worker);
+ }
+ }
+ } else {
+ // Make sure that possible concurrent proxy list cleanup does not conflict
+ // with the observer destruction here.
+ if ( observer_proxy* proxy = tso.my_proxy.exchange(nullptr) ) {
+ // List destruction should not touch this proxy after we've won the above interlocked exchange.
+ __TBB_ASSERT( proxy->my_observer == &tso, nullptr);
+ __TBB_ASSERT( is_alive(proxy->my_ref_count.load(std::memory_order_relaxed)), "Observer's proxy died prematurely" );
+ __TBB_ASSERT( proxy->my_ref_count.load(std::memory_order_relaxed) >= 1, "reference for observer missing" );
+ observer_list &list = *proxy->my_list;
+ {
+ // Ensure that none of the list walkers relies on observer pointer validity
+ observer_list::scoped_lock lock(list.mutex(), /*is_writer=*/true);
+ proxy->my_observer = nullptr;
+ // Proxy may still be held by other threads (to track the last notified observer)
+ if( !--proxy->my_ref_count ) {// nobody can increase it under exclusive lock
+ list.remove(proxy);
+ __TBB_ASSERT( !proxy->my_ref_count, NULL );
+ delete proxy;
+ }
+ }
+ spin_wait_until_eq(tso.my_busy_count, 0); // other threads are still accessing the callback
+ }
+ }
+}
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
diff --git a/contrib/libs/tbb/src/tbb/observer_proxy.h b/contrib/libs/tbb/src/tbb/observer_proxy.h
new file mode 100644
index 0000000000..2450247ecd
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/observer_proxy.h
@@ -0,0 +1,154 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_observer_proxy_H
+#define __TBB_observer_proxy_H
+
+#include "oneapi/tbb/detail/_config.h"
+#include "oneapi/tbb/detail/_aligned_space.h"
+
+#include "oneapi/tbb/task_scheduler_observer.h"
+#include "oneapi/tbb/spin_rw_mutex.h"
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+class observer_list {
+ friend class arena;
+
+ // Mutex is wrapped with aligned_space to shut up warnings when its destructor
+ // is called while threads are still using it.
+ typedef aligned_space<spin_rw_mutex> my_mutex_type;
+
+ //! Pointer to the head of this list.
+ std::atomic<observer_proxy*> my_head{nullptr};
+
+ //! Pointer to the tail of this list.
+ std::atomic<observer_proxy*> my_tail{nullptr};
+
+ //! Mutex protecting this list.
+ my_mutex_type my_mutex;
+
+ //! Back-pointer to the arena this list belongs to.
+ arena* my_arena;
+
+ //! Decrement refcount of the proxy p if there are other outstanding references.
+ /** In case of success sets p to NULL. Must be invoked from under the list lock. **/
+ inline static void remove_ref_fast( observer_proxy*& p );
+
+ //! Implements notify_entry_observers functionality.
+ void do_notify_entry_observers( observer_proxy*& last, bool worker );
+
+ //! Implements notify_exit_observers functionality.
+ void do_notify_exit_observers( observer_proxy* last, bool worker );
+
+public:
+ observer_list () = default;
+
+ //! Removes and destroys all observer proxies from the list.
+ /** Cannot be used concurrently with other methods. **/
+ void clear ();
+
+ //! Add observer proxy to the tail of the list.
+ void insert ( observer_proxy* p );
+
+ //! Remove observer proxy from the list.
+ void remove ( observer_proxy* p );
+
+ //! Decrement refcount of the proxy and destroy it if necessary.
+ /** When refcount reaches zero removes the proxy from the list and destructs it. **/
+ void remove_ref( observer_proxy* p );
+
+ //! Type of the scoped lock for the reader-writer mutex associated with the list.
+ typedef spin_rw_mutex::scoped_lock scoped_lock;
+
+ //! Accessor to the reader-writer mutex associated with the list.
+ spin_rw_mutex& mutex () { return my_mutex.begin()[0]; }
+
+ bool empty () const { return my_head.load(std::memory_order_relaxed) == nullptr; }
+
+ //! Call entry notifications on observers added after last was notified.
+ /** Updates last to become the last notified observer proxy (in the global list)
+ or leaves it to be nullptr. The proxy has its refcount incremented. **/
+ inline void notify_entry_observers( observer_proxy*& last, bool worker );
+
+ //! Call exit notifications on last and observers added before it.
+ inline void notify_exit_observers( observer_proxy*& last, bool worker );
+}; // class observer_list
+
+//! Wrapper for an observer object
+/** To maintain shared lists of observers, the scheduler first wraps each observer
+ object into a proxy so that a list item remains valid even after the corresponding
+ observer object is destroyed by the user code. **/
+class observer_proxy {
+ friend class task_scheduler_observer;
+ friend class observer_list;
+ friend void observe(d1::task_scheduler_observer&, bool);
+ //! Reference count used for garbage collection.
+ /** 1 for reference from my task_scheduler_observer.
+ 1 for each task dispatcher's last observer pointer.
+ No accounting for neighbors in the shared list. */
+ std::atomic<std::uintptr_t> my_ref_count;
+ //! Reference to the list this observer belongs to.
+ observer_list* my_list;
+ //! Pointer to next observer in the list specified by my_head.
+ /** NULL for the last item in the list. **/
+ observer_proxy* my_next;
+ //! Pointer to the previous observer in the list specified by my_head.
+ /** For the head of the list points to the last item. **/
+ observer_proxy* my_prev;
+ //! Associated observer
+ d1::task_scheduler_observer* my_observer;
+
+ //! Constructs proxy for the given observer and adds it to the specified list.
+ observer_proxy( d1::task_scheduler_observer& );
+
+ ~observer_proxy();
+}; // class observer_proxy
+
+void observer_list::remove_ref_fast( observer_proxy*& p ) {
+ if( p->my_observer ) {
+ // Can decrement refcount quickly, as it cannot drop to zero while under the lock.
+ std::uintptr_t r = --p->my_ref_count;
+ __TBB_ASSERT_EX( r, NULL );
+ p = NULL;
+ } else {
+ // Use slow form of refcount decrementing, after the lock is released.
+ }
+}
+
+void observer_list::notify_entry_observers(observer_proxy*& last, bool worker) {
+ if (last == my_tail.load(std::memory_order_relaxed))
+ return;
+ do_notify_entry_observers(last, worker);
+}
+
+void observer_list::notify_exit_observers( observer_proxy*& last, bool worker ) {
+ if (last == nullptr) {
+ return;
+ }
+ __TBB_ASSERT(!is_poisoned(last), NULL);
+ do_notify_exit_observers( last, worker );
+ __TBB_ASSERT(last != nullptr, NULL);
+ poison_pointer(last);
+}
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
+
+#endif /* __TBB_observer_proxy_H */
diff --git a/contrib/libs/tbb/src/tbb/parallel_pipeline.cpp b/contrib/libs/tbb/src/tbb/parallel_pipeline.cpp
new file mode 100644
index 0000000000..b7655c6b35
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/parallel_pipeline.cpp
@@ -0,0 +1,471 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "oneapi/tbb/parallel_pipeline.h"
+#include "oneapi/tbb/spin_mutex.h"
+#include "oneapi/tbb/tbb_allocator.h"
+#include "oneapi/tbb/cache_aligned_allocator.h"
+#include "itt_notify.h"
+#include "tls.h"
+#include "oneapi/tbb/detail/_exception.h"
+#include "oneapi/tbb/detail/_small_object_pool.h"
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+void handle_perror(int error_code, const char* aux_info);
+
+using Token = unsigned long;
+
+//! A processing pipeline that applies filters to items.
+/** @ingroup algorithms */
+class pipeline {
+ friend void parallel_pipeline(d1::task_group_context&, std::size_t, const d1::filter_node&);
+public:
+
+ //! Construct empty pipeline.
+ pipeline(d1::task_group_context& cxt, std::size_t max_token) :
+ my_context(cxt),
+ first_filter(nullptr),
+ last_filter(nullptr),
+ input_tokens(Token(max_token)),
+ end_of_input(false),
+ wait_ctx(0) {
+ __TBB_ASSERT( max_token>0, "pipeline::run must have at least one token" );
+ }
+
+ ~pipeline();
+
+ //! Add filter to end of pipeline.
+ void add_filter( d1::base_filter& );
+
+ //! Traverse the filter-node tree in order and add a filter for each leaf
+ void fill_pipeline(const d1::filter_node& root) {
+ if( root.left && root.right ) {
+ fill_pipeline(*root.left);
+ fill_pipeline(*root.right);
+ }
+ else {
+ __TBB_ASSERT(!root.left && !root.right, "tree should be full");
+ add_filter(*root.create_filter());
+ }
+ }
+
+private:
+ friend class stage_task;
+ friend class base_filter;
+ friend void set_end_of_input(d1::base_filter& bf);
+
+ task_group_context& my_context;
+
+ //! Pointer to first filter in the pipeline.
+ d1::base_filter* first_filter;
+
+ //! Pointer to last filter in the pipeline.
+ d1::base_filter* last_filter;
+
+ //! Number of idle tokens waiting for input stage.
+ std::atomic<Token> input_tokens;
+
+ //! False until flow_control::stop() is called.
+ std::atomic<bool> end_of_input;
+
+ d1::wait_context wait_ctx;
+};
+
+//! This structure is used to store task information in an input buffer
+struct task_info {
+ void* my_object = nullptr;
+ //! Invalid unless a task went through an ordered stage.
+ Token my_token = 0;
+ //! False until my_token is set.
+ bool my_token_ready = false;
+ //! True if my_object is valid.
+ bool is_valid = false;
+ //! Set to initial state (no object, no token)
+ void reset() {
+ my_object = nullptr;
+ my_token = 0;
+ my_token_ready = false;
+ is_valid = false;
+ }
+};
+
+//! A buffer of input items for a filter.
+/** Each item is a task_info, inserted into a position in the buffer corresponding to a Token. */
+class input_buffer {
+ friend class base_filter;
+ friend class stage_task;
+ friend class pipeline;
+ friend void set_end_of_input(d1::base_filter& bf);
+
+ using size_type = Token;
+
+ //! Array of deferred tasks that cannot yet start executing.
+ task_info* array;
+
+ //! Size of array
+ /** Always 0 or a power of 2 */
+ size_type array_size;
+
+ //! Lowest token that can start executing.
+ /** All prior Token have already been seen. */
+ Token low_token;
+
+ //! Serializes updates.
+ spin_mutex array_mutex;
+
+ //! Resize "array".
+ /** Caller is responsible for acquiring a lock on "array_mutex". */
+ void grow( size_type minimum_size );
+
+ //! Initial size for "array"
+ /** Must be a power of 2 */
+ static const size_type initial_buffer_size = 4;
+
+ //! Used for out of order buffer, and for assigning my_token if is_ordered and my_token not already assigned
+ Token high_token;
+
+ //! True for ordered filter, false otherwise.
+ const bool is_ordered;
+
+ //! For parallel filters that accept NULLs: thread-local flag for reaching end_of_input
+ using end_of_input_tls_t = basic_tls<std::intptr_t>;
+ end_of_input_tls_t end_of_input_tls;
+ bool end_of_input_tls_allocated; // no way to test pthread creation of TLS
+
+public:
+ input_buffer(const input_buffer&) = delete;
+ input_buffer& operator=(const input_buffer&) = delete;
+
+ //! Construct empty buffer.
+ input_buffer( bool ordered) :
+ array(nullptr),
+ array_size(0),
+ low_token(0),
+ high_token(0),
+ is_ordered(ordered),
+ end_of_input_tls(),
+ end_of_input_tls_allocated(false) {
+ grow(initial_buffer_size);
+ __TBB_ASSERT( array, nullptr );
+ }
+
+ //! Destroy the buffer.
+ ~input_buffer() {
+ __TBB_ASSERT( array, nullptr );
+ cache_aligned_allocator<task_info>().deallocate(array,array_size);
+ poison_pointer( array );
+ if( end_of_input_tls_allocated ) {
+ destroy_my_tls();
+ }
+ }
+
+ //! Define order when the first filter is serial_in_order.
+ Token get_ordered_token(){
+ return high_token++;
+ }
+
+ //! Put a token into the buffer.
+ /** If task information was placed into buffer, returns true;
+ otherwise returns false, informing the caller to create and spawn a task.
+ */
+ bool try_put_token( task_info& info ) {
+ info.is_valid = true;
+ spin_mutex::scoped_lock lock( array_mutex );
+ Token token;
+ if( is_ordered ) {
+ if( !info.my_token_ready ) {
+ info.my_token = high_token++;
+ info.my_token_ready = true;
+ }
+ token = info.my_token;
+ } else
+ token = high_token++;
+ __TBB_ASSERT( (long)(token-low_token)>=0, nullptr );
+ if( token!=low_token ) {
+ // Trying to put token that is beyond low_token.
+ // Need to wait until low_token catches up before dispatching.
+ if( token-low_token>=array_size )
+ grow( token-low_token+1 );
+ ITT_NOTIFY( sync_releasing, this );
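+ // array_size is a power of two, so masking maps the token onto its slot in the circular buffer.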
+ array[token&(array_size-1)] = info;
+ return true;
+ }
+ return false;
+ }
+
+ //! Note that processing of a token is finished.
+ /** Fires up processing of the next token, if processing was deferred. */
+ // Uses template to avoid explicit dependency on stage_task.
+ template<typename StageTask>
+ void try_to_spawn_task_for_next_token(StageTask& spawner, d1::execution_data& ed) {
+ task_info wakee;
+ {
+ spin_mutex::scoped_lock lock( array_mutex );
+ // Wake the next task
+ task_info& item = array[++low_token & (array_size-1)];
+ ITT_NOTIFY( sync_acquired, this );
+ wakee = item;
+ item.is_valid = false;
+ }
+ if( wakee.is_valid )
+ spawner.spawn_stage_task(wakee, ed);
+ }
+
+ // end_of_input signal for parallel_pipeline, parallel input filters with 0 tokens allowed.
+ void create_my_tls() {
+ int status = end_of_input_tls.create();
+ if(status)
+ handle_perror(status, "TLS not allocated for filter");
+ end_of_input_tls_allocated = true;
+ }
+ void destroy_my_tls() {
+ int status = end_of_input_tls.destroy();
+ if(status)
+ handle_perror(status, "Failed to destroy filter TLS");
+ }
+ bool my_tls_end_of_input() {
+ return end_of_input_tls.get() != 0;
+ }
+ void set_my_tls_end_of_input() {
+ end_of_input_tls.set(1);
+ }
+};
+
+void input_buffer::grow( size_type minimum_size ) {
+ size_type old_size = array_size;
+ size_type new_size = old_size ? 2*old_size : initial_buffer_size;
+ while( new_size<minimum_size )
+ new_size*=2;
+ task_info* new_array = cache_aligned_allocator<task_info>().allocate(new_size);
+ task_info* old_array = array;
+ for( size_type i=0; i<new_size; ++i )
+ new_array[i].is_valid = false;
+ Token t=low_token;
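+ // Re-insert the old items starting at low_token so each token keeps its slot modulo the new size.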
+ for( size_type i=0; i<old_size; ++i, ++t )
+ new_array[t&(new_size-1)] = old_array[t&(old_size-1)];
+ array = new_array;
+ array_size = new_size;
+ if( old_array )
+ cache_aligned_allocator<task_info>().deallocate(old_array,old_size);
+}
+
+class stage_task : public d1::task, public task_info {
+private:
+ friend class pipeline;
+ pipeline& my_pipeline;
+ d1::base_filter* my_filter;
+ d1::small_object_allocator m_allocator;
+ //! True if this task has not yet read the input.
+ bool my_at_start;
+
+ //! True if this can be executed again.
+ bool execute_filter(d1::execution_data& ed);
+
+ //! Spawn task if token is available.
+ void try_spawn_stage_task(d1::execution_data& ed) {
+ ITT_NOTIFY( sync_releasing, &my_pipeline.input_tokens );
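+ // fetch_sub returns the previous value; spawn another input task only if at least one more token remains.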
+ if( (my_pipeline.input_tokens.fetch_sub(1, std::memory_order_relaxed)) > 1 ) {
+ d1::small_object_allocator alloc{};
+ r1::spawn( *alloc.new_object<stage_task>(ed, my_pipeline, alloc ), my_pipeline.my_context );
+ }
+ }
+
+public:
+
+ //! Construct stage_task for first stage in a pipeline.
+ /** Such a stage has not read any input yet. */
+ stage_task(pipeline& pipeline, d1::small_object_allocator& alloc ) :
+ my_pipeline(pipeline),
+ my_filter(pipeline.first_filter),
+ m_allocator(alloc),
+ my_at_start(true)
+ {
+ task_info::reset();
+ my_pipeline.wait_ctx.reserve();
+ }
+ //! Construct stage_task for a subsequent stage in a pipeline.
+ stage_task(pipeline& pipeline, d1::base_filter* filter, const task_info& info, d1::small_object_allocator& alloc) :
+ task_info(info),
+ my_pipeline(pipeline),
+ my_filter(filter),
+ m_allocator(alloc),
+ my_at_start(false)
+ {
+ my_pipeline.wait_ctx.reserve();
+ }
+ //! Roughly equivalent to the constructor of input stage task
+ void reset() {
+ task_info::reset();
+ my_filter = my_pipeline.first_filter;
+ my_at_start = true;
+ }
+ void finalize(d1::execution_data& ed) {
+ m_allocator.delete_object(this, ed);
+ }
+ //! The virtual task execution method
+ task* execute(d1::execution_data& ed) override {
+ if(!execute_filter(ed)) {
+ finalize(ed);
+ return nullptr;
+ }
+ return this;
+ }
+ task* cancel(d1::execution_data& ed) override {
+ finalize(ed);
+ return nullptr;
+ }
+
+ ~stage_task() {
+ if ( my_filter && my_object ) {
+ my_filter->finalize(my_object);
+ my_object = nullptr;
+ }
+ my_pipeline.wait_ctx.release();
+ }
+ //! Creates and spawns stage_task from task_info
+ void spawn_stage_task(const task_info& info, d1::execution_data& ed) {
+ d1::small_object_allocator alloc{};
+ stage_task* clone = alloc.new_object<stage_task>(ed, my_pipeline, my_filter, info, alloc);
+ r1::spawn(*clone, my_pipeline.my_context);
+ }
+};
+
+bool stage_task::execute_filter(d1::execution_data& ed) {
+ __TBB_ASSERT( !my_at_start || !my_object, "invalid state of task" );
+ if( my_at_start ) {
+ if( my_filter->is_serial() ) {
+ my_object = (*my_filter)(my_object);
+ if( my_object || ( my_filter->object_may_be_null() && !my_pipeline.end_of_input.load(std::memory_order_relaxed)) ) {
+ if( my_filter->is_ordered() ) {
+ my_token = my_filter->my_input_buffer->get_ordered_token();
+ my_token_ready = true;
+ }
+ if( !my_filter->next_filter_in_pipeline ) { // we're the only filter in the pipeline
+ reset();
+ return true;
+ } else {
+ try_spawn_stage_task(ed);
+ }
+ } else {
+ my_pipeline.end_of_input.store(true, std::memory_order_relaxed);
+ return false;
+ }
+ } else /*not is_serial*/ {
+ if ( my_pipeline.end_of_input.load(std::memory_order_relaxed) ) {
+ return false;
+ }
+
+ try_spawn_stage_task(ed);
+
+ my_object = (*my_filter)(my_object);
+ if( !my_object && (!my_filter->object_may_be_null() || my_filter->my_input_buffer->my_tls_end_of_input()) ){
+ my_pipeline.end_of_input.store(true, std::memory_order_relaxed);
+ return false;
+ }
+ }
+ my_at_start = false;
+ } else {
+ my_object = (*my_filter)(my_object);
+ if( my_filter->is_serial() )
+ my_filter->my_input_buffer->try_to_spawn_task_for_next_token(*this, ed);
+ }
+ my_filter = my_filter->next_filter_in_pipeline;
+ if( my_filter ) {
+ // There is another filter to execute.
+ if( my_filter->is_serial() ) {
+ // The next filter must execute tokens when they are available (in order for serial_in_order)
+ if( my_filter->my_input_buffer->try_put_token(*this) ){
+ my_filter = nullptr; // To prevent deleting my_object twice if exception occurs
+ return false;
+ }
+ }
+ } else {
+ // Reached end of the pipe.
+ std::size_t ntokens_avail = my_pipeline.input_tokens.fetch_add(1, std::memory_order_relaxed);
+
+ if( ntokens_avail>0 // Only recycle if there is one available token
+ || my_pipeline.end_of_input.load(std::memory_order_relaxed) ) {
+ return false; // No need to recycle for new input
+ }
+ ITT_NOTIFY( sync_acquired, &my_pipeline.input_tokens );
+ // Recycle as an input stage task.
+ reset();
+ }
+ return true;
+}
+
+pipeline:: ~pipeline() {
+ while( first_filter ) {
+ d1::base_filter* f = first_filter;
+ if( input_buffer* b = f->my_input_buffer ) {
+ b->~input_buffer();
+ deallocate_memory(b);
+ }
+ first_filter = f->next_filter_in_pipeline;
+ f->~base_filter();
+ deallocate_memory(f);
+ }
+}
+
+void pipeline::add_filter( d1::base_filter& new_filter ) {
+ __TBB_ASSERT( new_filter.next_filter_in_pipeline==d1::base_filter::not_in_pipeline(), "filter already part of pipeline?" );
+ new_filter.my_pipeline = this;
+ if ( first_filter == nullptr )
+ first_filter = &new_filter;
+ else
+ last_filter->next_filter_in_pipeline = &new_filter;
+ new_filter.next_filter_in_pipeline = nullptr;
+ last_filter = &new_filter;
+ if( new_filter.is_serial() ) {
+ new_filter.my_input_buffer = new (allocate_memory(sizeof(input_buffer))) input_buffer( new_filter.is_ordered() );
+ } else {
+ if( first_filter == &new_filter && new_filter.object_may_be_null() ) {
+ //TODO: buffer only needed to hold TLS; could improve
+ new_filter.my_input_buffer = new (allocate_memory(sizeof(input_buffer))) input_buffer( /*is_ordered*/false );
+ new_filter.my_input_buffer->create_my_tls();
+ }
+ }
+}
+
+void __TBB_EXPORTED_FUNC parallel_pipeline(d1::task_group_context& cxt, std::size_t max_token, const d1::filter_node& fn) {
+ pipeline pipe(cxt, max_token);
+
+ pipe.fill_pipeline(fn);
+
+ d1::small_object_allocator alloc{};
+ stage_task& st = *alloc.new_object<stage_task>(pipe, alloc);
+
+ // Start execution of tasks
+ r1::execute_and_wait(st, cxt, pipe.wait_ctx, cxt);
+}
+
+void __TBB_EXPORTED_FUNC set_end_of_input(d1::base_filter& bf) {
+ __TBB_ASSERT(bf.my_input_buffer, nullptr);
+ __TBB_ASSERT(bf.object_may_be_null(), nullptr);
+ if(bf.is_serial() ) {
+ bf.my_pipeline->end_of_input.store(true, std::memory_order_relaxed);
+ } else {
+ __TBB_ASSERT(bf.my_input_buffer->end_of_input_tls_allocated, nullptr);
+ bf.my_input_buffer->set_my_tls_end_of_input();
+ }
+}
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
diff --git a/contrib/libs/tbb/src/tbb/private_server.cpp b/contrib/libs/tbb/src/tbb/private_server.cpp
new file mode 100644
index 0000000000..bc0af84bb4
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/private_server.cpp
@@ -0,0 +1,420 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "oneapi/tbb/cache_aligned_allocator.h"
+
+#include "rml_tbb.h"
+#include "rml_thread_monitor.h"
+
+#include "scheduler_common.h"
+#include "governor.h"
+#include "misc.h"
+
+#include <atomic>
+
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+namespace rml {
+
+using rml::internal::thread_monitor;
+typedef thread_monitor::handle_type thread_handle;
+
+class private_server;
+
+class private_worker: no_copy {
+private:
+ //! State in finite-state machine that controls the worker.
+ /** State diagram:
+ init --> starting --> normal
+ | | |
+ | V |
+ \------> quit <------/
+ */
+ enum state_t {
+ //! *this is initialized
+ st_init,
+ //! *this has associated thread that is starting up.
+ st_starting,
+ //! Associated thread is doing normal life sequence.
+ st_normal,
+ //! Associated thread has ended normal life sequence and promises to never touch *this again.
+ st_quit
+ };
+ std::atomic<state_t> my_state;
+
+ //! Associated server
+ private_server& my_server;
+
+ //! Associated client
+ tbb_client& my_client;
+
+ //! index used for avoiding the 64K aliasing problem
+ const std::size_t my_index;
+
+ //! Monitor for sleeping when there is no work to do.
+ /** The invariant that holds for sleeping workers is:
+ "my_slack<=0 && my_state==st_normal && I am on server's list of asleep threads" */
+ thread_monitor my_thread_monitor;
+
+ //! Handle of the OS thread associated with this worker
+ thread_handle my_handle;
+
+ //! Link for list of workers that are sleeping or have no associated thread.
+ private_worker* my_next;
+
+ friend class private_server;
+
+ //! Actions executed by the associated thread
+ void run() noexcept;
+
+ //! Wake up associated thread (or launch a thread if there is none)
+ void wake_or_launch();
+
+ //! Called by a thread (usually not the associated thread) to commence termination.
+ void start_shutdown();
+
+ static __RML_DECL_THREAD_ROUTINE thread_routine( void* arg );
+
+ static void release_handle(thread_handle my_handle, bool join);
+
+protected:
+ private_worker( private_server& server, tbb_client& client, const std::size_t i ) :
+ my_state(st_init), my_server(server), my_client(client), my_index(i),
+ my_thread_monitor(), my_handle(), my_next()
+ {}
+};
+
+static const std::size_t cache_line_size = tbb::detail::max_nfs_size;
+
+#if _MSC_VER && !defined(__INTEL_COMPILER)
+ // Suppress overzealous compiler warnings about uninstantiable class
+ #pragma warning(push)
+ #pragma warning(disable:4510 4610)
+#endif
+class padded_private_worker: public private_worker {
+ char pad[cache_line_size - sizeof(private_worker)%cache_line_size];
+public:
+ padded_private_worker( private_server& server, tbb_client& client, const std::size_t i )
+ : private_worker(server,client,i) { suppress_unused_warning(pad); }
+};
+#if _MSC_VER && !defined(__INTEL_COMPILER)
+ #pragma warning(pop)
+#endif
+
+class private_server: public tbb_server, no_copy {
+private:
+ tbb_client& my_client;
+ //! Maximum number of threads to be created.
+ /** Threads are created lazily, so maximum might not actually be reached. */
+ const tbb_client::size_type my_n_thread;
+
+ //! Stack size for each thread.
+ const std::size_t my_stack_size;
+
+ //! Number of jobs that could use their associated thread minus number of active threads.
+ /** If negative, indicates oversubscription.
+ If positive, indicates that more threads should run.
+ Can be lowered asynchronously, but must be raised only while holding my_asleep_list_mutex,
+ because raising it impacts the invariant for sleeping threads. */
+ std::atomic<int> my_slack;
+
+ //! Counter used to determine when to delete this.
+ std::atomic<int> my_ref_count;
+
+ padded_private_worker* my_thread_array;
+
+ //! List of workers that are asleep or committed to sleeping until notified by another thread.
+ std::atomic<private_worker*> my_asleep_list_root;
+
+ //! Protects my_asleep_list_root
+ typedef scheduler_mutex_type asleep_list_mutex_type;
+ asleep_list_mutex_type my_asleep_list_mutex;
+
+#if TBB_USE_ASSERT
+ std::atomic<int> my_net_slack_requests;
+#endif /* TBB_USE_ASSERT */
+
+ //! Wake up to two sleeping workers, if there are any sleeping.
+ /** The call is used to propagate a chain reaction where each thread wakes up two threads,
+ which in turn each wake up two threads, etc. */
+ void propagate_chain_reaction() {
+ // First test of a double-check idiom. Second test is inside wake_some(0).
+ if( my_asleep_list_root.load(std::memory_order_acquire) )
+ wake_some(0);
+ }
+
+ //! Try to add t to list of sleeping workers
+ bool try_insert_in_asleep_list( private_worker& t );
+
+ //! Equivalent of adding additional_slack to my_slack and waking up to 2 threads if my_slack permits.
+ void wake_some( int additional_slack );
+
+ virtual ~private_server();
+
+ void remove_server_ref() {
+ if( --my_ref_count==0 ) {
+ my_client.acknowledge_close_connection();
+ this->~private_server();
+ tbb::cache_aligned_allocator<private_server>().deallocate( this, 1 );
+ }
+ }
+
+ friend class private_worker;
+public:
+ private_server( tbb_client& client );
+
+ version_type version() const override {
+ return 0;
+ }
+
+ void request_close_connection( bool /*exiting*/ ) override {
+ for( std::size_t i=0; i<my_n_thread; ++i )
+ my_thread_array[i].start_shutdown();
+ remove_server_ref();
+ }
+
+ void yield() override { d0::yield(); }
+
+ void independent_thread_number_changed( int ) override {__TBB_ASSERT(false,NULL);}
+
+ unsigned default_concurrency() const override { return governor::default_num_threads() - 1; }
+
+ void adjust_job_count_estimate( int delta ) override;
+
+#if _WIN32||_WIN64
+ void register_external_thread ( ::rml::server::execution_resource_t& ) override {}
+ void unregister_external_thread ( ::rml::server::execution_resource_t ) override {}
+#endif /* _WIN32||_WIN64 */
+};
+
+//------------------------------------------------------------------------
+// Methods of private_worker
+//------------------------------------------------------------------------
+#if _MSC_VER && !defined(__INTEL_COMPILER)
+ // Suppress overzealous compiler warnings about an initialized variable 'sink_for_alloca' not referenced
+ #pragma warning(push)
+ #pragma warning(disable:4189)
+#endif
+#if __MINGW32__ && __GNUC__==4 &&__GNUC_MINOR__>=2 && !__MINGW64__
+// ensure that stack is properly aligned for TBB threads
+__attribute__((force_align_arg_pointer))
+#endif
+__RML_DECL_THREAD_ROUTINE private_worker::thread_routine( void* arg ) {
+ private_worker* self = static_cast<private_worker*>(arg);
+ AVOID_64K_ALIASING( self->my_index );
+ self->run();
+ return 0;
+}
+#if _MSC_VER && !defined(__INTEL_COMPILER)
+ #pragma warning(pop)
+#endif
+
+void private_worker::release_handle(thread_handle handle, bool join) {
+ if (join)
+ thread_monitor::join(handle);
+ else
+ thread_monitor::detach_thread(handle);
+}
+
+void private_worker::start_shutdown() {
+ state_t expected_state = my_state.load(std::memory_order_acquire);
+ __TBB_ASSERT( expected_state!=st_quit, NULL );
+
+ while( !my_state.compare_exchange_strong( expected_state, st_quit ) );
+
+ if( expected_state==st_normal || expected_state==st_starting ) {
+ // May have invalidated invariant for sleeping, so wake up the thread.
+ // Note that the notify() here occurs without maintaining invariants for my_slack.
+ // It does not matter, because my_state==st_quit overrides checking of my_slack.
+ my_thread_monitor.notify();
+ // Do not need release handle in st_init state,
+ // because in this case the thread wasn't started yet.
+ // For st_starting release is done at launch site.
+ if (expected_state==st_normal)
+ release_handle(my_handle, governor::does_client_join_workers(my_client));
+ } else if( expected_state==st_init ) {
+ // Perform action that otherwise would be performed by associated thread when it quits.
+ my_server.remove_server_ref();
+ }
+}
+
+void private_worker::run() noexcept {
+ my_server.propagate_chain_reaction();
+
+ // Transiting to st_normal here would require setting my_handle,
+ // which would create race with the launching thread and
+ // complications in handle management on Windows.
+
+ ::rml::job& j = *my_client.create_one_job();
+ while( my_state.load(std::memory_order_acquire)!=st_quit ) {
+ if( my_server.my_slack.load(std::memory_order_acquire)>=0 ) {
+ my_client.process(j);
+ } else {
+ thread_monitor::cookie c;
+ // Prepare to wait
+ my_thread_monitor.prepare_wait(c);
+ // Check/set the invariant for sleeping
+ if( my_state.load(std::memory_order_acquire)!=st_quit && my_server.try_insert_in_asleep_list(*this) ) {
+ my_thread_monitor.commit_wait(c);
+ __TBB_ASSERT( my_state==st_quit || !my_next, "Thread monitor missed a spurious wakeup?" );
+ my_server.propagate_chain_reaction();
+ } else {
+ // Invariant broken
+ my_thread_monitor.cancel_wait();
+ }
+ }
+ }
+ my_client.cleanup(j);
+
+ ++my_server.my_slack;
+ my_server.remove_server_ref();
+}
+
+inline void private_worker::wake_or_launch() {
+ state_t expected_state = st_init;
+ if( my_state.compare_exchange_strong( expected_state, st_starting ) ) {
+ // after this point, remove_server_ref() must be done by created thread
+#if __TBB_USE_WINAPI
+ my_handle = thread_monitor::launch( thread_routine, this, my_server.my_stack_size, &this->my_index );
+#elif __TBB_USE_POSIX
+ {
+ affinity_helper fpa;
+ fpa.protect_affinity_mask( /*restore_process_mask=*/true );
+ my_handle = thread_monitor::launch( thread_routine, this, my_server.my_stack_size );
+ // Implicit destruction of fpa resets original affinity mask.
+ }
+#endif /* __TBB_USE_POSIX */
+ expected_state = st_starting;
+ if ( !my_state.compare_exchange_strong( expected_state, st_normal ) ) {
+ // Do shutdown during startup. my_handle can't be released
+ // by start_shutdown, because my_handle value might be not set yet
+ // at time of transition from st_starting to st_quit.
+ __TBB_ASSERT( expected_state==st_quit, NULL );
+ release_handle(my_handle, governor::does_client_join_workers(my_client));
+ }
+ }
+ else {
+ __TBB_ASSERT( !my_next, "Should not wake a thread while it's still in asleep list" );
+ my_thread_monitor.notify();
+ }
+}
+
+//------------------------------------------------------------------------
+// Methods of private_server
+//------------------------------------------------------------------------
+private_server::private_server( tbb_client& client ) :
+ my_client(client),
+ my_n_thread(client.max_job_count()),
+ my_stack_size(client.min_stack_size()),
+ my_slack(0),
+ my_ref_count(my_n_thread+1),
+ my_thread_array(NULL),
+ my_asleep_list_root(NULL)
+#if TBB_USE_ASSERT
+ , my_net_slack_requests(0)
+#endif /* TBB_USE_ASSERT */
+{
+ my_thread_array = tbb::cache_aligned_allocator<padded_private_worker>().allocate( my_n_thread );
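+ // Construct all workers up front and chain them onto the asleep list; their OS threads are launched lazily by wake_or_launch().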
+ for( std::size_t i=0; i<my_n_thread; ++i ) {
+ private_worker* t = new( &my_thread_array[i] ) padded_private_worker( *this, client, i );
+ t->my_next = my_asleep_list_root.exchange(t, std::memory_order_relaxed);
+ }
+}
+
+private_server::~private_server() {
+ __TBB_ASSERT( my_net_slack_requests==0, NULL );
+ for( std::size_t i=my_n_thread; i--; )
+ my_thread_array[i].~padded_private_worker();
+ tbb::cache_aligned_allocator<padded_private_worker>().deallocate( my_thread_array, my_n_thread );
+ tbb::detail::poison_pointer( my_thread_array );
+}
+
+inline bool private_server::try_insert_in_asleep_list( private_worker& t ) {
+ asleep_list_mutex_type::scoped_lock lock;
+ if( !lock.try_acquire(my_asleep_list_mutex) )
+ return false;
+ // Contribute to slack under the lock so that if another thread takes that unit of slack,
+ // it sees us sleeping on the list and wakes us up.
+ int k = ++my_slack;
+ if( k<=0 ) {
+ t.my_next = my_asleep_list_root.exchange(&t, std::memory_order_relaxed);
+ return true;
+ } else {
+ --my_slack;
+ return false;
+ }
+}
+
+void private_server::wake_some( int additional_slack ) {
+ __TBB_ASSERT( additional_slack>=0, NULL );
+ private_worker* wakee[2];
+ private_worker**w = wakee;
+ {
+ asleep_list_mutex_type::scoped_lock lock(my_asleep_list_mutex);
+ while( my_asleep_list_root.load(std::memory_order_relaxed) && w<wakee+2 ) {
+ if( additional_slack>0 ) {
+ // additional demand does not exceed surplus supply
+ if ( additional_slack+my_slack.load(std::memory_order_acquire)<=0 )
+ break;
+ --additional_slack;
+ } else {
+ // Chain reaction; Try to claim unit of slack
+ int old = my_slack;
+ do {
+ if( old<=0 ) goto done;
+ } while( !my_slack.compare_exchange_strong(old,old-1) );
+ }
+ // Pop sleeping worker to combine with claimed unit of slack
+ auto old = my_asleep_list_root.load(std::memory_order_relaxed);
+ my_asleep_list_root.store(old->my_next, std::memory_order_relaxed);
+ *w++ = old;
+ }
+ if( additional_slack ) {
+ // Contribute our unused slack to my_slack.
+ my_slack += additional_slack;
+ }
+ }
+done:
+ while( w>wakee ) {
+ private_worker* ww = *--w;
+ ww->my_next = NULL;
+ ww->wake_or_launch();
+ }
+}
+
+void private_server::adjust_job_count_estimate( int delta ) {
+#if TBB_USE_ASSERT
+ my_net_slack_requests+=delta;
+#endif /* TBB_USE_ASSERT */
+ if( delta<0 ) {
+ my_slack+=delta;
+ } else if( delta>0 ) {
+ wake_some( delta );
+ }
+}
+
+//! Factory method called from task.cpp to create a private_server.
+tbb_server* make_private_server( tbb_client& client ) {
+ return new( tbb::cache_aligned_allocator<private_server>().allocate(1) ) private_server(client);
+}
+
+} // namespace rml
+} // namespace r1
+} // namespace detail
+} // namespace tbb
+
diff --git a/contrib/libs/tbb/src/tbb/profiling.cpp b/contrib/libs/tbb/src/tbb/profiling.cpp
new file mode 100644
index 0000000000..2603f35b88
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/profiling.cpp
@@ -0,0 +1,265 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "oneapi/tbb/detail/_config.h"
+#include "oneapi/tbb/detail/_template_helpers.h"
+
+#include "main.h"
+#include "itt_notify.h"
+
+#include "oneapi/tbb/profiling.h"
+
+#include <string.h>
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+#if __TBB_USE_ITT_NOTIFY
+bool ITT_Present;
+static std::atomic<bool> ITT_InitializationDone;
+
+static __itt_domain *tbb_domains[d1::ITT_NUM_DOMAINS] = {};
+
+struct resource_string {
+ const char *str;
+ __itt_string_handle *itt_str_handle;
+};
+
+//
+// populate resource strings
+//
+#define TBB_STRING_RESOURCE( index_name, str ) { str, nullptr },
+static resource_string strings_for_itt[] = {
+ #include "oneapi/tbb/detail/_string_resource.h"
+ { "num_resource_strings", nullptr }
+};
+#undef TBB_STRING_RESOURCE
+
+static __itt_string_handle* ITT_get_string_handle(std::uintptr_t idx) {
+ __TBB_ASSERT(idx < NUM_STRINGS, "string handle out of valid range");
+ return idx < NUM_STRINGS ? strings_for_itt[idx].itt_str_handle : NULL;
+}
+
+static void ITT_init_domains() {
+ tbb_domains[d1::ITT_DOMAIN_MAIN] = __itt_domain_create( _T("tbb") );
+ tbb_domains[d1::ITT_DOMAIN_MAIN]->flags = 1;
+ tbb_domains[d1::ITT_DOMAIN_FLOW] = __itt_domain_create( _T("tbb.flow") );
+ tbb_domains[d1::ITT_DOMAIN_FLOW]->flags = 1;
+ tbb_domains[d1::ITT_DOMAIN_ALGO] = __itt_domain_create( _T("tbb.algorithm") );
+ tbb_domains[d1::ITT_DOMAIN_ALGO]->flags = 1;
+}
+
+static void ITT_init_strings() {
+ for ( std::uintptr_t i = 0; i < NUM_STRINGS; ++i ) {
+#if _WIN32||_WIN64
+ strings_for_itt[i].itt_str_handle = __itt_string_handle_createA( strings_for_itt[i].str );
+#else
+ strings_for_itt[i].itt_str_handle = __itt_string_handle_create( strings_for_itt[i].str );
+#endif
+ }
+}
+
+static void ITT_init() {
+ ITT_init_domains();
+ ITT_init_strings();
+}
+
+/** Thread-unsafe lazy one-time initialization of tools interop.
+    Used by both dummy handlers and the general TBB one-time initialization routine. **/
+void ITT_DoUnsafeOneTimeInitialization () {
+    // The double check of ITT_InitializationDone is necessary because the first check
+    // in ITT_DoOneTimeInitialization is not guarded by the __TBB_InitOnce lock.
+ if ( !ITT_InitializationDone ) {
+ ITT_Present = (__TBB_load_ittnotify()!=0);
+ if (ITT_Present) ITT_init();
+ ITT_InitializationDone = true;
+ }
+}
+
+/** Thread-safe lazy one-time initialization of tools interop.
+ Used by dummy handlers only. **/
+extern "C"
+void ITT_DoOneTimeInitialization() {
+ if ( !ITT_InitializationDone ) {
+ __TBB_InitOnce::lock();
+ ITT_DoUnsafeOneTimeInitialization();
+ __TBB_InitOnce::unlock();
+ }
+}
+
+void create_itt_sync(void* ptr, const tchar* objtype, const tchar* objname) {
+ ITT_SYNC_CREATE(ptr, objtype, objname);
+}
+
+void call_itt_notify(int t, void *ptr) {
+ switch (t) {
+ case 0: ITT_NOTIFY(sync_prepare, ptr); break;
+ case 1: ITT_NOTIFY(sync_cancel, ptr); break;
+ case 2: ITT_NOTIFY(sync_acquired, ptr); break;
+ case 3: ITT_NOTIFY(sync_releasing, ptr); break;
+ case 4: ITT_NOTIFY(sync_destroy, ptr); break;
+ }
+}
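+
+// Illustrative mapping: wrapping a contended acquire, a caller would emit
+// call_itt_notify(0, &lock) before blocking and call_itt_notify(2, &lock) once the
+// lock is obtained, i.e. the sync_prepare/sync_acquired pair above (the integer
+// codes are assumed to come from the matching enumeration on the caller's side).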
+
+void itt_set_sync_name(void* obj, const tchar* name) {
+ __itt_sync_rename(obj, name);
+}
+
+const __itt_id itt_null_id = { 0, 0, 0 };
+
+static inline __itt_domain* get_itt_domain(d1::itt_domain_enum idx) {
+ if (tbb_domains[idx] == NULL) {
+ ITT_DoOneTimeInitialization();
+ }
+ return tbb_domains[idx];
+}
+
+static inline void itt_id_make(__itt_id* id, void* addr, unsigned long long extra) {
+ *id = __itt_id_make(addr, extra);
+}
+
+static inline void itt_id_create(const __itt_domain* domain, __itt_id id) {
+ __itt_id_create(domain, id);
+}
+
+void itt_make_task_group(d1::itt_domain_enum domain, void* group, unsigned long long group_extra,
+ void* parent, unsigned long long parent_extra, string_resource_index name_index) {
+ if (__itt_domain* d = get_itt_domain(domain)) {
+ __itt_id group_id = itt_null_id;
+ __itt_id parent_id = itt_null_id;
+ itt_id_make(&group_id, group, group_extra);
+ itt_id_create(d, group_id);
+ if (parent) {
+ itt_id_make(&parent_id, parent, parent_extra);
+ }
+ __itt_string_handle* n = ITT_get_string_handle(name_index);
+ __itt_task_group(d, group_id, parent_id, n);
+ }
+}
+
+void __TBB_EXPORTED_FUNC itt_metadata_str_add(d1::itt_domain_enum domain, void *addr, unsigned long long addr_extra,
+ string_resource_index key, const char *value ) {
+ if ( __itt_domain *d = get_itt_domain( domain ) ) {
+ __itt_id id = itt_null_id;
+ itt_id_make( &id, addr, addr_extra );
+ __itt_string_handle *k = ITT_get_string_handle(key);
+ size_t value_length = strlen( value );
+#if _WIN32||_WIN64
+ __itt_metadata_str_addA(d, id, k, value, value_length);
+#else
+ __itt_metadata_str_add(d, id, k, value, value_length);
+#endif
+ }
+}
+
+void __TBB_EXPORTED_FUNC itt_metadata_ptr_add(d1::itt_domain_enum domain, void *addr, unsigned long long addr_extra,
+ string_resource_index key, void *value ) {
+ if ( __itt_domain *d = get_itt_domain( domain ) ) {
+ __itt_id id = itt_null_id;
+ itt_id_make( &id, addr, addr_extra );
+ __itt_string_handle *k = ITT_get_string_handle(key);
+#if __TBB_x86_32
+ __itt_metadata_add(d, id, k, __itt_metadata_u32, 1, value);
+#else
+ __itt_metadata_add(d, id, k, __itt_metadata_u64, 1, value);
+#endif
+ }
+}
+
+void __TBB_EXPORTED_FUNC itt_relation_add(d1::itt_domain_enum domain, void *addr0, unsigned long long addr0_extra,
+ itt_relation relation, void *addr1, unsigned long long addr1_extra ) {
+ if ( __itt_domain *d = get_itt_domain( domain ) ) {
+ __itt_id id0 = itt_null_id;
+ __itt_id id1 = itt_null_id;
+ itt_id_make( &id0, addr0, addr0_extra );
+ itt_id_make( &id1, addr1, addr1_extra );
+ __itt_relation_add( d, id0, (__itt_relation)relation, id1 );
+ }
+}
+
+void __TBB_EXPORTED_FUNC itt_task_begin(d1::itt_domain_enum domain, void* task, unsigned long long task_extra,
+ void* parent, unsigned long long parent_extra, string_resource_index name_index) {
+ if (__itt_domain* d = get_itt_domain(domain)) {
+ __itt_id task_id = itt_null_id;
+ __itt_id parent_id = itt_null_id;
+ if (task) {
+ itt_id_make(&task_id, task, task_extra);
+ }
+ if (parent) {
+ itt_id_make(&parent_id, parent, parent_extra);
+ }
+ __itt_string_handle* n = ITT_get_string_handle(name_index);
+ __itt_task_begin(d, task_id, parent_id, n);
+ }
+}
+
+void __TBB_EXPORTED_FUNC itt_task_end(d1::itt_domain_enum domain) {
+ if (__itt_domain* d = get_itt_domain(domain)) {
+ __itt_task_end(d);
+ }
+}
+
+void __TBB_EXPORTED_FUNC itt_region_begin(d1::itt_domain_enum domain, void *region, unsigned long long region_extra,
+ void *parent, unsigned long long parent_extra, string_resource_index /* name_index */ ) {
+ if ( __itt_domain *d = get_itt_domain( domain ) ) {
+ __itt_id region_id = itt_null_id;
+ __itt_id parent_id = itt_null_id;
+ itt_id_make( &region_id, region, region_extra );
+ if ( parent ) {
+ itt_id_make( &parent_id, parent, parent_extra );
+ }
+ __itt_region_begin( d, region_id, parent_id, NULL );
+ }
+}
+
+void __TBB_EXPORTED_FUNC itt_region_end(d1::itt_domain_enum domain, void *region, unsigned long long region_extra ) {
+ if ( __itt_domain *d = get_itt_domain( domain ) ) {
+ __itt_id region_id = itt_null_id;
+ itt_id_make( &region_id, region, region_extra );
+ __itt_region_end( d, region_id );
+ }
+}
+
+#else
+void create_itt_sync(void* /*ptr*/, const tchar* /*objtype*/, const tchar* /*objname*/) {}
+void call_itt_notify(int /*t*/, void* /*ptr*/) {}
+void itt_set_sync_name(void* /*obj*/, const tchar* /*name*/) {}
+void itt_make_task_group(d1::itt_domain_enum /*domain*/, void* /*group*/, unsigned long long /*group_extra*/,
+ void* /*parent*/, unsigned long long /*parent_extra*/, string_resource_index /*name_index*/) {}
+void itt_metadata_str_add(d1::itt_domain_enum /*domain*/, void* /*addr*/, unsigned long long /*addr_extra*/,
+ string_resource_index /*key*/, const char* /*value*/ ) { }
+void itt_metadata_ptr_add(d1::itt_domain_enum /*domain*/, void * /*addr*/, unsigned long long /*addr_extra*/,
+ string_resource_index /*key*/, void * /*value*/ ) {}
+void itt_relation_add(d1::itt_domain_enum /*domain*/, void* /*addr0*/, unsigned long long /*addr0_extra*/,
+ itt_relation /*relation*/, void* /*addr1*/, unsigned long long /*addr1_extra*/ ) { }
+void itt_task_begin(d1::itt_domain_enum /*domain*/, void* /*task*/, unsigned long long /*task_extra*/,
+ void* /*parent*/, unsigned long long /*parent_extra*/, string_resource_index /*name_index*/ ) { }
+void itt_task_end(d1::itt_domain_enum /*domain*/ ) { }
+void itt_region_begin(d1::itt_domain_enum /*domain*/, void* /*region*/, unsigned long long /*region_extra*/,
+ void* /*parent*/, unsigned long long /*parent_extra*/, string_resource_index /*name_index*/ ) { }
+void itt_region_end(d1::itt_domain_enum /*domain*/, void* /*region*/, unsigned long long /*region_extra*/ ) { }
+#endif /* __TBB_USE_ITT_NOTIFY */
+
+const tchar
+ *SyncType_Scheduler = _T("%Constant")
+ ;
+const tchar
+ *SyncObj_ContextsList = _T("TBB Scheduler")
+ ;
+} // namespace r1
+} // namespace detail
+} // namespace tbb
diff --git a/contrib/libs/tbb/src/tbb/queuing_rw_mutex.cpp b/contrib/libs/tbb/src/tbb/queuing_rw_mutex.cpp
new file mode 100644
index 0000000000..cfdc4d3c2a
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/queuing_rw_mutex.cpp
@@ -0,0 +1,558 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+/** Before making any changes in the implementation, please emulate algorithmic changes
+    with the SPIN tool using <TBB directory>/tools/spin_models/ReaderWriterMutex.pml.
+    Some code may look as if it "can be restructured", but its structure does matter! */
+
+#include "oneapi/tbb/queuing_rw_mutex.h"
+#include "oneapi/tbb/detail/_assert.h"
+#include "oneapi/tbb/detail/_utils.h"
+#include "itt_notify.h"
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+ // Workaround for overzealous compiler warnings
+ #pragma warning (push)
+ #pragma warning (disable: 4311 4312)
+#endif
+
+//! A view of a T* with additional functionality for twiddling low-order bits.
+template<typename T>
+class tricky_atomic_pointer {
+public:
+ using word = uintptr_t;
+
+ static T* fetch_add( std::atomic<word>& location, word addend, std::memory_order memory_order ) {
+ return reinterpret_cast<T*>(location.fetch_add(addend, memory_order));
+ }
+
+ static T* exchange( std::atomic<word>& location, T* value, std::memory_order memory_order ) {
+ return reinterpret_cast<T*>(location.exchange(reinterpret_cast<word>(value), memory_order));
+ }
+
+ static T* compare_exchange_strong( std::atomic<word>& obj, const T* expected, const T* desired, std::memory_order memory_order ) {
+ word expd = reinterpret_cast<word>(expected);
+ obj.compare_exchange_strong(expd, reinterpret_cast<word>(desired), memory_order);
+ return reinterpret_cast<T*>(expd);
+ }
+
+ static void store( std::atomic<word>& location, const T* value, std::memory_order memory_order ) {
+ location.store(reinterpret_cast<word>(value), memory_order);
+ }
+
+ static T* load( std::atomic<word>& location, std::memory_order memory_order ) {
+ return reinterpret_cast<T*>(location.load(memory_order));
+ }
+
+ static void spin_wait_while_eq(const std::atomic<word>& location, const T* value) {
+ tbb::detail::d0::spin_wait_while_eq(location, reinterpret_cast<word>(value) );
+ }
+
+ T* & ref;
+ tricky_atomic_pointer( T*& original ) : ref(original) {};
+ tricky_atomic_pointer(const tricky_atomic_pointer&) = delete;
+ tricky_atomic_pointer& operator=(const tricky_atomic_pointer&) = delete;
+ T* operator&( const word operand2 ) const {
+ return reinterpret_cast<T*>( reinterpret_cast<word>(ref) & operand2 );
+ }
+ T* operator|( const word operand2 ) const {
+ return reinterpret_cast<T*>( reinterpret_cast<word>(ref) | operand2 );
+ }
+};
+
+using tricky_pointer = tricky_atomic_pointer<queuing_rw_mutex::scoped_lock>;
+
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+ // Workaround for overzealous compiler warnings
+ #pragma warning (pop)
+#endif
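+
+// Usage sketch (illustrative only; mirrors how the code below tags queue links).
+// Because scoped_lock objects are at least 2-byte aligned, the lowest bit of a stored
+// pointer is free to carry a status flag; get_node() is a hypothetical source of a node:
+//
+//     std::atomic<tricky_pointer::word> link{0};
+//     queuing_rw_mutex::scoped_lock* node = get_node();
+//     tricky_pointer::store(link, tricky_pointer(node) | 0x1,   // publish with flag set
+//                           std::memory_order_release);
+//     queuing_rw_mutex::scoped_lock* p =
+//         tricky_pointer::load(link, std::memory_order_acquire);
+//     bool flagged = reinterpret_cast<tricky_pointer::word>(p) & 0x1;
+//     p = tricky_pointer(p) & ~tricky_pointer::word(1);         // strip flag before use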
+
+//! Flag bits in a state_t that specify information about a locking request.
+enum state_t_flags : unsigned char {
+ STATE_NONE = 0,
+ STATE_WRITER = 1<<0,
+ STATE_READER = 1<<1,
+ STATE_READER_UNBLOCKNEXT = 1<<2,
+ STATE_ACTIVEREADER = 1<<3,
+ STATE_UPGRADE_REQUESTED = 1<<4,
+ STATE_UPGRADE_WAITING = 1<<5,
+ STATE_UPGRADE_LOSER = 1<<6,
+ STATE_COMBINED_WAITINGREADER = STATE_READER | STATE_READER_UNBLOCKNEXT,
+ STATE_COMBINED_READER = STATE_COMBINED_WAITINGREADER | STATE_ACTIVEREADER,
+ STATE_COMBINED_UPGRADING = STATE_UPGRADE_WAITING | STATE_UPGRADE_LOSER
+};
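+
+// Illustrative use of the combined masks (this is how release() and upgrade_to_writer()
+// below classify a successor in one test instead of comparing individual values):
+//
+//     if( next->my_state & STATE_COMBINED_WAITINGREADER )      // STATE_READER or
+//         next->my_going.store(1U, std::memory_order_release); // STATE_READER_UNBLOCKNEXT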
+
+static const unsigned char RELEASED = 0;
+static const unsigned char ACQUIRED = 1;
+
+struct queuing_rw_mutex_impl {
+ //! Try to acquire the internal lock
+ /** Returns true if lock was successfully acquired. */
+ static bool try_acquire_internal_lock(d1::queuing_rw_mutex::scoped_lock& s)
+ {
+ auto expected = RELEASED;
+ return s.my_internal_lock.compare_exchange_strong(expected, ACQUIRED);
+ }
+
+ //! Acquire the internal lock
+ static void acquire_internal_lock(d1::queuing_rw_mutex::scoped_lock& s)
+ {
+ // Usually, we would use the test-test-and-set idiom here, with exponential backoff.
+ // But so far, experiments indicate there is no value in doing so here.
+ while( !try_acquire_internal_lock(s) ) {
+ machine_pause(1);
+ }
+ }
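+
+    // For reference, the test-and-test-and-set variant mentioned above would look
+    // roughly like this (illustrative only; rejected because it showed no benefit here):
+    //
+    //     atomic_backoff backoff;
+    //     while( !try_acquire_internal_lock(s) ) {                                       // ...-and-set
+    //         do {
+    //             backoff.pause();
+    //         } while( s.my_internal_lock.load(std::memory_order_relaxed) != RELEASED ); // test first
+    //     }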
+
+ //! Release the internal lock
+ static void release_internal_lock(d1::queuing_rw_mutex::scoped_lock& s)
+ {
+ s.my_internal_lock.store(RELEASED, std::memory_order_release);
+ }
+
+ //! Wait for internal lock to be released
+ static void wait_for_release_of_internal_lock(d1::queuing_rw_mutex::scoped_lock& s)
+ {
+ spin_wait_until_eq(s.my_internal_lock, RELEASED);
+ }
+
+ //! A helper function
+ static void unblock_or_wait_on_internal_lock(d1::queuing_rw_mutex::scoped_lock& s, uintptr_t flag ) {
+ if( flag ) {
+ wait_for_release_of_internal_lock(s);
+ }
+ else {
+ release_internal_lock(s);
+ }
+ }
+
+ //! Mask for low order bit of a pointer.
+ static const tricky_pointer::word FLAG = 0x1;
+
+ static uintptr_t get_flag( d1::queuing_rw_mutex::scoped_lock* ptr ) {
+ return reinterpret_cast<uintptr_t>(ptr) & FLAG;
+ }
+
+ //------------------------------------------------------------------------
+ // Methods of queuing_rw_mutex::scoped_lock
+ //------------------------------------------------------------------------
+
+ //! A method to acquire queuing_rw_mutex lock
+ static void acquire(d1::queuing_rw_mutex& m, d1::queuing_rw_mutex::scoped_lock& s, bool write)
+ {
+ __TBB_ASSERT( !s.my_mutex, "scoped_lock is already holding a mutex");
+
+ // Must set all fields before the exchange, because once the
+ // exchange executes, *this becomes accessible to other threads.
+ s.my_mutex = &m;
+ s.my_prev.store(0U, std::memory_order_relaxed);
+ s.my_next.store(0U, std::memory_order_relaxed);
+ s.my_going.store(0U, std::memory_order_relaxed);
+ s.my_state.store(d1::queuing_rw_mutex::scoped_lock::state_t(write ? STATE_WRITER : STATE_READER), std::memory_order_relaxed);
+ s.my_internal_lock.store(RELEASED, std::memory_order_relaxed);
+
+ queuing_rw_mutex::scoped_lock* predecessor = m.q_tail.exchange(&s, std::memory_order_release);
+
+ if( write ) { // Acquiring for write
+
+ if( predecessor ) {
+ ITT_NOTIFY(sync_prepare, s.my_mutex);
+ predecessor = tricky_pointer(predecessor) & ~FLAG;
+ __TBB_ASSERT( !( tricky_pointer(predecessor) & FLAG ), "use of corrupted pointer!" );
+ #if TBB_USE_ASSERT
+ atomic_fence(std::memory_order_seq_cst); // on "m.q_tail"
+ __TBB_ASSERT( !predecessor->my_next, "the predecessor has another successor!");
+ #endif
+ tricky_pointer::store(predecessor->my_next, &s, std::memory_order_release);
+ spin_wait_until_eq(s.my_going, 1U);
+ }
+
+ } else { // Acquiring for read
+ #if __TBB_USE_ITT_NOTIFY
+ bool sync_prepare_done = false;
+ #endif
+ if( predecessor ) {
+ unsigned char pred_state;
+ __TBB_ASSERT( !s.my_prev, "the predecessor is already set" );
+ if( tricky_pointer(predecessor) & FLAG ) {
+ /* this is only possible if predecessor is an upgrading reader and it signals us to wait */
+ pred_state = STATE_UPGRADE_WAITING;
+ predecessor = tricky_pointer(predecessor) & ~FLAG;
+ } else {
+ // Load predecessor->my_state now, because once predecessor->my_next becomes
+ // non-NULL, we must assume that *predecessor might be destroyed.
+ pred_state = STATE_READER;
+ predecessor->my_state.compare_exchange_strong(pred_state, STATE_READER_UNBLOCKNEXT, std::memory_order_acq_rel);
+ }
+ tricky_pointer::store(s.my_prev, predecessor, std::memory_order_relaxed);
+ __TBB_ASSERT( !( tricky_pointer(predecessor) & FLAG ), "use of corrupted pointer!" );
+ #if TBB_USE_ASSERT
+ atomic_fence(std::memory_order_seq_cst); // on "m.q_tail"
+ __TBB_ASSERT( !predecessor->my_next, "the predecessor has another successor!");
+ #endif
+ tricky_pointer::store(predecessor->my_next, &s, std::memory_order_release);
+ if( pred_state != STATE_ACTIVEREADER ) {
+ #if __TBB_USE_ITT_NOTIFY
+ sync_prepare_done = true;
+ ITT_NOTIFY(sync_prepare, s.my_mutex);
+ #endif
+ spin_wait_until_eq(s.my_going, 1U);
+ }
+ }
+
+ // The protected state must have been acquired here before it can be further released to any other reader(s):
+ unsigned char old_state = STATE_READER;
+ s.my_state.compare_exchange_strong(old_state, STATE_ACTIVEREADER, std::memory_order_acq_rel);
+ if( old_state!=STATE_READER ) {
+#if __TBB_USE_ITT_NOTIFY
+ if( !sync_prepare_done )
+ ITT_NOTIFY(sync_prepare, s.my_mutex);
+#endif
+ // Failed to become active reader -> need to unblock the next waiting reader first
+ __TBB_ASSERT( s.my_state==STATE_READER_UNBLOCKNEXT, "unexpected state" );
+ spin_wait_while_eq(s.my_next, 0U);
+                /* my_state must be changed before unblocking the next waiter; otherwise that waiter
+                   might finish, and another thread could pick up our old state and stay blocked */
+ s.my_state.store(STATE_ACTIVEREADER, std::memory_order_relaxed);
+ tricky_pointer::load(s.my_next, std::memory_order_relaxed)->my_going.store(1U, std::memory_order_release);
+ }
+ __TBB_ASSERT( s.my_state==STATE_ACTIVEREADER, "unlocked reader is active reader" );
+ }
+
+ ITT_NOTIFY(sync_acquired, s.my_mutex);
+
+ // Force acquire so that user's critical section receives correct values
+ // from processor that was previously in the user's critical section.
+ atomic_fence(std::memory_order_acquire);
+ }
+
+ //! A method to acquire queuing_rw_mutex if it is free
+ static bool try_acquire(d1::queuing_rw_mutex& m, d1::queuing_rw_mutex::scoped_lock& s, bool write)
+ {
+ __TBB_ASSERT( !s.my_mutex, "scoped_lock is already holding a mutex");
+
+ if( m.q_tail.load(std::memory_order_relaxed) )
+ return false; // Someone already took the lock
+
+ // Must set all fields before the exchange, because once the
+ // exchange executes, *this becomes accessible to other threads.
+ s.my_prev.store(0U, std::memory_order_relaxed);
+ s.my_next.store(0U, std::memory_order_relaxed);
+ s.my_going.store(0U, std::memory_order_relaxed); // TODO: remove dead assignment?
+ s.my_state.store(d1::queuing_rw_mutex::scoped_lock::state_t(write ? STATE_WRITER : STATE_ACTIVEREADER), std::memory_order_relaxed);
+ s.my_internal_lock.store(RELEASED, std::memory_order_relaxed);
+
+ // The CAS must have release semantics, because we are
+ // "sending" the fields initialized above to other processors.
+ d1::queuing_rw_mutex::scoped_lock* expected = nullptr;
+ if( !m.q_tail.compare_exchange_strong(expected, &s, std::memory_order_release) )
+ return false; // Someone already took the lock
+ // Force acquire so that user's critical section receives correct values
+ // from processor that was previously in the user's critical section.
+ atomic_fence(std::memory_order_acquire);
+ s.my_mutex = &m;
+ ITT_NOTIFY(sync_acquired, s.my_mutex);
+ return true;
+ }
+
+ //! A method to release queuing_rw_mutex lock
+ static void release(d1::queuing_rw_mutex::scoped_lock& s) {
+ __TBB_ASSERT(s.my_mutex!=nullptr, "no lock acquired");
+
+ ITT_NOTIFY(sync_releasing, s.my_mutex);
+
+ if( s.my_state.load(std::memory_order_relaxed) == STATE_WRITER ) { // Acquired for write
+
+ // The logic below is the same as "writerUnlock", but elides
+ // "return" from the middle of the routine.
+ // In the statement below, acquire semantics of reading my_next is required
+ // so that following operations with fields of my_next are safe.
+ d1::queuing_rw_mutex::scoped_lock* next = tricky_pointer::load(s.my_next, std::memory_order_acquire);
+ if( !next ) {
+ d1::queuing_rw_mutex::scoped_lock* expected = &s;
+ if( s.my_mutex->q_tail.compare_exchange_strong(expected, nullptr, std::memory_order_release) ) {
+ // this was the only item in the queue, and the queue is now empty.
+ goto done;
+ }
+ spin_wait_while_eq( s.my_next, 0U );
+ next = tricky_pointer::load(s.my_next, std::memory_order_acquire);
+ }
+ next->my_going.store(2U, std::memory_order_relaxed); // protect next queue node from being destroyed too early
+ if( next->my_state==STATE_UPGRADE_WAITING ) {
+            // The next node waiting for an upgrade means this writer was upgraded earlier.
+ acquire_internal_lock(s);
+ // Responsibility transition, the one who reads uncorrupted my_prev will do release.
+ d1::queuing_rw_mutex::scoped_lock* tmp = tricky_pointer::exchange(next->my_prev, nullptr, std::memory_order_release);
+ next->my_state.store(STATE_UPGRADE_LOSER, std::memory_order_relaxed);
+ next->my_going.store(1U, std::memory_order_release);
+ unblock_or_wait_on_internal_lock(s, get_flag(tmp));
+ } else {
+ // next->state cannot be STATE_UPGRADE_REQUESTED
+ __TBB_ASSERT( next->my_state & (STATE_COMBINED_WAITINGREADER | STATE_WRITER), "unexpected state" );
+ __TBB_ASSERT( !( next->my_prev.load() & FLAG ), "use of corrupted pointer!" );
+ tricky_pointer::store(next->my_prev, nullptr, std::memory_order_relaxed);
+ next->my_going.store(1U, std::memory_order_release);
+ }
+
+ } else { // Acquired for read
+
+ queuing_rw_mutex::scoped_lock *tmp = nullptr;
+ retry:
+ // Addition to the original paper: Mark my_prev as in use
+ queuing_rw_mutex::scoped_lock *predecessor = tricky_pointer::fetch_add(s.my_prev, FLAG, std::memory_order_acquire);
+
+ if( predecessor ) {
+ if( !(try_acquire_internal_lock(*predecessor)) )
+ {
+ // Failed to acquire the lock on predecessor. The predecessor either unlinks or upgrades.
+ // In the second case, it could or could not know my "in use" flag - need to check
+ // Responsibility transition, the one who reads uncorrupted my_prev will do release.
+ tmp = tricky_pointer::compare_exchange_strong(s.my_prev, tricky_pointer(predecessor) | FLAG, predecessor, std::memory_order_release);
+ if( !(tricky_pointer(tmp) & FLAG) ) {
+ // Wait for the predecessor to change my_prev (e.g. during unlink)
+ // TODO: spin_wait condition seems never reachable
+ tricky_pointer::spin_wait_while_eq( s.my_prev, tricky_pointer(predecessor)|FLAG );
+ // Now owner of predecessor is waiting for _us_ to release its lock
+ release_internal_lock(*predecessor);
+ }
+ // else the "in use" flag is back -> the predecessor didn't get it and will release itself; nothing to do
+
+ tmp = nullptr;
+ goto retry;
+ }
+ __TBB_ASSERT(predecessor && predecessor->my_internal_lock.load(std::memory_order_relaxed)==ACQUIRED, "predecessor's lock is not acquired");
+ tricky_pointer::store(s.my_prev, predecessor, std::memory_order_relaxed);
+ acquire_internal_lock(s);
+
+ tricky_pointer::store(predecessor->my_next, nullptr, std::memory_order_release);
+
+ d1::queuing_rw_mutex::scoped_lock* expected = &s;
+ if( !tricky_pointer::load(s.my_next, std::memory_order_relaxed) && !s.my_mutex->q_tail.compare_exchange_strong(expected, predecessor, std::memory_order_release) ) {
+ spin_wait_while_eq( s.my_next, 0U );
+ }
+ __TBB_ASSERT( !(s.my_next.load() & FLAG), "use of corrupted pointer" );
+
+ // ensure acquire semantics of reading 'my_next'
+ if(d1::queuing_rw_mutex::scoped_lock *const l_next = tricky_pointer::load(s.my_next, std::memory_order_acquire) ) { // I->next != nil, TODO: rename to next after clearing up and adapting the n in the comment two lines below
+ // Equivalent to I->next->prev = I->prev but protected against (prev[n]&FLAG)!=0
+ tmp = tricky_pointer::exchange(l_next->my_prev, predecessor, std::memory_order_release);
+ // I->prev->next = I->next;
+ __TBB_ASSERT(tricky_pointer::load(s.my_prev, std::memory_order_relaxed)==predecessor, nullptr);
+ predecessor->my_next.store(s.my_next.load(std::memory_order_relaxed), std::memory_order_release);
+ }
+ // Safe to release in the order opposite to acquiring which makes the code simpler
+ release_internal_lock(*predecessor);
+
+ } else { // No predecessor when we looked
+ acquire_internal_lock(s); // "exclusiveLock(&I->EL)"
+ d1::queuing_rw_mutex::scoped_lock* next = tricky_pointer::load(s.my_next, std::memory_order_acquire);
+ if( !next ) {
+ d1::queuing_rw_mutex::scoped_lock* expected = &s;
+ if( !s.my_mutex->q_tail.compare_exchange_strong(expected, nullptr, std::memory_order_release) ) {
+ spin_wait_while_eq( s.my_next, 0U );
+ next = tricky_pointer::load(s.my_next, std::memory_order_relaxed);
+ } else {
+ goto unlock_self;
+ }
+ }
+ next->my_going.store(2U, std::memory_order_relaxed);
+ // Responsibility transition, the one who reads uncorrupted my_prev will do release.
+ tmp = tricky_pointer::exchange(next->my_prev, nullptr, std::memory_order_release);
+ next->my_going.store(1U, std::memory_order_release);
+ }
+ unlock_self:
+ unblock_or_wait_on_internal_lock(s, get_flag(tmp));
+ }
+ done:
+ spin_wait_while_eq( s.my_going, 2U );
+
+ s.initialize();
+ }
+
+ static bool downgrade_to_reader(d1::queuing_rw_mutex::scoped_lock& s) {
+ if ( s.my_state.load(std::memory_order_relaxed) == STATE_ACTIVEREADER ) return true; // Already a reader
+
+ ITT_NOTIFY(sync_releasing, s.my_mutex);
+ s.my_state.store(STATE_READER, std::memory_order_relaxed);
+ if( ! tricky_pointer::load(s.my_next, std::memory_order_relaxed)) {
+ // the following load of q_tail must not be reordered with setting STATE_READER above
+ if( &s==s.my_mutex->q_tail.load() ) {
+ unsigned char old_state = STATE_READER;
+ s.my_state.compare_exchange_strong(old_state, STATE_ACTIVEREADER, std::memory_order_release);
+ if( old_state==STATE_READER )
+ return true; // Downgrade completed
+ }
+ /* wait for the next to register */
+ spin_wait_while_eq( s.my_next, 0U );
+ }
+ d1::queuing_rw_mutex::scoped_lock *const next = tricky_pointer::load(s.my_next, std::memory_order_acquire);
+ __TBB_ASSERT( next, "still no successor at this point!" );
+ if( next->my_state & STATE_COMBINED_WAITINGREADER )
+ next->my_going.store(1U, std::memory_order_release);
+ else if( next->my_state==STATE_UPGRADE_WAITING )
+            // The next node waiting for an upgrade means this writer was upgraded earlier.
+ next->my_state.store(STATE_UPGRADE_LOSER, std::memory_order_relaxed);
+        s.my_state.store(STATE_ACTIVEREADER, std::memory_order_relaxed);
+ return true;
+ }
+
+ static bool upgrade_to_writer(d1::queuing_rw_mutex::scoped_lock& s) {
+ if ( s.my_state.load(std::memory_order_relaxed) == STATE_WRITER ) return true; // Already a writer
+
+        __TBB_ASSERT( s.my_state==STATE_ACTIVEREADER, "only an active reader can be upgraded" );
+
+ queuing_rw_mutex::scoped_lock * tmp;
+ queuing_rw_mutex::scoped_lock * me = &s;
+
+ ITT_NOTIFY(sync_releasing, s.my_mutex);
+ s.my_state.store(STATE_UPGRADE_REQUESTED, std::memory_order_relaxed);
+ requested:
+ __TBB_ASSERT( !(s.my_next.load() & FLAG), "use of corrupted pointer!" );
+ acquire_internal_lock(s);
+ d1::queuing_rw_mutex::scoped_lock* expected = &s;
+ if( !s.my_mutex->q_tail.compare_exchange_strong(expected, tricky_pointer(me)|FLAG, std::memory_order_release) ) {
+ spin_wait_while_eq( s.my_next, 0U );
+ queuing_rw_mutex::scoped_lock * next;
+ next = tricky_pointer::fetch_add(s.my_next, FLAG, std::memory_order_acquire);
+ unsigned short n_state = next->my_state;
+            /* the next reader can be blocked by our state; the best thing to do is to unblock it */
+ if( n_state & STATE_COMBINED_WAITINGREADER )
+ next->my_going.store(1U, std::memory_order_release);
+ // Responsibility transition, the one who reads uncorrupted my_prev will do release.
+ tmp = tricky_pointer::exchange(next->my_prev, &s, std::memory_order_release);
+ unblock_or_wait_on_internal_lock(s, get_flag(tmp));
+ if( n_state & (STATE_COMBINED_READER | STATE_UPGRADE_REQUESTED) ) {
+ // save next|FLAG for simplicity of following comparisons
+ tmp = tricky_pointer(next)|FLAG;
+ for( atomic_backoff b; tricky_pointer::load(s.my_next, std::memory_order_relaxed)==tmp; b.pause() ) {
+ if( s.my_state & STATE_COMBINED_UPGRADING ) {
+ if( tricky_pointer::load(s.my_next, std::memory_order_acquire)==tmp )
+ tricky_pointer::store(s.my_next, next, std::memory_order_relaxed);
+ goto waiting;
+ }
+ }
+ __TBB_ASSERT(tricky_pointer::load(s.my_next, std::memory_order_relaxed) != (tricky_pointer(next)|FLAG), nullptr);
+ goto requested;
+ } else {
+ __TBB_ASSERT( n_state & (STATE_WRITER | STATE_UPGRADE_WAITING), "unexpected state");
+ __TBB_ASSERT( (tricky_pointer(next)|FLAG) == tricky_pointer::load(s.my_next, std::memory_order_relaxed), nullptr);
+ tricky_pointer::store(s.my_next, next, std::memory_order_relaxed);
+ }
+ } else {
+ /* We are in the tail; whoever comes next is blocked by q_tail&FLAG */
+ release_internal_lock(s);
+ } // if( this != my_mutex->q_tail... )
+ {
+ unsigned char old_state = STATE_UPGRADE_REQUESTED;
+ s.my_state.compare_exchange_strong(old_state, STATE_UPGRADE_WAITING, std::memory_order_acquire);
+ }
+ waiting:
+ __TBB_ASSERT( !( s.my_next.load(std::memory_order_relaxed) & FLAG ), "use of corrupted pointer!" );
+ __TBB_ASSERT( s.my_state & STATE_COMBINED_UPGRADING, "wrong state at upgrade waiting_retry" );
+ __TBB_ASSERT( me==&s, nullptr );
+ ITT_NOTIFY(sync_prepare, s.my_mutex);
+ /* if no one was blocked by the "corrupted" q_tail, turn it back */
+ expected = tricky_pointer(me)|FLAG;
+ s.my_mutex->q_tail.compare_exchange_strong(expected, &s, std::memory_order_release);
+ queuing_rw_mutex::scoped_lock * predecessor;
+ // Mark my_prev as 'in use' to prevent predecessor from releasing
+ predecessor = tricky_pointer::fetch_add(s.my_prev, FLAG, std::memory_order_acquire);
+ if( predecessor ) {
+ bool success = try_acquire_internal_lock(*predecessor);
+ {
+                // While the predecessor pointer (my_prev) is in use (FLAG is set), we can safely update the node's state.
+                // A corrupted pointer transfers the responsibility for releasing the predecessor's node to us.
+ unsigned char old_state = STATE_UPGRADE_REQUESTED;
+ predecessor->my_state.compare_exchange_strong(old_state, STATE_UPGRADE_WAITING, std::memory_order_release);
+ }
+ if( !success ) {
+ // Responsibility transition, the one who reads uncorrupted my_prev will do release.
+ tmp = tricky_pointer::compare_exchange_strong(s.my_prev, tricky_pointer(predecessor)|FLAG, predecessor, std::memory_order_release);
+ if( tricky_pointer(tmp) & FLAG ) {
+ tricky_pointer::spin_wait_while_eq(s.my_prev, predecessor);
+ predecessor = tricky_pointer::load(s.my_prev, std::memory_order_relaxed);
+ } else {
+ // TODO: spin_wait condition seems never reachable
+ tricky_pointer::spin_wait_while_eq(s.my_prev, tricky_pointer(predecessor)|FLAG);
+ release_internal_lock(*predecessor);
+ }
+ } else {
+ tricky_pointer::store(s.my_prev, predecessor, std::memory_order_relaxed);
+ release_internal_lock(*predecessor);
+ tricky_pointer::spin_wait_while_eq(s.my_prev, predecessor);
+ predecessor = tricky_pointer::load(s.my_prev, std::memory_order_relaxed);
+ }
+ if( predecessor )
+ goto waiting;
+ } else {
+ tricky_pointer::store(s.my_prev, nullptr, std::memory_order_relaxed);
+ }
+ __TBB_ASSERT( !predecessor && !s.my_prev, nullptr );
+
+ // additional lifetime issue prevention checks
+ // wait for the successor to finish working with my fields
+ wait_for_release_of_internal_lock(s);
+ // now wait for the predecessor to finish working with my fields
+ spin_wait_while_eq( s.my_going, 2U );
+
+ // Acquire critical section indirectly from previous owner or directly from predecessor (TODO: not clear).
+ atomic_fence(std::memory_order_acquire); // on either "my_mutex->q_tail" or "my_going" (TODO: not clear)
+
+ bool result = ( s.my_state != STATE_UPGRADE_LOSER );
+ s.my_state.store(STATE_WRITER, std::memory_order_relaxed);
+ s.my_going.store(1U, std::memory_order_relaxed);
+
+ ITT_NOTIFY(sync_acquired, s.my_mutex);
+ return result;
+ }
+
+ static void construct(d1::queuing_rw_mutex& m) {
+ suppress_unused_warning(m);
+ ITT_SYNC_CREATE(&m, _T("tbb::queuing_rw_mutex"), _T(""));
+ }
+};
+
+void __TBB_EXPORTED_FUNC acquire(d1::queuing_rw_mutex& m, d1::queuing_rw_mutex::scoped_lock& s, bool write) {
+ queuing_rw_mutex_impl::acquire(m, s, write);
+}
+
+bool __TBB_EXPORTED_FUNC try_acquire(d1::queuing_rw_mutex& m, d1::queuing_rw_mutex::scoped_lock& s, bool write) {
+ return queuing_rw_mutex_impl::try_acquire(m, s, write);
+}
+
+void __TBB_EXPORTED_FUNC release(d1::queuing_rw_mutex::scoped_lock& s) {
+ queuing_rw_mutex_impl::release(s);
+}
+
+bool __TBB_EXPORTED_FUNC upgrade_to_writer(d1::queuing_rw_mutex::scoped_lock& s) {
+ return queuing_rw_mutex_impl::upgrade_to_writer(s);
+}
+
+bool __TBB_EXPORTED_FUNC downgrade_to_reader(d1::queuing_rw_mutex::scoped_lock& s) {
+ return queuing_rw_mutex_impl::downgrade_to_reader(s);
+}
+
+void __TBB_EXPORTED_FUNC construct(d1::queuing_rw_mutex& m) {
+ queuing_rw_mutex_impl::construct(m);
+}
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
diff --git a/contrib/libs/tbb/src/tbb/rml_base.h b/contrib/libs/tbb/src/tbb/rml_base.h
new file mode 100644
index 0000000000..9e1705837c
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/rml_base.h
@@ -0,0 +1,163 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+// Header guard and namespace names follow rml conventions.
+
+#ifndef __RML_rml_base_H
+#define __RML_rml_base_H
+
+#include <cstddef>
+
+#if _WIN32||_WIN64
+#include <windows.h>
+#endif /* _WIN32||_WIN64 */
+
+#ifdef RML_PURE_VIRTUAL_HANDLER
+#define RML_PURE(T) {RML_PURE_VIRTUAL_HANDLER(); return (T)0;}
+#else
+#define RML_PURE(T) = 0;
+#endif
+
+namespace rml {
+
+class server;
+
+class versioned_object {
+public:
+ //! A version number
+ typedef unsigned version_type;
+
+ virtual ~versioned_object() {}
+
+ //! Get version of this object
+    /** The version number is incremented when an incompatible change is introduced.
+ The version number is invariant for the lifetime of the object. */
+ virtual version_type version() const RML_PURE(version_type)
+
+};
+
+//! Represents a client's job for an execution context.
+/** A job object is constructed by the client.
+ Not derived from versioned_object because version is same as for client. */
+class job {
+ friend class server;
+};
+
+//! Information that client provides to server when asking for a server.
+/** The instance must endure at least until acknowledge_close_connection is called. */
+class client: public versioned_object {
+public:
+ //! Typedef for convenience of derived classes in other namespaces.
+ typedef ::rml::job job;
+
+ //! Index of a job in a job pool
+ typedef unsigned size_type;
+
+ //! Maximum number of threads that client can exploit profitably if nothing else is running on the machine.
+ /** The returned value should remain invariant for the lifetime of the connection. [idempotent] */
+ virtual size_type max_job_count() const RML_PURE(size_type)
+
+ //! Minimum stack size for each job. 0 means to use default stack size. [idempotent]
+ virtual std::size_t min_stack_size() const RML_PURE(std::size_t)
+
+ //! Server calls this routine when it needs client to create a job object.
+ virtual job* create_one_job() RML_PURE(job*)
+
+ //! Acknowledge that all jobs have been cleaned up.
+ /** Called by server in response to request_close_connection
+ after cleanup(job) has been called for each job. */
+ virtual void acknowledge_close_connection() RML_PURE(void)
+
+ //! Inform client that server is done with *this.
+ /** Client should destroy the job.
+ Not necessarily called by execution context represented by *this.
+ Never called while any other thread is working on the job. */
+ virtual void cleanup( job& ) RML_PURE(void)
+
+ // In general, we should not add new virtual methods, because that would
+ // break derived classes. Think about reserving some vtable slots.
+};
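+
+// A minimal client skeleton, illustrating the contract above (my_job and my_client are
+// hypothetical names, not part of this header):
+//
+//     struct my_job : rml::job {};
+//     class my_client : public rml::client {
+//         version_type version() const override { return 1; }
+//         size_type max_job_count() const override { return 4; }
+//         std::size_t min_stack_size() const override { return 0; }   // use default stack size
+//         job* create_one_job() override { return new my_job; }
+//         void cleanup( job& j ) override { delete static_cast<my_job*>(&j); }
+//         void acknowledge_close_connection() override { /* all jobs are cleaned up */ }
+//     };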
+
+// Information that server provides to client.
+// Virtual functions are routines provided by the server for the client to call.
+class server: public versioned_object {
+public:
+ //! Typedef for convenience of derived classes.
+ typedef ::rml::job job;
+
+#if _WIN32||_WIN64
+ typedef void* execution_resource_t;
+#endif
+
+ //! Request that connection to server be closed.
+ /** Causes each job associated with the client to have its cleanup method called,
+ possibly by a thread different than the thread that created the job.
+ This method can return before all cleanup methods return.
+ Actions that have to wait after all cleanup methods return should be part of
+ client::acknowledge_close_connection.
+ Pass true as exiting if request_close_connection() is called because exit() is
+ called. In that case, it is the client's responsibility to make sure all threads
+ are terminated. In all other cases, pass false. */
+ virtual void request_close_connection( bool exiting = false ) = 0;
+
+ //! Called by client thread when it reaches a point where it cannot make progress until other threads do.
+ virtual void yield() = 0;
+
+ //! Called by client to indicate a change in the number of non-RML threads that are running.
+ /** This is a performance hint to the RML to adjust how many threads it should let run
+ concurrently. The delta is the change in the number of non-RML threads that are running.
+ For example, a value of 1 means the client has started running another thread, and a value
+ of -1 indicates that the client has blocked or terminated one of its threads. */
+ virtual void independent_thread_number_changed( int delta ) = 0;
+
+ //! Default level of concurrency for which RML strives when there are no non-RML threads running.
+ /** Normally, the value is the hardware concurrency minus one.
+ The "minus one" accounts for the thread created by main(). */
+ virtual unsigned default_concurrency() const = 0;
+};
+
+class factory {
+public:
+ //! status results
+ enum status_type {
+ st_success=0,
+ st_connection_exists,
+ st_not_found,
+ st_incompatible
+ };
+
+protected:
+ //! Pointer to routine that waits for server to indicate when client can close itself.
+ status_type (*my_wait_to_close_routine)( factory& );
+
+public:
+ //! Library handle for use by RML.
+#if _WIN32||_WIN64
+ HMODULE library_handle;
+#else
+ void* library_handle;
+#endif /* _WIN32||_WIN64 */
+
+ //! Special marker to keep dll from being unloaded prematurely
+ static const std::size_t c_dont_unload = 1;
+};
+
+//! Typedef for callback functions to print server info
+typedef void (*server_info_callback_t)( void* arg, const char* server_info );
+
+} // namespace rml
+
+#endif /* __RML_rml_base_H */
diff --git a/contrib/libs/tbb/src/tbb/rml_tbb.cpp b/contrib/libs/tbb/src/tbb/rml_tbb.cpp
new file mode 100644
index 0000000000..122e2709f7
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/rml_tbb.cpp
@@ -0,0 +1,113 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "oneapi/tbb/detail/_assert.h"
+
+#include "rml_tbb.h"
+#include "dynamic_link.h"
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+namespace rml {
+
+#define MAKE_SERVER(x) DLD(__TBB_make_rml_server,x)
+#define GET_INFO(x) DLD(__TBB_call_with_my_server_info,x)
+#define SERVER tbb_server
+#define CLIENT tbb_client
+#define FACTORY tbb_factory
+
+#if __TBB_WEAK_SYMBOLS_PRESENT
+ #pragma weak __TBB_make_rml_server
+ #pragma weak __TBB_call_with_my_server_info
+ extern "C" {
+ ::rml::factory::status_type __TBB_make_rml_server( rml::tbb_factory& f, rml::tbb_server*& server, rml::tbb_client& client );
+ void __TBB_call_with_my_server_info( ::rml::server_info_callback_t cb, void* arg );
+ }
+#endif /* __TBB_WEAK_SYMBOLS_PRESENT */
+
+#if TBB_USE_DEBUG
+#define DEBUG_SUFFIX "_debug"
+#else
+#define DEBUG_SUFFIX
+#endif /* TBB_USE_DEBUG */
+
+// RML_SERVER_NAME is the name of the RML server library.
+#if _WIN32 || _WIN64
+#define RML_SERVER_NAME "irml" DEBUG_SUFFIX ".dll"
+#elif __APPLE__
+#define RML_SERVER_NAME "libirml" DEBUG_SUFFIX ".dylib"
+#elif __linux__
+#define RML_SERVER_NAME "libirml" DEBUG_SUFFIX ".so.1"
+#elif __FreeBSD__ || __NetBSD__ || __OpenBSD__ || __sun || _AIX
+#define RML_SERVER_NAME "libirml" DEBUG_SUFFIX ".so"
+#else
+#error Unknown OS
+#endif
+
+const ::rml::versioned_object::version_type CLIENT_VERSION = 2;
+
+#if __TBB_WEAK_SYMBOLS_PRESENT
+ #pragma weak __RML_open_factory
+ #pragma weak __RML_close_factory
+ extern "C" {
+ ::rml::factory::status_type __RML_open_factory ( ::rml::factory&, ::rml::versioned_object::version_type&, ::rml::versioned_object::version_type );
+ void __RML_close_factory( ::rml::factory& f );
+ }
+#endif /* __TBB_WEAK_SYMBOLS_PRESENT */
+
+::rml::factory::status_type FACTORY::open() {
+    // Failure of the following assertion indicates that the factory is already open or was not zero-initialized.
+ __TBB_ASSERT_EX( !library_handle, NULL );
+ status_type (*open_factory_routine)( factory&, version_type&, version_type );
+ dynamic_link_descriptor server_link_table[4] = {
+ DLD(__RML_open_factory,open_factory_routine),
+ MAKE_SERVER(my_make_server_routine),
+ DLD(__RML_close_factory,my_wait_to_close_routine),
+ GET_INFO(my_call_with_server_info_routine),
+ };
+ status_type result;
+ if ( dynamic_link( RML_SERVER_NAME, server_link_table, 4, &library_handle ) ) {
+ version_type server_version;
+ result = (*open_factory_routine)( *this, server_version, CLIENT_VERSION );
+ // server_version can be checked here for incompatibility if necessary.
+ } else {
+ library_handle = NULL;
+ result = st_not_found;
+ }
+ return result;
+}
+
+void FACTORY::close() {
+ if ( library_handle )
+ (*my_wait_to_close_routine)(*this);
+ if ( (size_t)library_handle>FACTORY::c_dont_unload ) {
+ dynamic_unlink(library_handle);
+ library_handle = NULL;
+ }
+}
+
+::rml::factory::status_type FACTORY::make_server( SERVER*& s, CLIENT& c) {
+    // Failure of the following assertion means that the factory was not successfully opened.
+ __TBB_ASSERT_EX( my_make_server_routine, NULL );
+ return (*my_make_server_routine)(*this,s,c);
+}
+
+} // namespace rml
+} // namespace r1
+} // namespace detail
+} // namespace tbb
+
diff --git a/contrib/libs/tbb/src/tbb/rml_tbb.h b/contrib/libs/tbb/src/tbb/rml_tbb.h
new file mode 100644
index 0000000000..de923be1b2
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/rml_tbb.h
@@ -0,0 +1,94 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+// Header guard and namespace names follow TBB conventions.
+
+#ifndef __TBB_rml_tbb_H
+#define __TBB_rml_tbb_H
+
+#include "oneapi/tbb/version.h"
+#include "rml_base.h"
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+namespace rml {
+
+//------------------------------------------------------------------------
+// Classes instantiated by the server
+//------------------------------------------------------------------------
+
+//! Represents a set of oneTBB worker threads provided by the server.
+class tbb_server: public ::rml::server {
+public:
+ //! Inform server of adjustments in the number of workers that the client can profitably use.
+ virtual void adjust_job_count_estimate( int delta ) = 0;
+
+#if _WIN32||_WIN64
+ //! Inform server of a oneTBB external thread.
+ virtual void register_external_thread( execution_resource_t& v ) = 0;
+
+ //! Inform server that the oneTBB external thread is done with its work.
+ virtual void unregister_external_thread( execution_resource_t v ) = 0;
+#endif /* _WIN32||_WIN64 */
+};
+
+//------------------------------------------------------------------------
+// Classes instantiated by the client
+//------------------------------------------------------------------------
+
+class tbb_client: public ::rml::client {
+public:
+ //! Defined by TBB to steal a task and execute it.
+ /** Called by server when it wants an execution context to do some TBB work.
+ The method should return when it is okay for the thread to yield indefinitely. */
+ virtual void process( job& ) RML_PURE(void)
+};
+
+/** Client must ensure that instance is zero-inited, typically by being a file-scope object. */
+class tbb_factory: public ::rml::factory {
+
+ //! Pointer to routine that creates an RML server.
+ status_type (*my_make_server_routine)( tbb_factory&, tbb_server*&, tbb_client& );
+
+ //! Pointer to routine that calls callback function with server version info.
+ void (*my_call_with_server_info_routine)( ::rml::server_info_callback_t cb, void* arg );
+
+public:
+ typedef ::rml::versioned_object::version_type version_type;
+ typedef tbb_client client_type;
+ typedef tbb_server server_type;
+
+ //! Open factory.
+ /** Dynamically links against RML library.
+ Returns st_success, st_incompatible, or st_not_found. */
+ status_type open();
+
+ //! Factory method to be called by client to create a server object.
+ /** Factory must be open.
+        Returns st_success or st_incompatible. */
+ status_type make_server( server_type*&, client_type& );
+
+ //! Close factory
+ void close();
+};
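+
+// Typical connection lifecycle, sketched from the interfaces above (illustrative;
+// my_client stands for some tbb_client implementation and is not defined here):
+//
+//     static tbb_factory factory;                  // file-scope object => zero-inited
+//     if( factory.open() == ::rml::factory::st_success ) {
+//         tbb_server* server = nullptr;
+//         factory.make_server( server, my_client );
+//         server->adjust_job_count_estimate( 4 );  // ask for up to four workers
+//         // ... server calls my_client.process(job) on its worker threads ...
+//         server->request_close_connection();      // cleanup() per job, then acknowledge_close_connection()
+//         factory.close();
+//     }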
+
+} // namespace rml
+} // namespace r1
+} // namespace detail
+} // namespace tbb
+
+#endif /*__TBB_rml_tbb_H */
diff --git a/contrib/libs/tbb/src/tbb/rml_thread_monitor.h b/contrib/libs/tbb/src/tbb/rml_thread_monitor.h
new file mode 100644
index 0000000000..613ec72e98
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/rml_thread_monitor.h
@@ -0,0 +1,258 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+// All platform-specific threading support is encapsulated here.
+
+#ifndef __RML_thread_monitor_H
+#define __RML_thread_monitor_H
+
+#if __TBB_USE_WINAPI
+#include <windows.h>
+#include <process.h>
+#include <malloc.h> //_alloca
+#include "misc.h" // support for processor groups
+#if __TBB_WIN8UI_SUPPORT && (_WIN32_WINNT < 0x0A00)
+#include <thread>
+#endif
+#elif __TBB_USE_POSIX
+#include <pthread.h>
+#include <cstring>
+#include <cstdlib>
+#else
+#error Unsupported platform
+#endif
+#include <cstdio>
+
+#include "oneapi/tbb/detail/_template_helpers.h"
+
+#include "itt_notify.h"
+#include "semaphore.h"
+
+// All platform-specific threading support is in this header.
+
+#if (_WIN32||_WIN64)&&!__TBB_ipf
+// Deal with 64K aliasing. The formula for "offset" is a Fibonacci hash function,
+// which has the desirable feature of spreading out the offsets fairly evenly
+// without knowing the total number of offsets, and furthermore unlikely to
+// accidentally cancel out other 64K aliasing schemes that Microsoft might implement later.
+// See Knuth Vol 3. "Theorem S" for details on Fibonacci hashing.
+// The second statement really does need "volatile"; otherwise the compiler might remove the _alloca.
+#define AVOID_64K_ALIASING(idx) \
+ std::size_t offset = (idx+1) * 40503U % (1U<<16); \
+ void* volatile sink_for_alloca = _alloca(offset); \
+ __TBB_ASSERT_EX(sink_for_alloca, "_alloca failed");
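+
+// For the first few worker indices the formula spreads the offsets well apart
+// within the 64K window (illustrative arithmetic):
+//     idx = 0:  1*40503 % 65536 = 40503
+//     idx = 1:  2*40503 % 65536 = 15470
+//     idx = 2:  3*40503 % 65536 = 55973
+//     idx = 3:  4*40503 % 65536 = 30940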
+#else
+// Linux thread allocators avoid 64K aliasing.
+#define AVOID_64K_ALIASING(idx) tbb::detail::suppress_unused_warning(idx)
+#endif /* _WIN32||_WIN64 */
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+// Forward declaration: throws std::runtime_error with what() returning error_code description prefixed with aux_info
+void handle_perror(int error_code, const char* aux_info);
+
+namespace rml {
+namespace internal {
+
+#if __TBB_USE_ITT_NOTIFY
+static const ::tbb::detail::r1::tchar *SyncType_RML = _T("%Constant");
+static const ::tbb::detail::r1::tchar *SyncObj_ThreadMonitor = _T("RML Thr Monitor");
+#endif /* __TBB_USE_ITT_NOTIFY */
+
+//! Monitor with limited two-phase commit form of wait.
+/** At most one thread should wait on an instance at a time. */
+class thread_monitor {
+public:
+ class cookie {
+ friend class thread_monitor;
+ std::atomic<std::size_t> my_epoch{0};
+ };
+ thread_monitor() : skipped_wakeup(false), my_sema() {
+ ITT_SYNC_CREATE(&my_sema, SyncType_RML, SyncObj_ThreadMonitor);
+ }
+ ~thread_monitor() {}
+
+ //! If a thread is waiting or started a two-phase wait, notify it.
+ /** Can be called by any thread. */
+ void notify();
+
+ //! Begin two-phase wait.
+ /** Should only be called by thread that owns the monitor.
+ The caller must either complete the wait or cancel it. */
+ void prepare_wait( cookie& c );
+
+ //! Complete a two-phase wait and wait until notification occurs after the earlier prepare_wait.
+ void commit_wait( cookie& c );
+
+ //! Cancel a two-phase wait.
+ void cancel_wait();
+
+#if __TBB_USE_WINAPI
+ typedef HANDLE handle_type;
+
+ #define __RML_DECL_THREAD_ROUTINE unsigned WINAPI
+ typedef unsigned (WINAPI *thread_routine_type)(void*);
+
+ //! Launch a thread
+ static handle_type launch( thread_routine_type thread_routine, void* arg, std::size_t stack_size, const size_t* worker_index = NULL );
+
+#elif __TBB_USE_POSIX
+ typedef pthread_t handle_type;
+
+ #define __RML_DECL_THREAD_ROUTINE void*
+ typedef void*(*thread_routine_type)(void*);
+
+ //! Launch a thread
+ static handle_type launch( thread_routine_type thread_routine, void* arg, std::size_t stack_size );
+#endif /* __TBB_USE_POSIX */
+
+ //! Join thread
+ static void join(handle_type handle);
+
+ //! Detach thread
+ static void detach_thread(handle_type handle);
+private:
+ cookie my_cookie; // epoch counter
+ std::atomic<bool> in_wait{false};
+ bool skipped_wakeup;
+ binary_semaphore my_sema;
+#if __TBB_USE_POSIX
+ static void check( int error_code, const char* routine );
+#endif
+};
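+
+// Typical use by the owning thread (illustrative sketch; work_available() is a
+// placeholder predicate, not part of this header):
+//
+//     thread_monitor::cookie c;
+//     monitor.prepare_wait(c);
+//     if( work_available() )
+//         monitor.cancel_wait();   // something arrived between the check and the wait
+//     else
+//         monitor.commit_wait(c);  // blocks until another thread calls monitor.notify()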
+
+#if __TBB_USE_WINAPI
+
+#ifndef STACK_SIZE_PARAM_IS_A_RESERVATION
+#define STACK_SIZE_PARAM_IS_A_RESERVATION 0x00010000
+#endif
+
+// _beginthreadex API is not available in Windows 8 Store* applications, so use std::thread instead
+#if __TBB_WIN8UI_SUPPORT && (_WIN32_WINNT < 0x0A00)
+inline thread_monitor::handle_type thread_monitor::launch( thread_routine_type thread_function, void* arg, std::size_t, const std::size_t*) {
+//TODO: check that exception thrown from std::thread is not swallowed silently
+ std::thread* thread_tmp=new std::thread(thread_function, arg);
+ return thread_tmp->native_handle();
+}
+#else
+inline thread_monitor::handle_type thread_monitor::launch( thread_routine_type thread_routine, void* arg, std::size_t stack_size, const std::size_t* worker_index ) {
+ unsigned thread_id;
+ int number_of_processor_groups = ( worker_index ) ? NumberOfProcessorGroups() : 0;
+ unsigned create_flags = ( number_of_processor_groups > 1 ) ? CREATE_SUSPENDED : 0;
+ HANDLE h = (HANDLE)_beginthreadex( NULL, unsigned(stack_size), thread_routine, arg, STACK_SIZE_PARAM_IS_A_RESERVATION | create_flags, &thread_id );
+ if( !h ) {
+ handle_perror(0, "thread_monitor::launch: _beginthreadex failed\n");
+ }
+ if ( number_of_processor_groups > 1 ) {
+ MoveThreadIntoProcessorGroup( h, FindProcessorGroupIndex( static_cast<int>(*worker_index) ) );
+ ResumeThread( h );
+ }
+ return h;
+}
+#endif //__TBB_WIN8UI_SUPPORT && (_WIN32_WINNT < 0x0A00)
+
+void thread_monitor::join(handle_type handle) {
+#if TBB_USE_ASSERT
+ DWORD res =
+#endif
+ WaitForSingleObjectEx(handle, INFINITE, FALSE);
+ __TBB_ASSERT( res==WAIT_OBJECT_0, NULL );
+#if TBB_USE_ASSERT
+ BOOL val =
+#endif
+ CloseHandle(handle);
+ __TBB_ASSERT( val, NULL );
+}
+
+void thread_monitor::detach_thread(handle_type handle) {
+#if TBB_USE_ASSERT
+ BOOL val =
+#endif
+ CloseHandle(handle);
+ __TBB_ASSERT( val, NULL );
+}
+
+#endif /* __TBB_USE_WINAPI */
+
+#if __TBB_USE_POSIX
+inline void thread_monitor::check( int error_code, const char* routine ) {
+ if( error_code ) {
+ handle_perror(error_code, routine);
+ }
+}
+
+inline thread_monitor::handle_type thread_monitor::launch( void* (*thread_routine)(void*), void* arg, std::size_t stack_size ) {
+ // FIXME - consider more graceful recovery than just exiting if a thread cannot be launched.
+    // Note that there are some tricky situations to deal with, such as when the thread has already
+    // been grabbed as part of an OpenMP team.
+ pthread_attr_t s;
+ check(pthread_attr_init( &s ), "pthread_attr_init has failed");
+ if( stack_size>0 )
+        check(pthread_attr_setstacksize( &s, stack_size ), "pthread_attr_setstacksize has failed" );
+ pthread_t handle;
+ check( pthread_create( &handle, &s, thread_routine, arg ), "pthread_create has failed" );
+ check( pthread_attr_destroy( &s ), "pthread_attr_destroy has failed" );
+ return handle;
+}
+
+void thread_monitor::join(handle_type handle) {
+ check(pthread_join(handle, NULL), "pthread_join has failed");
+}
+
+void thread_monitor::detach_thread(handle_type handle) {
+ check(pthread_detach(handle), "pthread_detach has failed");
+}
+#endif /* __TBB_USE_POSIX */
+
+inline void thread_monitor::notify() {
+ my_cookie.my_epoch.store(my_cookie.my_epoch.load(std::memory_order_acquire) + 1, std::memory_order_release);
+ bool do_signal = in_wait.exchange( false );
+ if( do_signal )
+ my_sema.V();
+}
+
+inline void thread_monitor::prepare_wait( cookie& c ) {
+ if( skipped_wakeup ) {
+ // Lazily consume a signal that was skipped due to cancel_wait
+ skipped_wakeup = false;
+ my_sema.P(); // does not really wait on the semaphore
+ }
+ // Former c = my_cookie
+ c.my_epoch.store(my_cookie.my_epoch.load(std::memory_order_acquire), std::memory_order_release);
+ in_wait.store( true, std::memory_order_seq_cst );
+}
+
+inline void thread_monitor::commit_wait( cookie& c ) {
+ bool do_it = ( c.my_epoch.load(std::memory_order_relaxed) == my_cookie.my_epoch.load(std::memory_order_relaxed) );
+ if( do_it ) my_sema.P();
+ else cancel_wait();
+}
+
+inline void thread_monitor::cancel_wait() {
+ // if not in_wait, then some thread has sent us a signal;
+ // it will be consumed by the next prepare_wait call
+ skipped_wakeup = ! in_wait.exchange( false );
+}
+
+} // namespace internal
+} // namespace rml
+} // namespace r1
+} // namespace detail
+} // namespace tbb
+
+#endif /* __RML_thread_monitor_H */
diff --git a/contrib/libs/tbb/src/tbb/rtm_mutex.cpp b/contrib/libs/tbb/src/tbb/rtm_mutex.cpp
new file mode 100644
index 0000000000..fe7fb66dc8
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/rtm_mutex.cpp
@@ -0,0 +1,120 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "oneapi/tbb/detail/_assert.h"
+#include "oneapi/tbb/detail/_rtm_mutex.h"
+#include "itt_notify.h"
+#include "governor.h"
+#include "misc.h"
+
+#include <atomic>
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+// maximum number of times to retry
+// TODO: experiment on retry values.
+static constexpr int retry_threshold = 10;
+
+struct rtm_mutex_impl {
+ //! Release speculative mutex
+ static void release(d1::rtm_mutex::scoped_lock& s) {
+ switch(s.m_transaction_state) {
+ case d1::rtm_mutex::rtm_state::rtm_transacting:
+ __TBB_ASSERT(is_in_transaction(), "m_transaction_state && not speculating");
+ end_transaction();
+ s.m_mutex = nullptr;
+ break;
+ case d1::rtm_mutex::rtm_state::rtm_real:
+ s.m_mutex->unlock();
+ s.m_mutex = nullptr;
+ break;
+ case d1::rtm_mutex::rtm_state::rtm_none:
+ __TBB_ASSERT(false, "mutex is not locked, but in release");
+ break;
+ default:
+ __TBB_ASSERT(false, "invalid m_transaction_state");
+ }
+ s.m_transaction_state = d1::rtm_mutex::rtm_state::rtm_none;
+ }
+
+ //! Acquire lock on the given mutex.
+ static void acquire(d1::rtm_mutex& m, d1::rtm_mutex::scoped_lock& s, bool only_speculate) {
+ __TBB_ASSERT(s.m_transaction_state == d1::rtm_mutex::rtm_state::rtm_none, "scoped_lock already in transaction");
+ if(governor::speculation_enabled()) {
+ int num_retries = 0;
+ unsigned int abort_code = 0;
+ do {
+ if(m.m_flag.load(std::memory_order_acquire)) {
+ if(only_speculate) return;
+ spin_wait_while_eq(m.m_flag, true);
+ }
+ // _xbegin returns -1 on success or the abort code, so capture it
+ if((abort_code = begin_transaction()) == speculation_successful_begin)
+ {
+ // started speculation
+ if(m.m_flag.load(std::memory_order_relaxed)) {
+ abort_transaction();
+ }
+ s.m_transaction_state = d1::rtm_mutex::rtm_state::rtm_transacting;
+ // Do not wrap the following assignment in a function,
+ // because it can abort the transaction in debug builds. The mutex pointer is needed for release().
+ s.m_mutex = &m;
+ return; // successfully started speculation
+ }
+ ++num_retries;
+ } while((abort_code & speculation_retry) != 0 && (num_retries < retry_threshold));
+ }
+
+ if(only_speculate) return;
+ s.m_mutex = &m;
+ s.m_mutex->lock();
+ s.m_transaction_state = d1::rtm_mutex::rtm_state::rtm_real;
+ return;
+ }
+
+ //! Try to acquire lock on the given mutex.
+ static bool try_acquire(d1::rtm_mutex& m, d1::rtm_mutex::scoped_lock& s) {
+ acquire(m, s, /*only_speculate=*/true);
+ if (s.m_transaction_state == d1::rtm_mutex::rtm_state::rtm_transacting) {
+ return true;
+ }
+ __TBB_ASSERT(s.m_transaction_state == d1::rtm_mutex::rtm_state::rtm_none, NULL);
+ // transacting acquire failed. try_lock the real mutex
+ if (m.try_lock()) {
+ s.m_mutex = &m;
+ s.m_transaction_state = d1::rtm_mutex::rtm_state::rtm_real;
+ return true;
+ }
+ return false;
+ }
+};
+
+void __TBB_EXPORTED_FUNC acquire(d1::rtm_mutex& m, d1::rtm_mutex::scoped_lock& s, bool only_speculate) {
+ rtm_mutex_impl::acquire(m, s, only_speculate);
+}
+bool __TBB_EXPORTED_FUNC try_acquire(d1::rtm_mutex& m, d1::rtm_mutex::scoped_lock& s) {
+ return rtm_mutex_impl::try_acquire(m, s);
+}
+void __TBB_EXPORTED_FUNC release(d1::rtm_mutex::scoped_lock& s) {
+ rtm_mutex_impl::release(s);
+}
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
+
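rtm_mutex_impl::acquire() above follows the standard RTM lock-elision recipe: spin until the lock looks free, start a transaction, read the lock word so that a real acquisition by another thread aborts the transaction, and fall back to the real lock after a bounded number of retries. The sketch below shows that recipe over a plain atomic spinlock. It is not the TBB implementation (elided_spinlock and elided_guard are invented names), it uses only the documented <immintrin.h> RTM intrinsics, it needs an RTM-capable CPU plus -mrtm under GCC/Clang, and it assumes RTM support has already been verified, as governor::speculation_enabled() does above.

    // Lock elision sketch: the critical section between acquire() and release()
    // runs either inside a hardware transaction or under the real spinlock.
    #include <atomic>
    #include <immintrin.h>

    struct elided_spinlock {
        std::atomic<bool> locked{false};
    };

    class elided_guard {                       // plays the role of rtm_mutex::scoped_lock
        elided_spinlock* m = nullptr;
        bool speculating = false;
    public:
        explicit elided_guard(elided_spinlock& lk) { acquire(lk); }
        ~elided_guard() { release(); }

        void acquire(elided_spinlock& lk) {
            for (int retries = 0; retries < 10; ++retries) {
                while (lk.locked.load(std::memory_order_acquire))   // do not speculate while held
                    _mm_pause();
                unsigned status = _xbegin();
                if (status == _XBEGIN_STARTED) {
                    if (lk.locked.load(std::memory_order_relaxed))  // puts the flag into the read set
                        _xabort(0xff);                              // a real owner appeared: abort
                    m = &lk;
                    speculating = true;
                    return;                                         // critical section runs transactionally
                }
                if (!(status & (_XABORT_EXPLICIT | _XABORT_RETRY | _XABORT_CONFLICT)))
                    break;                                          // abort reason not worth retrying
            }
            // Fallback: take the real lock, which also aborts concurrent speculative owners.
            bool expected = false;
            while (!lk.locked.compare_exchange_weak(expected, true, std::memory_order_acquire)) {
                expected = false;
                _mm_pause();
            }
            m = &lk;
            speculating = false;
        }

        void release() {
            if (!m) return;
            if (speculating) _xend();                               // commit: the flag was never written
            else m->locked.store(false, std::memory_order_release);
            m = nullptr;
        }
    };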
diff --git a/contrib/libs/tbb/src/tbb/rtm_rw_mutex.cpp b/contrib/libs/tbb/src/tbb/rtm_rw_mutex.cpp
new file mode 100644
index 0000000000..5e50de4c39
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/rtm_rw_mutex.cpp
@@ -0,0 +1,271 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "oneapi/tbb/detail/_assert.h"
+#include "oneapi/tbb/detail/_rtm_rw_mutex.h"
+#include "itt_notify.h"
+#include "governor.h"
+#include "misc.h"
+
+#include <atomic>
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+struct rtm_rw_mutex_impl {
+ // maximum number of times to retry
+ // TODO: experiment on retry values.
+ static constexpr int retry_threshold_read = 10;
+ static constexpr int retry_threshold_write = 10;
+
+ //! Release speculative mutex
+ static void release(d1::rtm_rw_mutex::scoped_lock& s) {
+ switch(s.m_transaction_state) {
+ case d1::rtm_rw_mutex::rtm_type::rtm_transacting_writer:
+ case d1::rtm_rw_mutex::rtm_type::rtm_transacting_reader:
+ __TBB_ASSERT(is_in_transaction(), "m_transaction_state && not speculating");
+ end_transaction();
+ s.m_mutex = nullptr;
+ break;
+ case d1::rtm_rw_mutex::rtm_type::rtm_real_reader:
+ __TBB_ASSERT(!s.m_mutex->write_flag.load(std::memory_order_relaxed), "write_flag set but read lock acquired");
+ s.m_mutex->unlock_shared();
+ s.m_mutex = nullptr;
+ break;
+ case d1::rtm_rw_mutex::rtm_type::rtm_real_writer:
+ __TBB_ASSERT(s.m_mutex->write_flag.load(std::memory_order_relaxed), "write_flag unset but write lock acquired");
+ s.m_mutex->write_flag.store(false, std::memory_order_relaxed);
+ s.m_mutex->unlock();
+ s.m_mutex = nullptr;
+ break;
+ case d1::rtm_rw_mutex::rtm_type::rtm_not_in_mutex:
+ __TBB_ASSERT(false, "rtm_not_in_mutex, but in release");
+ break;
+ default:
+ __TBB_ASSERT(false, "invalid m_transaction_state");
+ }
+ s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_not_in_mutex;
+ }
+
+ //! Acquire write lock on the given mutex.
+ static void acquire_writer(d1::rtm_rw_mutex& m, d1::rtm_rw_mutex::scoped_lock& s, bool only_speculate) {
+ __TBB_ASSERT(s.m_transaction_state == d1::rtm_rw_mutex::rtm_type::rtm_not_in_mutex, "scoped_lock already in transaction");
+ if(governor::speculation_enabled()) {
+ int num_retries = 0;
+ unsigned int abort_code = 0;
+ do {
+ if(m.m_state.load(std::memory_order_acquire)) {
+ if(only_speculate) return;
+ spin_wait_until_eq(m.m_state, d1::rtm_rw_mutex::state_type(0));
+ }
+ // _xbegin returns -1 on success or the abort code, so capture it
+ if((abort_code = begin_transaction()) == speculation_successful_begin)
+ {
+ // started speculation
+ if(m.m_state.load(std::memory_order_relaxed)) { // add spin_rw_mutex to read-set.
+ // reader or writer grabbed the lock, so abort.
+ abort_transaction();
+ }
+ s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_transacting_writer;
+ // Do not wrap the following assignment in a function,
+ // because it can abort the transaction in debug builds. The mutex pointer is needed for release().
+ s.m_mutex = &m;
+ return; // successfully started speculation
+ }
+ ++num_retries;
+ } while((abort_code & speculation_retry) != 0 && (num_retries < retry_threshold_write));
+ }
+
+ if(only_speculate) return;
+ s.m_mutex = &m; // should apply a real try_lock...
+ s.m_mutex->lock(); // kill transactional writers
+ __TBB_ASSERT(!m.write_flag.load(std::memory_order_relaxed), "After acquire for write, write_flag already true");
+ m.write_flag.store(true, std::memory_order_relaxed); // kill transactional readers
+ s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_real_writer;
+ return;
+ }
+
+ //! Acquire read lock on given mutex.
+ // only_speculate : true if we are doing a try_acquire. If true and we fail to speculate, don't
+ // really acquire the lock, return and do a try_acquire on the contained spin_rw_mutex. If
+ // the lock is already held by a writer, just return.
+ static void acquire_reader(d1::rtm_rw_mutex& m, d1::rtm_rw_mutex::scoped_lock& s, bool only_speculate) {
+ __TBB_ASSERT(s.m_transaction_state == d1::rtm_rw_mutex::rtm_type::rtm_not_in_mutex, "scoped_lock already in transaction");
+ if(governor::speculation_enabled()) {
+ int num_retries = 0;
+ unsigned int abort_code = 0;
+ do {
+ // if in try_acquire, and lock is held as writer, don't attempt to speculate.
+ if(m.write_flag.load(std::memory_order_acquire)) {
+ if(only_speculate) return;
+ spin_wait_while_eq(m.write_flag, true);
+ }
+ // _xbegin returns -1 on success or the abort code, so capture it
+ if((abort_code = begin_transaction()) == speculation_successful_begin)
+ {
+ // started speculation
+ if(m.write_flag.load(std::memory_order_relaxed)) { // add write_flag to read-set.
+ abort_transaction(); // writer grabbed the lock, so abort.
+ }
+ s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_transacting_reader;
+ // Do not wrap the following assignment in a function,
+ // because it can abort the transaction in debug builds. The mutex pointer is needed for release().
+ s.m_mutex = &m;
+ return; // successfully started speculation
+ }
+ // fallback path
+ // retry only if there is any hope of getting into a transaction soon
+ // Retry in the following cases (from Section 8.3.5 of
+ // Intel(R) Architecture Instruction Set Extensions Programming Reference):
+ // 1. abort caused by XABORT instruction (bit 0 of EAX register is set)
+ // 2. the transaction may succeed on a retry (bit 1 of EAX register is set)
+ // 3. if another logical processor conflicted with a memory address
+ // that was part of the transaction that aborted (bit 2 of EAX register is set)
+ // That is, retry if (abort_code & 0x7) is non-zero
+ ++num_retries;
+ } while((abort_code & speculation_retry) != 0 && (num_retries < retry_threshold_read));
+ }
+
+ if(only_speculate) return;
+ s.m_mutex = &m;
+ s.m_mutex->lock_shared();
+ s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_real_reader;
+ }
+
+ //! Upgrade reader to become a writer.
+ /** Returns whether the upgrade happened without releasing and re-acquiring the lock */
+ static bool upgrade(d1::rtm_rw_mutex::scoped_lock& s) {
+ switch(s.m_transaction_state) {
+ case d1::rtm_rw_mutex::rtm_type::rtm_real_reader: {
+ s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_real_writer;
+ bool no_release = s.m_mutex->upgrade();
+ __TBB_ASSERT(!s.m_mutex->write_flag.load(std::memory_order_relaxed), "After upgrade, write_flag already true");
+ s.m_mutex->write_flag.store(true, std::memory_order_relaxed);
+ return no_release;
+ }
+ case d1::rtm_rw_mutex::rtm_type::rtm_transacting_reader: {
+ d1::rtm_rw_mutex& m = *s.m_mutex;
+ if(m.m_state.load(std::memory_order_acquire)) { // add spin_rw_mutex to read-set.
+ // Real reader or writer holds the lock; so commit the read and re-acquire for write.
+ release(s);
+ acquire_writer(m, s, false);
+ return false;
+ } else
+ {
+ s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_transacting_writer;
+ return true;
+ }
+ }
+ default:
+ __TBB_ASSERT(false, "Invalid state for upgrade");
+ return false;
+ }
+ }
+
+ //! Downgrade writer to a reader.
+ static bool downgrade(d1::rtm_rw_mutex::scoped_lock& s) {
+ switch (s.m_transaction_state) {
+ case d1::rtm_rw_mutex::rtm_type::rtm_real_writer:
+ s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_real_reader;
+ __TBB_ASSERT(s.m_mutex->write_flag.load(std::memory_order_relaxed), "Before downgrade write_flag not true");
+ s.m_mutex->write_flag.store(false, std::memory_order_relaxed);
+ s.m_mutex->downgrade();
+ return true;
+ case d1::rtm_rw_mutex::rtm_type::rtm_transacting_writer:
+ s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_transacting_reader;
+ return true;
+ default:
+ __TBB_ASSERT(false, "Invalid state for downgrade");
+ return false;
+ }
+ }
+
+ //! Try to acquire write lock on the given mutex.
+ // There may be reader(s) which acquired the spin_rw_mutex, as well as possibly
+ // transactional reader(s). If this is the case, the acquire will fail, and assigning
+ // write_flag will kill the transactors. So we only assign write_flag if we have successfully
+ // acquired the lock.
+ static bool try_acquire_writer(d1::rtm_rw_mutex& m, d1::rtm_rw_mutex::scoped_lock& s) {
+ acquire_writer(m, s, /*only_speculate=*/true);
+ if (s.m_transaction_state == d1::rtm_rw_mutex::rtm_type::rtm_transacting_writer) {
+ return true;
+ }
+ __TBB_ASSERT(s.m_transaction_state == d1::rtm_rw_mutex::rtm_type::rtm_not_in_mutex, NULL);
+ // transacting write acquire failed. try_lock the real mutex
+ if (m.try_lock()) {
+ s.m_mutex = &m;
+ // only shoot down readers if we're not transacting ourselves
+ __TBB_ASSERT(!m.write_flag.load(std::memory_order_relaxed), "After try_acquire_writer, write_flag already true");
+ m.write_flag.store(true, std::memory_order_relaxed);
+ s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_real_writer;
+ return true;
+ }
+ return false;
+ }
+
+ //! Try to acquire read lock on the given mutex.
+ static bool try_acquire_reader(d1::rtm_rw_mutex& m, d1::rtm_rw_mutex::scoped_lock& s) {
+ // speculatively acquire the lock. If this fails, do try_lock_shared on the spin_rw_mutex.
+ acquire_reader(m, s, /*only_speculate=*/true);
+ if (s.m_transaction_state == d1::rtm_rw_mutex::rtm_type::rtm_transacting_reader) {
+ return true;
+ }
+ __TBB_ASSERT(s.m_transaction_state == d1::rtm_rw_mutex::rtm_type::rtm_not_in_mutex, NULL);
+ // transacting read acquire failed. try_lock_shared the real mutex
+ if (m.try_lock_shared()) {
+ s.m_mutex = &m;
+ s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_real_reader;
+ return true;
+ }
+ return false;
+ }
+};
+
+void __TBB_EXPORTED_FUNC acquire_writer(d1::rtm_rw_mutex& m, d1::rtm_rw_mutex::scoped_lock& s, bool only_speculate) {
+ rtm_rw_mutex_impl::acquire_writer(m, s, only_speculate);
+}
+//! Internal acquire read lock.
+// only_speculate == true if we're doing a try_lock, else false.
+void __TBB_EXPORTED_FUNC acquire_reader(d1::rtm_rw_mutex& m, d1::rtm_rw_mutex::scoped_lock& s, bool only_speculate) {
+ rtm_rw_mutex_impl::acquire_reader(m, s, only_speculate);
+}
+//! Internal upgrade reader to become a writer.
+bool __TBB_EXPORTED_FUNC upgrade(d1::rtm_rw_mutex::scoped_lock& s) {
+ return rtm_rw_mutex_impl::upgrade(s);
+}
+//! Internal downgrade writer to become a reader.
+bool __TBB_EXPORTED_FUNC downgrade(d1::rtm_rw_mutex::scoped_lock& s) {
+ return rtm_rw_mutex_impl::downgrade(s);
+}
+//! Internal try_acquire write lock.
+bool __TBB_EXPORTED_FUNC try_acquire_writer(d1::rtm_rw_mutex& m, d1::rtm_rw_mutex::scoped_lock& s) {
+ return rtm_rw_mutex_impl::try_acquire_writer(m, s);
+}
+//! Internal try_acquire read lock.
+bool __TBB_EXPORTED_FUNC try_acquire_reader(d1::rtm_rw_mutex& m, d1::rtm_rw_mutex::scoped_lock& s) {
+ return rtm_rw_mutex_impl::try_acquire_reader(m, s);
+}
+//! Internal release lock.
+void __TBB_EXPORTED_FUNC release(d1::rtm_rw_mutex::scoped_lock& s) {
+ rtm_rw_mutex_impl::release(s);
+}
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
+
+
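The comment inside acquire_reader() above lists the abort reasons worth retrying, i.e. bits 0-2 of the status word returned by _xbegin(). TBB's speculation_retry mask is defined elsewhere in the library; expressed with the documented <immintrin.h> abort-status macros, the test described by that comment looks roughly like this:

    #include <immintrin.h>

    // Retry only if there is some hope that a new transaction can succeed.
    inline bool worth_retrying(unsigned abort_code) {
        // _XABORT_EXPLICIT: aborted by an _xabort() instruction         (bit 0)
        // _XABORT_RETRY   : the hardware hints that a retry may succeed (bit 1)
        // _XABORT_CONFLICT: another logical processor touched our data  (bit 2)
        return (abort_code & (_XABORT_EXPLICIT | _XABORT_RETRY | _XABORT_CONFLICT)) != 0;
    }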
diff --git a/contrib/libs/tbb/src/tbb/scheduler_common.h b/contrib/libs/tbb/src/tbb/scheduler_common.h
new file mode 100644
index 0000000000..ee13dbf981
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/scheduler_common.h
@@ -0,0 +1,505 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef _TBB_scheduler_common_H
+#define _TBB_scheduler_common_H
+
+#include "oneapi/tbb/detail/_utils.h"
+#include "oneapi/tbb/detail/_template_helpers.h"
+#include "oneapi/tbb/detail/_task.h"
+#include "oneapi/tbb/detail/_machine.h"
+#include "oneapi/tbb/task_group.h"
+#include "oneapi/tbb/cache_aligned_allocator.h"
+#include "itt_notify.h"
+#include "co_context.h"
+#include "misc.h"
+#include "governor.h"
+
+#ifndef __TBB_SCHEDULER_MUTEX_TYPE
+#define __TBB_SCHEDULER_MUTEX_TYPE tbb::spin_mutex
+#endif
+// TODO: add conditional inclusion based on specified type
+#include "oneapi/tbb/spin_mutex.h"
+
+#if TBB_USE_ASSERT
+#include <atomic>
+#endif
+
+#include <cstdint>
+#include <exception>
+
+//! Mutex type for global locks in the scheduler
+using scheduler_mutex_type = __TBB_SCHEDULER_MUTEX_TYPE;
+
+#if _MSC_VER && !defined(__INTEL_COMPILER)
+ // Workaround for overzealous compiler warnings
+ // These particular warnings are so ubiquitous that no attempt is made to narrow
+ // the scope of the warnings.
+ #pragma warning (disable: 4100 4127 4312 4244 4267 4706)
+#endif
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+class arena;
+class mail_inbox;
+class mail_outbox;
+class market;
+class observer_proxy;
+
+enum task_stream_accessor_type { front_accessor = 0, back_nonnull_accessor };
+template<task_stream_accessor_type> class task_stream;
+
+using isolation_type = std::intptr_t;
+constexpr isolation_type no_isolation = 0;
+
+//------------------------------------------------------------------------
+// Extended execute data
+//------------------------------------------------------------------------
+
+//! Execute data used on a task dispatcher side, reflects a current execution state
+struct execution_data_ext : d1::execution_data {
+ task_dispatcher* task_disp{};
+ isolation_type isolation{};
+ d1::wait_context* wait_ctx{};
+};
+
+//------------------------------------------------------------------------
+// Task accessor
+//------------------------------------------------------------------------
+
+//! Interpretation of reserved task fields inside a task dispatcher
+struct task_accessor {
+ static constexpr std::uint64_t proxy_task_trait = 1;
+ static constexpr std::uint64_t resume_task_trait = 2;
+ static d1::task_group_context*& context(d1::task& t) {
+ task_group_context** tgc = reinterpret_cast<task_group_context**>(&t.m_reserved[0]);
+ return *tgc;
+ }
+ static isolation_type& isolation(d1::task& t) {
+ isolation_type* tag = reinterpret_cast<isolation_type*>(&t.m_reserved[2]);
+ return *tag;
+ }
+ static void set_proxy_trait(d1::task& t) {
+ // TODO: refactor proxy tasks not to work on uninitialized memory.
+ //__TBB_ASSERT((t.m_version_and_traits & proxy_task_trait) == 0, nullptr);
+ t.m_version_and_traits |= proxy_task_trait;
+ }
+ static bool is_proxy_task(d1::task& t) {
+ return (t.m_version_and_traits & proxy_task_trait) != 0;
+ }
+ static void set_resume_trait(d1::task& t) {
+ __TBB_ASSERT((t.m_version_and_traits & resume_task_trait) == 0, nullptr);
+ t.m_version_and_traits |= resume_task_trait;
+ }
+ static bool is_resume_task(d1::task& t) {
+ return (t.m_version_and_traits & resume_task_trait) != 0;
+ }
+};
+
+//------------------------------------------------------------------------
+//! Extended variant of the standard offsetof macro
+/** The standard offsetof macro is not sufficient for TBB as it can be used for
+ POD-types only. The constant 0x1000 (not NULL) is necessary to appease GCC. **/
+#define __TBB_offsetof(class_name, member_name) \
+ ((ptrdiff_t)&(reinterpret_cast<class_name*>(0x1000)->member_name) - 0x1000)
+
+//! Returns address of the object containing a member with the given name and address
+#define __TBB_get_object_ref(class_name, member_name, member_addr) \
+ (*reinterpret_cast<class_name*>((char*)member_addr - __TBB_offsetof(class_name, member_name)))
+
+//! Helper class for tracking floating point context and task group context switches
+/** Assuming presence of an itt collector, in addition to keeping track of floating
+ point context, this class emits itt events to indicate begin and end of task group
+ context execution **/
+template <bool report_tasks>
+class context_guard_helper {
+ const d1::task_group_context* curr_ctx;
+ d1::cpu_ctl_env guard_cpu_ctl_env;
+ d1::cpu_ctl_env curr_cpu_ctl_env;
+public:
+ context_guard_helper() : curr_ctx(NULL) {
+ guard_cpu_ctl_env.get_env();
+ curr_cpu_ctl_env = guard_cpu_ctl_env;
+ }
+ ~context_guard_helper() {
+ if (curr_cpu_ctl_env != guard_cpu_ctl_env)
+ guard_cpu_ctl_env.set_env();
+ if (report_tasks && curr_ctx)
+ ITT_TASK_END;
+ }
+ // This function is called from the bypass dispatch loop on the hot path.
+ // Consider performance implications when refactoring.
+ void set_ctx(const d1::task_group_context* ctx) {
+ if (!ctx)
+ return;
+ const d1::cpu_ctl_env* ctl = reinterpret_cast<const d1::cpu_ctl_env*>(&ctx->my_cpu_ctl_env);
+ // Compare the FPU settings directly because the context can be reused between parallel algorithms.
+ if (*ctl != curr_cpu_ctl_env) {
+ curr_cpu_ctl_env = *ctl;
+ curr_cpu_ctl_env.set_env();
+ }
+ if (report_tasks && ctx != curr_ctx) {
+ // if task group context was active, report end of current execution frame.
+ if (curr_ctx)
+ ITT_TASK_END;
+ // reporting begin of new task group context execution frame.
+ // using address of task group context object to group tasks (parent).
+ // id of task execution frame is NULL and reserved for future use.
+ ITT_TASK_BEGIN(ctx, ctx->my_name, NULL);
+ curr_ctx = ctx;
+ }
+ }
+#if _WIN64
+ void restore_default() {
+ if (curr_cpu_ctl_env != guard_cpu_ctl_env) {
+ guard_cpu_ctl_env.set_env();
+ curr_cpu_ctl_env = guard_cpu_ctl_env;
+ }
+ }
+#endif // _WIN64
+};
+
+#if (_WIN32 || _WIN64 || __linux__) && (__TBB_x86_32 || __TBB_x86_64)
+#if _MSC_VER
+#pragma intrinsic(__rdtsc)
+#endif
+inline std::uint64_t machine_time_stamp() {
+#if __INTEL_COMPILER
+ return _rdtsc();
+#elif _MSC_VER
+ return __rdtsc();
+#else
+ std::uint32_t hi, lo;
+ __asm__ __volatile__("rdtsc" : "=d"(hi), "=a"(lo));
+ return (std::uint64_t(hi) << 32) | lo;
+#endif
+}
+
+inline void prolonged_pause_impl() {
+ // Assumption based on practice: 1000-2000 ticks seems to be a suitable invariant for the
+ // majority of platforms. Currently, skip platforms that define __TBB_STEALING_PAUSE
+ // because these platforms require very careful tuning.
+ std::uint64_t prev = machine_time_stamp();
+ const std::uint64_t finish = prev + 1000;
+ atomic_backoff backoff;
+ do {
+ backoff.bounded_pause();
+ std::uint64_t curr = machine_time_stamp();
+ if (curr <= prev)
+ // Possibly, the current logical thread was moved to another hardware thread, or the counter overflowed.
+ break;
+ prev = curr;
+ } while (prev < finish);
+}
+#else
+inline void prolonged_pause_impl() {
+#ifdef __TBB_ipf
+ static const long PauseTime = 1500;
+#else
+ static const long PauseTime = 80;
+#endif
+ // TODO IDEA: Update PauseTime adaptively?
+ machine_pause(PauseTime);
+}
+#endif
+
+inline void prolonged_pause() {
+#if __TBB_WAITPKG_INTRINSICS_PRESENT && (_WIN32 || _WIN64 || __linux__) && (__TBB_x86_32 || __TBB_x86_64)
+ if (governor::wait_package_enabled()) {
+ std::uint64_t time_stamp = machine_time_stamp();
+ // _tpause function directs the processor to enter an implementation-dependent optimized state
+ // until the Time Stamp Counter reaches or exceeds the value specified in second parameter.
+ // Constant "700" is ticks to wait for.
+ // First parameter 0 selects between a lower power (cleared) or faster wakeup (set) optimized state.
+ _tpause(0, time_stamp + 700);
+ }
+ else
+#endif
+ prolonged_pause_impl();
+}
+
+class stealing_loop_backoff {
+ const int my_pause_threshold;
+ const int my_yield_threshold;
+ int my_pause_count;
+ int my_yield_count;
+public:
+ // my_yield_threshold = 100 is an experimental value. Ideally, once we start calling __TBB_Yield(),
+ // the time spent spinning before calling is_out_of_work() should be approximately
+ // the time it takes for a thread to be woken up. Doing so would guarantee that we do
+ // no worse than 2x the optimal spin time. Or perhaps a time-slice quantum is the right amount.
+ stealing_loop_backoff(int num_workers)
+ : my_pause_threshold{ 2 * (num_workers + 1) }
+#if __APPLE__
+ // threshold value tuned separately for macOS due to high cost of sched_yield there
+ , my_yield_threshold{10}
+#else
+ , my_yield_threshold{100}
+#endif
+ , my_pause_count{}
+ , my_yield_count{}
+ {}
+ bool pause() {
+ prolonged_pause();
+ if (my_pause_count++ >= my_pause_threshold) {
+ my_pause_count = my_pause_threshold;
+ d0::yield();
+ if (my_yield_count++ >= my_yield_threshold) {
+ my_yield_count = my_yield_threshold;
+ return true;
+ }
+ }
+ return false;
+ }
+ void reset_wait() {
+ my_pause_count = my_yield_count = 0;
+ }
+};
+
+//------------------------------------------------------------------------
+// Exception support
+//------------------------------------------------------------------------
+//! Task group state change propagation global epoch
+/** Together with generic_scheduler::my_context_state_propagation_epoch, forms a
+ cross-thread signaling mechanism that makes it possible to avoid locking on the hot path
+ of the normal execution flow.
+
+ When a descendant task group context is registered or unregistered, the global
+ and local epochs are compared. If they differ, a state change is being propagated,
+ and thus registration/deregistration routines take the slower branch that may block
+ (at most one thread of the pool can be blocked at any moment). Otherwise the
+ control path is lock-free and fast. **/
+extern std::atomic<std::uintptr_t> the_context_state_propagation_epoch;
+
+//! Mutex guarding state change propagation across task groups forest.
+/** Also protects modification of related data structures. **/
+typedef scheduler_mutex_type context_state_propagation_mutex_type;
+extern context_state_propagation_mutex_type the_context_state_propagation_mutex;
+
+class tbb_exception_ptr {
+ std::exception_ptr my_ptr;
+public:
+ static tbb_exception_ptr* allocate() noexcept;
+
+ //! Destroys this object
+ /** Note that objects of this type can be created only by the allocate() method. **/
+ void destroy() noexcept;
+
+ //! Throws the contained exception.
+ void throw_self();
+
+private:
+ tbb_exception_ptr(const std::exception_ptr& src) : my_ptr(src) {}
+}; // class tbb_exception_ptr
+
+//------------------------------------------------------------------------
+// Debugging support
+//------------------------------------------------------------------------
+
+#if TBB_USE_ASSERT
+static const std::uintptr_t venom = tbb::detail::select_size_t_constant<0xDEADBEEFU, 0xDDEEAADDDEADBEEFULL>::value;
+
+inline void poison_value(std::uintptr_t& val) { val = venom; }
+
+inline void poison_value(std::atomic<std::uintptr_t>& val) { val.store(venom, std::memory_order_relaxed); }
+
+/** Expected to be used in assertions only, thus no empty form is defined. **/
+inline bool is_alive(std::uintptr_t v) { return v != venom; }
+
+/** Logically, this method should be a member of class task.
+ But we do not want to publish it, so it is here instead. */
+inline void assert_task_valid(const d1::task* t) {
+ assert_pointer_valid(t);
+}
+#else /* !TBB_USE_ASSERT */
+
+/** In contrast to the debug version, poison_value() is a macro here because
+ the variable used as its argument may be undefined in release builds. **/
+#define poison_value(g) ((void)0)
+
+inline void assert_task_valid(const d1::task*) {}
+
+#endif /* !TBB_USE_ASSERT */
+
+struct suspend_point_type {
+#if __TBB_RESUMABLE_TASKS
+ //! The arena related to this task_dispatcher
+ arena* m_arena{ nullptr };
+ //! The random for the resume task
+ FastRandom m_random;
+ //! The flag is raised when the original owner should return to this task dispatcher.
+ std::atomic<bool> m_is_owner_recalled{ false };
+ //! Indicates whether the resume task should be placed into the critical task stream.
+ bool m_is_critical{ false };
+ //! Associated coroutine
+ co_context m_co_context;
+
+ struct resume_task final : public d1::task {
+ task_dispatcher& m_target;
+ explicit resume_task(task_dispatcher& target) : m_target(target) {
+ task_accessor::set_resume_trait(*this);
+ }
+ d1::task* execute(d1::execution_data& ed) override;
+ d1::task* cancel(d1::execution_data&) override {
+ __TBB_ASSERT(false, "The resume task cannot be canceled");
+ return nullptr;
+ }
+ } m_resume_task;
+
+ suspend_point_type(arena* a, std::size_t stack_size, task_dispatcher& target);
+#endif /*__TBB_RESUMABLE_TASKS */
+};
+
+class alignas (max_nfs_size) task_dispatcher {
+public:
+ // TODO: reconsider low level design to better organize dependencies and files.
+ friend class thread_data;
+ friend class arena_slot;
+ friend class nested_arena_context;
+ friend class delegated_task;
+ friend struct base_waiter;
+
+ //! The data of the current thread attached to this task_dispatcher
+ thread_data* m_thread_data{ nullptr };
+
+ //! The current execution data
+ execution_data_ext m_execute_data_ext;
+
+ //! Properties
+ struct properties {
+ bool outermost{ true };
+ bool fifo_tasks_allowed{ true };
+ bool critical_task_allowed{ true };
+ } m_properties;
+
+ //! Position in the call stack when stealing is still allowed.
+ std::uintptr_t m_stealing_threshold{};
+
+ //! Suspend point (null if this task dispatcher has never been suspended)
+ suspend_point_type* m_suspend_point{ nullptr };
+
+ //! Attempt to get a task from the mailbox.
+ /** Gets a task only if it has not been executed by its sender or a thief
+ that has stolen it from the sender's task pool. Otherwise returns NULL.
+ This method is intended to be used only by the thread extracting the proxy
+ from its mailbox. (In contrast to local task pool, mailbox can be read only
+ by its owner). **/
+ d1::task* get_mailbox_task(mail_inbox& my_inbox, execution_data_ext& ed, isolation_type isolation);
+
+ d1::task* get_critical_task(d1::task*, execution_data_ext&, isolation_type, bool);
+
+ template <bool ITTPossible, typename Waiter>
+ d1::task* receive_or_steal_task(thread_data& tls, execution_data_ext& ed, Waiter& waiter,
+ isolation_type isolation, bool outermost, bool criticality_absence);
+
+ template <bool ITTPossible, typename Waiter>
+ d1::task* local_wait_for_all(d1::task * t, Waiter& waiter);
+
+ task_dispatcher(const task_dispatcher&) = delete;
+
+ bool can_steal();
+public:
+ task_dispatcher(arena* a);
+
+ ~task_dispatcher() {
+ if (m_suspend_point) {
+ m_suspend_point->~suspend_point_type();
+ cache_aligned_deallocate(m_suspend_point);
+ }
+ poison_pointer(m_thread_data);
+ poison_pointer(m_suspend_point);
+ }
+
+ template <typename Waiter>
+ d1::task* local_wait_for_all(d1::task* t, Waiter& waiter);
+
+ bool allow_fifo_task(bool new_state) {
+ bool old_state = m_properties.fifo_tasks_allowed;
+ m_properties.fifo_tasks_allowed = new_state;
+ return old_state;
+ }
+
+ isolation_type set_isolation(isolation_type isolation) {
+ isolation_type prev = m_execute_data_ext.isolation;
+ m_execute_data_ext.isolation = isolation;
+ return prev;
+ }
+
+ thread_data& get_thread_data() {
+ __TBB_ASSERT(m_thread_data, nullptr);
+ return *m_thread_data;
+ }
+
+ static void execute_and_wait(d1::task* t, d1::wait_context& wait_ctx, d1::task_group_context& w_ctx);
+
+ void set_stealing_threshold(std::uintptr_t stealing_threshold) {
+ bool assert_condition = (stealing_threshold == 0 && m_stealing_threshold != 0) ||
+ (stealing_threshold != 0 && m_stealing_threshold == 0);
+ __TBB_ASSERT_EX( assert_condition, nullptr );
+ m_stealing_threshold = stealing_threshold;
+ }
+
+ d1::task* get_inbox_or_critical_task(execution_data_ext&, mail_inbox&, isolation_type, bool);
+ d1::task* get_stream_or_critical_task(execution_data_ext&, arena&, task_stream<front_accessor>&,
+ unsigned& /*hint_for_stream*/, isolation_type,
+ bool /*critical_allowed*/);
+ d1::task* steal_or_get_critical(execution_data_ext&, arena&, unsigned /*arena_index*/, FastRandom&,
+ isolation_type, bool /*critical_allowed*/);
+
+#if __TBB_RESUMABLE_TASKS
+ /* [[noreturn]] */ void co_local_wait_for_all() noexcept;
+ void suspend(suspend_callback_type suspend_callback, void* user_callback);
+ void resume(task_dispatcher& target);
+ suspend_point_type* get_suspend_point();
+ void init_suspend_point(arena* a, std::size_t stack_size);
+ friend void internal_resume(suspend_point_type*);
+ void recall_point();
+#endif /* __TBB_RESUMABLE_TASKS */
+};
+
+inline std::uintptr_t calculate_stealing_threshold(std::uintptr_t base, std::size_t stack_size) {
+ return base - stack_size / 2;
+}
+
+struct task_group_context_impl {
+ static void destroy(d1::task_group_context&);
+ static void initialize(d1::task_group_context&);
+ static void register_with(d1::task_group_context&, thread_data*);
+ static void bind_to_impl(d1::task_group_context&, thread_data*);
+ static void bind_to(d1::task_group_context&, thread_data*);
+ template <typename T>
+ static void propagate_task_group_state(d1::task_group_context&, std::atomic<T> d1::task_group_context::*, d1::task_group_context&, T);
+ static bool cancel_group_execution(d1::task_group_context&);
+ static bool is_group_execution_cancelled(const d1::task_group_context&);
+ static void reset(d1::task_group_context&);
+ static void capture_fp_settings(d1::task_group_context&);
+ static void copy_fp_settings(d1::task_group_context& ctx, const d1::task_group_context& src);
+};
+
+
+//! Forward declaration for scheduler entities
+bool gcc_rethrow_exception_broken();
+void fix_broken_rethrow();
+//! Forward declaration: throws std::runtime_error with what() returning error_code description prefixed with aux_info
+void handle_perror(int error_code, const char* aux_info);
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
+
+#endif /* _TBB_scheduler_common_H */
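__TBB_offsetof and __TBB_get_object_ref above implement the familiar container-of idiom: recover the enclosing object from the address of one of its embedded members, without the POD/standard-layout restriction of plain offsetof. A minimal standalone illustration follows; the EX_ macros mirror the ones above, and the two structs are invented for the example.

    #include <cstddef>

    struct queue_node { int payload; };
    struct task_proxy {
        int id;
        queue_node node;          // a task_proxy is later recovered from &node
    };

    #define EX_offsetof(class_name, member_name) \
        ((ptrdiff_t)&(reinterpret_cast<class_name*>(0x1000)->member_name) - 0x1000)

    #define EX_get_object_ref(class_name, member_name, member_addr) \
        (*reinterpret_cast<class_name*>((char*)member_addr - EX_offsetof(class_name, member_name)))

    int owner_id(queue_node* n) {
        // Walk back from the embedded member to the enclosing task_proxy.
        return EX_get_object_ref(task_proxy, node, n).id;
    }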
diff --git a/contrib/libs/tbb/src/tbb/semaphore.cpp b/contrib/libs/tbb/src/tbb/semaphore.cpp
new file mode 100644
index 0000000000..92c9e675ab
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/semaphore.cpp
@@ -0,0 +1,92 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "semaphore.h"
+#if __TBB_USE_SRWLOCK
+#include "dynamic_link.h" // Refers to src/tbb, not include/tbb
+#error #include "tbb_misc.h"
+#endif
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+// TODO: For new win UI port, we can use SRWLock API without dynamic_link etc.
+#if __TBB_USE_SRWLOCK
+
+static std::atomic<do_once_state> concmon_module_inited;
+
+void WINAPI init_binsem_using_event( SRWLOCK* h_ )
+{
+ srwl_or_handle* shptr = (srwl_or_handle*) h_;
+ shptr->h = CreateEventEx( NULL, NULL, 0, EVENT_ALL_ACCESS|SEMAPHORE_ALL_ACCESS );
+}
+
+void WINAPI acquire_binsem_using_event( SRWLOCK* h_ )
+{
+ srwl_or_handle* shptr = (srwl_or_handle*) h_;
+ WaitForSingleObjectEx( shptr->h, INFINITE, FALSE );
+}
+
+void WINAPI release_binsem_using_event( SRWLOCK* h_ )
+{
+ srwl_or_handle* shptr = (srwl_or_handle*) h_;
+ SetEvent( shptr->h );
+}
+
+static void (WINAPI *__TBB_init_binsem)( SRWLOCK* ) = (void (WINAPI *)(SRWLOCK*))&init_binsem_using_event;
+static void (WINAPI *__TBB_acquire_binsem)( SRWLOCK* ) = (void (WINAPI *)(SRWLOCK*))&acquire_binsem_using_event;
+static void (WINAPI *__TBB_release_binsem)( SRWLOCK* ) = (void (WINAPI *)(SRWLOCK*))&release_binsem_using_event;
+
+//! Table describing how to link the handlers.
+static const dynamic_link_descriptor SRWLLinkTable[] = {
+ DLD(InitializeSRWLock, __TBB_init_binsem),
+ DLD(AcquireSRWLockExclusive, __TBB_acquire_binsem),
+ DLD(ReleaseSRWLockExclusive, __TBB_release_binsem)
+};
+
+inline void init_concmon_module()
+{
+ __TBB_ASSERT( (uintptr_t)__TBB_init_binsem==(uintptr_t)&init_binsem_using_event, NULL );
+ if( dynamic_link( "Kernel32.dll", SRWLLinkTable, sizeof(SRWLLinkTable)/sizeof(dynamic_link_descriptor) ) ) {
+ __TBB_ASSERT( (uintptr_t)__TBB_init_binsem!=(uintptr_t)&init_binsem_using_event, NULL );
+ __TBB_ASSERT( (uintptr_t)__TBB_acquire_binsem!=(uintptr_t)&acquire_binsem_using_event, NULL );
+ __TBB_ASSERT( (uintptr_t)__TBB_release_binsem!=(uintptr_t)&release_binsem_using_event, NULL );
+ }
+}
+
+binary_semaphore::binary_semaphore() {
+ atomic_do_once( &init_concmon_module, concmon_module_inited );
+
+ __TBB_init_binsem( &my_sem.lock );
+ if( (uintptr_t)__TBB_init_binsem!=(uintptr_t)&init_binsem_using_event )
+ P();
+}
+
+binary_semaphore::~binary_semaphore() {
+ if( (uintptr_t)__TBB_init_binsem==(uintptr_t)&init_binsem_using_event )
+ CloseHandle( my_sem.h );
+}
+
+void binary_semaphore::P() { __TBB_acquire_binsem( &my_sem.lock ); }
+
+void binary_semaphore::V() { __TBB_release_binsem( &my_sem.lock ); }
+
+#endif /* __TBB_USE_SRWLOCK */
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
diff --git a/contrib/libs/tbb/src/tbb/semaphore.h b/contrib/libs/tbb/src/tbb/semaphore.h
new file mode 100644
index 0000000000..0a88536e36
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/semaphore.h
@@ -0,0 +1,335 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_semaphore_H
+#define __TBB_semaphore_H
+
+#include "oneapi/tbb/detail/_utils.h"
+
+#if _WIN32||_WIN64
+#include <windows.h>
+#elif __APPLE__
+#include <mach/semaphore.h>
+#include <mach/task.h>
+#include <mach/mach_init.h>
+#include <mach/error.h>
+#else
+#include <semaphore.h>
+#ifdef TBB_USE_DEBUG
+#include <cerrno>
+#endif
+#endif /*_WIN32||_WIN64*/
+
+#include <atomic>
+
+#if __linux__ || __FreeBSD__ || __NetBSD__ || __OpenBSD__
+
+/* Futex definitions */
+#include <unistd.h>
+#include <sys/syscall.h>
+
+#if defined(SYS_futex)
+
+/* This section is included for Linux and some other systems that may support futexes. */
+
+#define __TBB_USE_FUTEX 1
+
+#if defined(__has_include)
+#define __TBB_has_include __has_include
+#else
+#define __TBB_has_include(x) 0
+#endif
+
+/*
+If available, use typical headers where futex API is defined. While Linux and OpenBSD
+are known to provide such headers, other systems might have them as well.
+*/
+#if defined(__linux__) || __TBB_has_include(<linux/futex.h>)
+#include <linux/futex.h>
+#elif defined(__OpenBSD__) || __TBB_has_include(<sys/futex.h>)
+#error #include <sys/futex.h>
+#endif
+
+#include <climits>
+#include <cerrno>
+
+/*
+Some systems might not define the macros or may use different names. In such a case we expect
+the actual parameter values to match Linux: 0 for wait, 1 for wake.
+*/
+#if defined(FUTEX_WAIT_PRIVATE)
+#define __TBB_FUTEX_WAIT FUTEX_WAIT_PRIVATE
+#elif defined(FUTEX_WAIT)
+#define __TBB_FUTEX_WAIT FUTEX_WAIT
+#else
+#define __TBB_FUTEX_WAIT 0
+#endif
+
+#if defined(FUTEX_WAKE_PRIVATE)
+#define __TBB_FUTEX_WAKE FUTEX_WAKE_PRIVATE
+#elif defined(FUTEX_WAKE)
+#define __TBB_FUTEX_WAKE FUTEX_WAKE
+#else
+#define __TBB_FUTEX_WAKE 1
+#endif
+
+#endif // SYS_futex
+#endif // __linux__ || __FreeBSD__ || __NetBSD__ || __OpenBSD__
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Futex implementation
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if __TBB_USE_FUTEX
+
+static inline int futex_wait( void *futex, int comparand ) {
+ int r = ::syscall( SYS_futex,futex,__TBB_FUTEX_WAIT,comparand,NULL,NULL,0 );
+#if TBB_USE_ASSERT
+ int e = errno;
+ __TBB_ASSERT( r==0||r==EWOULDBLOCK||(r==-1&&(e==EAGAIN||e==EINTR)), "futex_wait failed." );
+#endif /* TBB_USE_ASSERT */
+ return r;
+}
+
+static inline int futex_wakeup_one( void *futex ) {
+ int r = ::syscall( SYS_futex,futex,__TBB_FUTEX_WAKE,1,NULL,NULL,0 );
+ __TBB_ASSERT( r==0||r==1, "futex_wakeup_one: more than one thread woken up?" );
+ return r;
+}
+
+// Additional possible methods that are not required right now
+// static inline int futex_wakeup_all( void *futex ) {
+// int r = ::syscall( SYS_futex,futex,__TBB_FUTEX_WAKE,INT_MAX,NULL,NULL,0 );
+// __TBB_ASSERT( r>=0, "futex_wakeup_all: error in waking up threads" );
+// return r;
+// }
+
+#endif // __TBB_USE_FUTEX
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+#if _WIN32||_WIN64
+typedef LONG sem_count_t;
+//! Edsger Dijkstra's counting semaphore
+class semaphore : no_copy {
+ static const int max_semaphore_cnt = MAXLONG;
+public:
+ //! ctor
+ semaphore(size_t start_cnt_ = 0) {init_semaphore(start_cnt_);}
+ //! dtor
+ ~semaphore() {CloseHandle( sem );}
+ //! wait/acquire
+ void P() {WaitForSingleObjectEx( sem, INFINITE, FALSE );}
+ //! post/release
+ void V() {ReleaseSemaphore( sem, 1, NULL );}
+private:
+ HANDLE sem;
+ void init_semaphore(size_t start_cnt_) {
+ sem = CreateSemaphoreEx( NULL, LONG(start_cnt_), max_semaphore_cnt, NULL, 0, SEMAPHORE_ALL_ACCESS );
+ }
+};
+#elif __APPLE__
+//! Edsger Dijkstra's counting semaphore
+class semaphore : no_copy {
+public:
+ //! ctor
+ semaphore(int start_cnt_ = 0) : sem(start_cnt_) { init_semaphore(start_cnt_); }
+ //! dtor
+ ~semaphore() {
+ kern_return_t ret = semaphore_destroy( mach_task_self(), sem );
+ __TBB_ASSERT_EX( ret==err_none, NULL );
+ }
+ //! wait/acquire
+ void P() {
+ int ret;
+ do {
+ ret = semaphore_wait( sem );
+ } while( ret==KERN_ABORTED );
+ __TBB_ASSERT( ret==KERN_SUCCESS, "semaphore_wait() failed" );
+ }
+ //! post/release
+ void V() { semaphore_signal( sem ); }
+private:
+ semaphore_t sem;
+ void init_semaphore(int start_cnt_) {
+ kern_return_t ret = semaphore_create( mach_task_self(), &sem, SYNC_POLICY_FIFO, start_cnt_ );
+ __TBB_ASSERT_EX( ret==err_none, "failed to create a semaphore" );
+ }
+};
+#else /* Linux/Unix */
+typedef uint32_t sem_count_t;
+//! Edsger Dijkstra's counting semaphore
+class semaphore : no_copy {
+public:
+ //! ctor
+ semaphore(int start_cnt_ = 0 ) { init_semaphore( start_cnt_ ); }
+
+ //! dtor
+ ~semaphore() {
+ int ret = sem_destroy( &sem );
+ __TBB_ASSERT_EX( !ret, NULL );
+ }
+ //! wait/acquire
+ void P() {
+ while( sem_wait( &sem )!=0 )
+ __TBB_ASSERT( errno==EINTR, NULL );
+ }
+ //! post/release
+ void V() { sem_post( &sem ); }
+private:
+ sem_t sem;
+ void init_semaphore(int start_cnt_) {
+ int ret = sem_init( &sem, /*shared among threads*/ 0, start_cnt_ );
+ __TBB_ASSERT_EX( !ret, NULL );
+ }
+};
+#endif /* _WIN32||_WIN64 */
+
+
+//! For performance reasons, we want a specialized binary_semaphore
+#if _WIN32||_WIN64
+#if !__TBB_USE_SRWLOCK
+//! binary_semaphore for concurrent_monitor
+class binary_semaphore : no_copy {
+public:
+ //! ctor
+ binary_semaphore() { my_sem = CreateEventEx( NULL, NULL, 0, EVENT_ALL_ACCESS ); }
+ //! dtor
+ ~binary_semaphore() { CloseHandle( my_sem ); }
+ //! wait/acquire
+ void P() { WaitForSingleObjectEx( my_sem, INFINITE, FALSE ); }
+ //! post/release
+ void V() { SetEvent( my_sem ); }
+private:
+ HANDLE my_sem;
+};
+#else /* __TBB_USE_SRWLOCK */
+
+union srwl_or_handle {
+ SRWLOCK lock;
+ HANDLE h;
+};
+
+//! binary_semaphore for concurrent_monitor
+class binary_semaphore : no_copy {
+public:
+ //! ctor
+ binary_semaphore();
+ //! dtor
+ ~binary_semaphore();
+ //! wait/acquire
+ void P();
+ //! post/release
+ void V();
+private:
+ srwl_or_handle my_sem;
+};
+#endif /* !__TBB_USE_SRWLOCK */
+#elif __APPLE__
+//! binary_semaphore for concurrent monitor
+class binary_semaphore : no_copy {
+public:
+ //! ctor
+ binary_semaphore() : my_sem(0) {
+ kern_return_t ret = semaphore_create( mach_task_self(), &my_sem, SYNC_POLICY_FIFO, 0 );
+ __TBB_ASSERT_EX( ret==err_none, "failed to create a semaphore" );
+ }
+ //! dtor
+ ~binary_semaphore() {
+ kern_return_t ret = semaphore_destroy( mach_task_self(), my_sem );
+ __TBB_ASSERT_EX( ret==err_none, NULL );
+ }
+ //! wait/acquire
+ void P() {
+ int ret;
+ do {
+ ret = semaphore_wait( my_sem );
+ } while( ret==KERN_ABORTED );
+ __TBB_ASSERT( ret==KERN_SUCCESS, "semaphore_wait() failed" );
+ }
+ //! post/release
+ void V() { semaphore_signal( my_sem ); }
+private:
+ semaphore_t my_sem;
+};
+#else /* Linux/Unix */
+
+#if __TBB_USE_FUTEX
+class binary_semaphore : no_copy {
+// The implementation is equivalent to the "Mutex, Take 3" one
+// in the paper "Futexes Are Tricky" by Ulrich Drepper
+public:
+ //! ctor
+ binary_semaphore() { my_sem = 1; }
+ //! dtor
+ ~binary_semaphore() {}
+ //! wait/acquire
+ void P() {
+ int s = 0;
+ if( !my_sem.compare_exchange_strong( s, 1 ) ) {
+ if( s!=2 )
+ s = my_sem.exchange( 2 );
+ while( s!=0 ) { // This loop deals with spurious wakeup
+ futex_wait( &my_sem, 2 );
+ s = my_sem.exchange( 2 );
+ }
+ }
+ }
+ //! post/release
+ void V() {
+ __TBB_ASSERT( my_sem.load(std::memory_order_relaxed)>=1, "multiple V()'s in a row?" );
+ if( my_sem.exchange( 0 )==2 )
+ futex_wakeup_one( &my_sem );
+ }
+private:
+ std::atomic<int> my_sem; // 0 - open; 1 - closed, no waits; 2 - closed, possible waits
+};
+#else
+typedef uint32_t sem_count_t;
+//! binary_semaphore for concurrent monitor
+class binary_semaphore : no_copy {
+public:
+ //! ctor
+ binary_semaphore() {
+ int ret = sem_init( &my_sem, /*shared among threads*/ 0, 0 );
+ __TBB_ASSERT_EX( !ret, NULL );
+ }
+ //! dtor
+ ~binary_semaphore() {
+ int ret = sem_destroy( &my_sem );
+ __TBB_ASSERT_EX( !ret, NULL );
+ }
+ //! wait/acquire
+ void P() {
+ while( sem_wait( &my_sem )!=0 )
+ __TBB_ASSERT( errno==EINTR, NULL );
+ }
+ //! post/release
+ void V() { sem_post( &my_sem ); }
+private:
+ sem_t my_sem;
+};
+#endif /* __TBB_USE_FUTEX */
+#endif /* _WIN32||_WIN64 */
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
+
+#endif /* __TBB_semaphore_H */
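For readers who have not seen the "Mutex, Take 3" scheme from Ulrich Drepper's "Futexes Are Tricky", here is a typical two-thread trace of the futex-based binary_semaphore above, with my_sem moving through its three states (0 = open, 1 = closed with no waiters, 2 = closed with possible waiters). This is only a walk-through of the code above, not additional API:

    Initial state: my_sem == 1 (constructed closed, nobody waiting)

    Thread W calls P():
        compare_exchange_strong(0 -> 1) fails (value is 1), so s == 1
        s != 2, hence s = exchange(2)         -> my_sem == 2, s == 1
        s != 0, hence futex_wait(&my_sem, 2)  -> W sleeps while the value is still 2

    Thread S calls V():
        exchange(0) returns 2                 -> my_sem == 0; 2 meant "possible waiters"
        futex_wakeup_one(&my_sem)             -> W is woken

    Thread W resumes inside P():
        s = exchange(2)                       -> s == 0, the loop exits; my_sem == 2 again

After W returns from P() the value is 2 rather than 1, so the next V() issues one possibly spurious futex_wakeup_one(); that is harmless and is the price the scheme pays for never losing a wakeup.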
diff --git a/contrib/libs/tbb/src/tbb/small_object_pool.cpp b/contrib/libs/tbb/src/tbb/small_object_pool.cpp
new file mode 100644
index 0000000000..28d11d011d
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/small_object_pool.cpp
@@ -0,0 +1,154 @@
+/*
+ Copyright (c) 2020-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "oneapi/tbb/cache_aligned_allocator.h"
+#include "oneapi/tbb/detail/_small_object_pool.h"
+#include "oneapi/tbb/detail/_task.h"
+#include "governor.h"
+#include "thread_data.h"
+#include "task_dispatcher.h"
+
+#include <cstddef>
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+small_object_pool_impl::small_object* const small_object_pool_impl::dead_public_list =
+ reinterpret_cast<small_object_pool_impl::small_object*>(1);
+
+void* __TBB_EXPORTED_FUNC allocate(d1::small_object_pool*& allocator, std::size_t number_of_bytes, const d1::execution_data& ed) {
+ auto& tls = static_cast<const execution_data_ext&>(ed).task_disp->get_thread_data();
+ auto pool = tls.my_small_object_pool;
+ return pool->allocate_impl(allocator, number_of_bytes);
+}
+
+void* __TBB_EXPORTED_FUNC allocate(d1::small_object_pool*& allocator, std::size_t number_of_bytes) {
+ // TODO: optimize if the allocator contains a valid pool.
+ auto tls = governor::get_thread_data();
+ auto pool = tls->my_small_object_pool;
+ return pool->allocate_impl(allocator, number_of_bytes);
+}
+
+void* small_object_pool_impl::allocate_impl(d1::small_object_pool*& allocator, std::size_t number_of_bytes)
+{
+ small_object* obj{nullptr};
+
+ if (number_of_bytes <= small_object_size) {
+ if (m_private_list) {
+ obj = m_private_list;
+ m_private_list = m_private_list->next;
+ } else if (m_public_list.load(std::memory_order_relaxed)) {
+ // No fence required for the read of m_public_list above, because std::atomic::exchange() has a fence.
+ obj = m_public_list.exchange(nullptr);
+ __TBB_ASSERT( obj, "another thread emptied m_public_list" );
+ m_private_list = obj->next;
+ } else {
+ obj = new (cache_aligned_allocate(small_object_size)) small_object{nullptr};
+ ++m_private_counter;
+ }
+ } else {
+ obj = new (cache_aligned_allocate(number_of_bytes)) small_object{nullptr};
+ }
+ allocator = this;
+
+ // Return uninitialized memory for further construction on the user side.
+ obj->~small_object();
+ return obj;
+}
+
+void __TBB_EXPORTED_FUNC deallocate(d1::small_object_pool& allocator, void* ptr, std::size_t number_of_bytes) {
+ auto pool = static_cast<small_object_pool_impl*>(&allocator);
+ auto tls = governor::get_thread_data();
+ pool->deallocate_impl(ptr, number_of_bytes, *tls);
+}
+
+void __TBB_EXPORTED_FUNC deallocate(d1::small_object_pool& allocator, void* ptr, std::size_t number_of_bytes, const d1::execution_data& ed) {
+ auto& tls = static_cast<const execution_data_ext&>(ed).task_disp->get_thread_data();
+ auto pool = static_cast<small_object_pool_impl*>(&allocator);
+ pool->deallocate_impl(ptr, number_of_bytes, tls);
+}
+
+void small_object_pool_impl::deallocate_impl(void* ptr, std::size_t number_of_bytes, thread_data& td) {
+ __TBB_ASSERT(ptr != nullptr, "pointer to deallocate should not be null");
+ __TBB_ASSERT(number_of_bytes >= sizeof(small_object), "number of bytes should be at least sizeof(small_object)");
+
+ if (number_of_bytes <= small_object_size) {
+ auto obj = new (ptr) small_object{nullptr};
+ if (td.my_small_object_pool == this) {
+ obj->next = m_private_list;
+ m_private_list = obj;
+ } else {
+ auto old_public_list = m_public_list.load(std::memory_order_relaxed);
+
+ for (;;) {
+ if (old_public_list == dead_public_list) {
+ obj->~small_object();
+ cache_aligned_deallocate(obj);
+ if (++m_public_counter == 0)
+ {
+ this->~small_object_pool_impl();
+ cache_aligned_deallocate(this);
+ }
+ break;
+ }
+ obj->next = old_public_list;
+ if (m_public_list.compare_exchange_strong(old_public_list, obj)) {
+ break;
+ }
+ }
+ }
+ } else {
+ cache_aligned_deallocate(ptr);
+ }
+}
+
+std::int64_t small_object_pool_impl::cleanup_list(small_object* list)
+{
+ std::int64_t removed_count{};
+
+ while (list) {
+ small_object* current = list;
+ list = list->next;
+ current->~small_object();
+ cache_aligned_deallocate(current);
+ ++removed_count;
+ }
+ return removed_count;
+}
+
+void small_object_pool_impl::destroy()
+{
+ // clean up private list and subtract the removed count from private counter
+ m_private_counter -= cleanup_list(m_private_list);
+ // Grab public list and place dead mark
+ small_object* public_list = m_public_list.exchange(dead_public_list);
+ // clean up the public list and (intentionally) subtract the removed count from the private counter
+ m_private_counter -= cleanup_list(public_list);
+ __TBB_ASSERT(m_private_counter >= 0, "Private counter may not be less than 0");
+ // Equivalent to fetch_sub(m_private_counter) - m_private_counter: operator-= atomically
+ // returns the new value, so m_private_counter is never accessed again after the subtraction.
+ auto new_value = m_public_counter -= m_private_counter;
+ // check if this method is responsible to clean up the resources
+ if (new_value == 0) {
+ this->~small_object_pool_impl();
+ cache_aligned_deallocate(this);
+ }
+}
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
diff --git a/contrib/libs/tbb/src/tbb/small_object_pool_impl.h b/contrib/libs/tbb/src/tbb/small_object_pool_impl.h
new file mode 100644
index 0000000000..a6b664beab
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/small_object_pool_impl.h
@@ -0,0 +1,59 @@
+/*
+ Copyright (c) 2020-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_small_object_pool_impl_H
+#define __TBB_small_object_pool_impl_H
+
+#include "oneapi/tbb/detail/_small_object_pool.h"
+#include "oneapi/tbb/detail/_utils.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <atomic>
+
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+class thread_data;
+
+class small_object_pool_impl : public d1::small_object_pool
+{
+ static constexpr std::size_t small_object_size = 256;
+ struct small_object {
+ small_object* next;
+ };
+ static small_object* const dead_public_list;
+public:
+ void* allocate_impl(small_object_pool*& allocator, std::size_t number_of_bytes);
+ void deallocate_impl(void* ptr, std::size_t number_of_bytes, thread_data& td);
+ void destroy();
+private:
+ static std::int64_t cleanup_list(small_object* list);
+ ~small_object_pool_impl() = default;
+private:
+ alignas(max_nfs_size) small_object* m_private_list;
+ std::int64_t m_private_counter{};
+ alignas(max_nfs_size) std::atomic<small_object*> m_public_list;
+ std::atomic<std::int64_t> m_public_counter{};
+};
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
+
+#endif /* __TBB_small_object_pool_impl_H */
diff --git a/contrib/libs/tbb/src/tbb/task.cpp b/contrib/libs/tbb/src/tbb/task.cpp
new file mode 100644
index 0000000000..129614447a
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/task.cpp
@@ -0,0 +1,225 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+// Do not include task.h directly. Use scheduler_common.h instead
+#include "scheduler_common.h"
+#include "governor.h"
+#include "arena.h"
+#include "thread_data.h"
+#include "task_dispatcher.h"
+#include "waiters.h"
+#include "itt_notify.h"
+
+#include "oneapi/tbb/detail/_task.h"
+#include "oneapi/tbb/partitioner.h"
+#include "oneapi/tbb/task.h"
+
+#include <cstring>
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+//------------------------------------------------------------------------
+// resumable tasks
+//------------------------------------------------------------------------
+#if __TBB_RESUMABLE_TASKS
+
+void suspend(suspend_callback_type suspend_callback, void* user_callback) {
+ thread_data& td = *governor::get_thread_data();
+ td.my_task_dispatcher->suspend(suspend_callback, user_callback);
+ // Do not access td after suspend.
+}
+
+void resume(suspend_point_type* sp) {
+ assert_pointers_valid(sp, sp->m_arena);
+ task_dispatcher& task_disp = sp->m_resume_task.m_target;
+ __TBB_ASSERT(task_disp.m_thread_data == nullptr, nullptr);
+
+ // TODO: remove this work-around
+ // Prolong the arena's lifetime while all coroutines are alive
+ // (otherwise the arena can be destroyed while some tasks are suspended).
+ arena& a = *sp->m_arena;
+ a.my_references += arena::ref_external;
+
+ if (task_disp.m_properties.critical_task_allowed) {
+ // The target is not in the process of executing critical task, so the resume task is not critical.
+ a.my_resume_task_stream.push(&sp->m_resume_task, random_lane_selector(sp->m_random));
+ } else {
+#if __TBB_PREVIEW_CRITICAL_TASKS
+ // The target is in the process of executing critical task, so the resume task is critical.
+ a.my_critical_task_stream.push(&sp->m_resume_task, random_lane_selector(sp->m_random));
+#endif
+ }
+
+ // Do not access target after that point.
+ a.advertise_new_work<arena::wakeup>();
+
+ // Release our reference to my_arena.
+ a.on_thread_leaving<arena::ref_external>();
+}
+
+suspend_point_type* current_suspend_point() {
+ thread_data& td = *governor::get_thread_data();
+ return td.my_task_dispatcher->get_suspend_point();
+}
+
+static task_dispatcher& create_coroutine(thread_data& td) {
+ // We may have some task dispatchers cached
+ task_dispatcher* task_disp = td.my_arena->my_co_cache.pop();
+ if (!task_disp) {
+ void* ptr = cache_aligned_allocate(sizeof(task_dispatcher));
+ task_disp = new(ptr) task_dispatcher(td.my_arena);
+ task_disp->init_suspend_point(td.my_arena, td.my_arena->my_market->worker_stack_size());
+ }
+ // Prolong the arena's lifetime while any coroutine is alive
+ // (otherwise the arena can be destroyed while some tasks are suspended).
+ // TODO: consider behavior if there are more than 4K external references.
+ td.my_arena->my_references += arena::ref_external;
+ return *task_disp;
+}
+
+void task_dispatcher::suspend(suspend_callback_type suspend_callback, void* user_callback) {
+ __TBB_ASSERT(suspend_callback != nullptr, nullptr);
+ __TBB_ASSERT(user_callback != nullptr, nullptr);
+ __TBB_ASSERT(m_thread_data != nullptr, nullptr);
+
+ arena_slot* slot = m_thread_data->my_arena_slot;
+ __TBB_ASSERT(slot != nullptr, nullptr);
+
+ task_dispatcher& default_task_disp = slot->default_task_dispatcher();
+ // TODO: simplify the next line, e.g. is_task_dispatcher_recalled( task_dispatcher& )
+ bool is_recalled = default_task_disp.get_suspend_point()->m_is_owner_recalled.load(std::memory_order_acquire);
+ task_dispatcher& target = is_recalled ? default_task_disp : create_coroutine(*m_thread_data);
+
+ thread_data::suspend_callback_wrapper callback = { suspend_callback, user_callback, get_suspend_point() };
+ m_thread_data->set_post_resume_action(thread_data::post_resume_action::callback, &callback);
+ resume(target);
+
+ if (m_properties.outermost) {
+ recall_point();
+ }
+}
+
+void task_dispatcher::resume(task_dispatcher& target) {
+ // Do not create non-trivial objects on the stack of this function. They might never be destroyed
+ {
+ thread_data* td = m_thread_data;
+        __TBB_ASSERT(&target != this, "We cannot resume ourselves");
+        __TBB_ASSERT(td != nullptr, "This task dispatcher must be attached to thread data");
+ __TBB_ASSERT(td->my_task_dispatcher == this, "Thread data must be attached to this task dispatcher");
+ __TBB_ASSERT(td->my_post_resume_action != thread_data::post_resume_action::none, "The post resume action must be set");
+ __TBB_ASSERT(td->my_post_resume_arg, "The post resume action must have an argument");
+
+ // Change the task dispatcher
+ td->detach_task_dispatcher();
+ td->attach_task_dispatcher(target);
+ }
+ __TBB_ASSERT(m_suspend_point != nullptr, "Suspend point must be created");
+ __TBB_ASSERT(target.m_suspend_point != nullptr, "Suspend point must be created");
+ // Swap to the target coroutine.
+ m_suspend_point->m_co_context.resume(target.m_suspend_point->m_co_context);
+    // Note that m_thread_data can change after the resume
+ {
+ thread_data* td = m_thread_data;
+ __TBB_ASSERT(td != nullptr, "This task dispatcher must be attach to a thread data");
+ __TBB_ASSERT(td->my_task_dispatcher == this, "Thread data must be attached to this task dispatcher");
+ td->do_post_resume_action();
+
+        // Remove the recall flag if the thread is in its original task dispatcher
+ arena_slot* slot = td->my_arena_slot;
+ __TBB_ASSERT(slot != nullptr, nullptr);
+ if (this == slot->my_default_task_dispatcher) {
+ __TBB_ASSERT(m_suspend_point != nullptr, nullptr);
+ m_suspend_point->m_is_owner_recalled.store(false, std::memory_order_relaxed);
+ }
+ }
+}
+
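+// Post-resume action protocol (summarized from the switch below). Each action is paired with
+// an argument stored via set_post_resume_action() before the context switch:
+//   register_waiter - extended_concurrent_monitor::resume_context* to notify;
+//   resume          - suspend_point_type* to resume;
+//   callback        - suspend_callback_wrapper* that invokes the user callback;
+//   cleanup         - task_dispatcher* whose coroutine is returned to the arena's cache;
+//   notify          - std::atomic<bool>* owner-recall flag to set.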
+void thread_data::do_post_resume_action() {
+ __TBB_ASSERT(my_post_resume_action != thread_data::post_resume_action::none, "The post resume action must be set");
+ __TBB_ASSERT(my_post_resume_arg, "The post resume action must have an argument");
+
+ switch (my_post_resume_action) {
+ case post_resume_action::register_waiter:
+ {
+ static_cast<extended_concurrent_monitor::resume_context*>(my_post_resume_arg)->notify();
+ break;
+ }
+ case post_resume_action::resume:
+ {
+ r1::resume(static_cast<suspend_point_type*>(my_post_resume_arg));
+ break;
+ }
+ case post_resume_action::callback:
+ {
+ suspend_callback_wrapper callback = *static_cast<suspend_callback_wrapper*>(my_post_resume_arg);
+ callback();
+ break;
+ }
+ case post_resume_action::cleanup:
+ {
+ task_dispatcher* to_cleanup = static_cast<task_dispatcher*>(my_post_resume_arg);
+ // Release coroutine's reference to my_arena.
+ my_arena->on_thread_leaving<arena::ref_external>();
+ // Cache the coroutine for possible later re-usage
+ my_arena->my_co_cache.push(to_cleanup);
+ break;
+ }
+ case post_resume_action::notify:
+ {
+ std::atomic<bool>& owner_recall_flag = *static_cast<std::atomic<bool>*>(my_post_resume_arg);
+ owner_recall_flag.store(true, std::memory_order_release);
+ // Do not access recall_flag because it can be destroyed after the notification.
+ break;
+ }
+ default:
+ __TBB_ASSERT(false, "Unknown post resume action");
+ }
+
+ my_post_resume_action = post_resume_action::none;
+ my_post_resume_arg = nullptr;
+}
+
+#else
+
+void suspend(suspend_callback_type, void*) {
+ __TBB_ASSERT_RELEASE(false, "Resumable tasks are unsupported on this platform");
+}
+
+void resume(suspend_point_type*) {
+ __TBB_ASSERT_RELEASE(false, "Resumable tasks are unsupported on this platform");
+}
+
+suspend_point_type* current_suspend_point() {
+ __TBB_ASSERT_RELEASE(false, "Resumable tasks are unsupported on this platform");
+ return nullptr;
+}
+
+#endif /* __TBB_RESUMABLE_TASKS */
+
+void notify_waiters(std::uintptr_t wait_ctx_addr) {
+ auto is_related_wait_ctx = [&] (extended_context context) {
+ return wait_ctx_addr == context.my_uniq_addr;
+ };
+
+ r1::governor::get_thread_data()->my_arena->my_market->get_wait_list().notify(is_related_wait_ctx);
+}
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
+
diff --git a/contrib/libs/tbb/src/tbb/task_dispatcher.cpp b/contrib/libs/tbb/src/tbb/task_dispatcher.cpp
new file mode 100644
index 0000000000..86818af1d1
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/task_dispatcher.cpp
@@ -0,0 +1,240 @@
+/*
+ Copyright (c) 2020-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "task_dispatcher.h"
+#include "waiters.h"
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+static inline void spawn_and_notify(d1::task& t, arena_slot* slot, arena* a) {
+ slot->spawn(t);
+ a->advertise_new_work<arena::work_spawned>();
+ // TODO: TBB_REVAMP_TODO slot->assert_task_pool_valid();
+}
+
+void __TBB_EXPORTED_FUNC spawn(d1::task& t, d1::task_group_context& ctx) {
+ thread_data* tls = governor::get_thread_data();
+ task_group_context_impl::bind_to(ctx, tls);
+ arena* a = tls->my_arena;
+ arena_slot* slot = tls->my_arena_slot;
+ // Capture current context
+ task_accessor::context(t) = &ctx;
+ // Mark isolation
+ task_accessor::isolation(t) = tls->my_task_dispatcher->m_execute_data_ext.isolation;
+ spawn_and_notify(t, slot, a);
+}
+
+void __TBB_EXPORTED_FUNC spawn(d1::task& t, d1::task_group_context& ctx, d1::slot_id id) {
+ thread_data* tls = governor::get_thread_data();
+ task_group_context_impl::bind_to(ctx, tls);
+ arena* a = tls->my_arena;
+ arena_slot* slot = tls->my_arena_slot;
+ execution_data_ext& ed = tls->my_task_dispatcher->m_execute_data_ext;
+
+ // Capture context
+ task_accessor::context(t) = &ctx;
+ // Mark isolation
+ task_accessor::isolation(t) = ed.isolation;
+
+ if ( id != d1::no_slot && id != tls->my_arena_index ) {
+ // Allocate proxy task
+ d1::small_object_allocator alloc{};
+ auto proxy = alloc.new_object<task_proxy>(static_cast<d1::execution_data&>(ed));
+ // Mark as a proxy
+ task_accessor::set_proxy_trait(*proxy);
+ // Mark isolation for the proxy task
+ task_accessor::isolation(*proxy) = ed.isolation;
+ // Deallocation hint (tls) from the task allocator
+ proxy->allocator = alloc;
+ proxy->slot = id;
+ proxy->outbox = &a->mailbox(id);
+ // Mark proxy as present in both locations (sender's task pool and destination mailbox)
+ proxy->task_and_tag = intptr_t(&t) | task_proxy::location_mask;
+ // Mail the proxy - after this point t may be destroyed by another thread at any moment.
+ proxy->outbox->push(proxy);
+ // Spawn proxy to the local task pool
+ spawn_and_notify(*proxy, slot, a);
+ } else {
+ spawn_and_notify(t, slot, a);
+ }
+}
+
+void __TBB_EXPORTED_FUNC submit(d1::task& t, d1::task_group_context& ctx, arena* a, std::uintptr_t as_critical) {
+ suppress_unused_warning(as_critical);
+ assert_pointer_valid(a);
+ thread_data& tls = *governor::get_thread_data();
+
+    // TODO revamp: for each use case investigate the necessity of making this call
+ task_group_context_impl::bind_to(ctx, &tls);
+ task_accessor::context(t) = &ctx;
+ // TODO revamp: consider respecting task isolation if this call is being made by external thread
+ task_accessor::isolation(t) = tls.my_task_dispatcher->m_execute_data_ext.isolation;
+
+ // TODO: consider code refactoring when lane selection mechanism is unified.
+
+ if ( tls.is_attached_to(a) ) {
+ arena_slot* slot = tls.my_arena_slot;
+#if __TBB_PREVIEW_CRITICAL_TASKS
+ if( as_critical ) {
+ a->my_critical_task_stream.push( &t, subsequent_lane_selector(slot->critical_hint()) );
+ } else
+#endif
+ {
+ slot->spawn(t);
+ }
+ } else {
+ random_lane_selector lane_selector{tls.my_random};
+#if !__TBB_PREVIEW_CRITICAL_TASKS
+ suppress_unused_warning(as_critical);
+#else
+ if ( as_critical ) {
+ a->my_critical_task_stream.push( &t, lane_selector );
+ } else
+#endif
+ {
+ // Avoid joining the arena the thread is not currently in.
+ a->my_fifo_task_stream.push( &t, lane_selector );
+ }
+ }
+ // It is assumed that some thread will explicitly wait in the arena the task is submitted
+ // into. Therefore, no need to utilize mandatory concurrency here.
+ a->advertise_new_work<arena::work_spawned>();
+}
+
+void __TBB_EXPORTED_FUNC execute_and_wait(d1::task& t, d1::task_group_context& t_ctx, d1::wait_context& wait_ctx, d1::task_group_context& w_ctx) {
+ task_accessor::context(t) = &t_ctx;
+ task_dispatcher::execute_and_wait(&t, wait_ctx, w_ctx);
+}
+
+void __TBB_EXPORTED_FUNC wait(d1::wait_context& wait_ctx, d1::task_group_context& w_ctx) {
+ // Enter the task dispatch loop without a task
+ task_dispatcher::execute_and_wait(nullptr, wait_ctx, w_ctx);
+}
+
+d1::slot_id __TBB_EXPORTED_FUNC execution_slot(const d1::execution_data* ed) {
+ if (ed) {
+ const execution_data_ext* ed_ext = static_cast<const execution_data_ext*>(ed);
+ assert_pointers_valid(ed_ext->task_disp, ed_ext->task_disp->m_thread_data);
+ return ed_ext->task_disp->m_thread_data->my_arena_index;
+ } else {
+ thread_data* td = governor::get_thread_data_if_initialized();
+ return td ? int(td->my_arena_index) : -1;
+ }
+}
+
+d1::task_group_context* __TBB_EXPORTED_FUNC current_context() {
+ thread_data* td = governor::get_thread_data();
+ assert_pointers_valid(td, td->my_task_dispatcher);
+
+ task_dispatcher* task_disp = td->my_task_dispatcher;
+ if (task_disp->m_properties.outermost) {
+        // No task is being executed, so there is no execute_data.
+ return nullptr;
+ } else {
+ return td->my_task_dispatcher->m_execute_data_ext.context;
+ }
+}
+
+void task_dispatcher::execute_and_wait(d1::task* t, d1::wait_context& wait_ctx, d1::task_group_context& w_ctx) {
+ // Get an associated task dispatcher
+ thread_data* tls = governor::get_thread_data();
+ __TBB_ASSERT(tls->my_task_dispatcher != nullptr, nullptr);
+ task_dispatcher& local_td = *tls->my_task_dispatcher;
+
+ // TODO: factor out the binding to execute_and_wait_impl
+ if (t) {
+ task_group_context_impl::bind_to(*task_accessor::context(*t), tls);
+ // Propagate the isolation to the task executed without spawn.
+ task_accessor::isolation(*t) = tls->my_task_dispatcher->m_execute_data_ext.isolation;
+ }
+
+ // Waiting on special object tied to a waiting thread.
+ external_waiter waiter{ *tls->my_arena, wait_ctx };
+ t = local_td.local_wait_for_all(t, waiter);
+ __TBB_ASSERT_EX(t == nullptr, "External waiter must not leave dispatch loop with a task");
+
+    // The external thread must not leave the dispatch loop in the idle state
+ if (local_td.m_thread_data->my_inbox.is_idle_state(true)) {
+ local_td.m_thread_data->my_inbox.set_is_idle(false);
+ }
+
+ if (w_ctx.my_exception) {
+ __TBB_ASSERT(w_ctx.is_group_execution_cancelled(), "The task group context with an exception should be canceled.");
+ w_ctx.my_exception->throw_self();
+ }
+}
+
+#if __TBB_RESUMABLE_TASKS
+
+#if _WIN32
+/* [[noreturn]] */ void __stdcall co_local_wait_for_all(void* arg) noexcept
+#else
+/* [[noreturn]] */ void co_local_wait_for_all(void* arg) noexcept
+#endif
+{
+ // Do not create non-trivial objects on the stack of this function. They will never be destroyed.
+ __TBB_ASSERT(arg != nullptr, nullptr);
+ task_dispatcher& task_disp = *static_cast<task_dispatcher*>(arg);
+
+ assert_pointers_valid(task_disp.m_thread_data, task_disp.m_thread_data->my_arena);
+ task_disp.set_stealing_threshold(task_disp.m_thread_data->my_arena->calculate_stealing_threshold());
+ __TBB_ASSERT(task_disp.can_steal(), nullptr);
+ task_disp.co_local_wait_for_all();
+ // This code is unreachable
+}
+
+/* [[noreturn]] */ void task_dispatcher::co_local_wait_for_all() noexcept {
+ // Do not create non-trivial objects on the stack of this function. They will never be destroyed.
+ assert_pointer_valid(m_thread_data);
+
+ // Basically calls the user callback passed to the tbb::task::suspend function
+ m_thread_data->do_post_resume_action();
+
+    // Endless loop because the coroutine can be reused
+ for (;;) {
+ arena* a = m_thread_data->my_arena;
+ coroutine_waiter waiter(*a);
+ d1::task* resume_task = local_wait_for_all(nullptr, waiter);
+ assert_task_valid(resume_task);
+ __TBB_ASSERT(this == m_thread_data->my_task_dispatcher, nullptr);
+
+ m_thread_data->set_post_resume_action(thread_data::post_resume_action::cleanup, this);
+ resume(static_cast<suspend_point_type::resume_task*>(resume_task)->m_target);
+ }
+ // This code is unreachable
+}
+
+d1::suspend_point task_dispatcher::get_suspend_point() {
+ if (m_suspend_point == nullptr) {
+ assert_pointer_valid(m_thread_data);
+ // 0 means that we attach this task dispatcher to the current stack
+ init_suspend_point(m_thread_data->my_arena, 0);
+ }
+ assert_pointer_valid(m_suspend_point);
+ return m_suspend_point;
+}
+void task_dispatcher::init_suspend_point(arena* a, std::size_t stack_size) {
+ __TBB_ASSERT(m_suspend_point == nullptr, nullptr);
+ m_suspend_point = new(cache_aligned_allocate(sizeof(suspend_point_type)))
+ suspend_point_type(a, stack_size, *this);
+}
+#endif /* __TBB_RESUMABLE_TASKS */
+} // namespace r1
+} // namespace detail
+} // namespace tbb
+
diff --git a/contrib/libs/tbb/src/tbb/task_dispatcher.h b/contrib/libs/tbb/src/tbb/task_dispatcher.h
new file mode 100644
index 0000000000..54a6c0d934
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/task_dispatcher.h
@@ -0,0 +1,465 @@
+/*
+ Copyright (c) 2020-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef _TBB_task_dispatcher_H
+#define _TBB_task_dispatcher_H
+
+#include "oneapi/tbb/detail/_utils.h"
+#include "oneapi/tbb/detail/_task.h"
+#include "oneapi/tbb/global_control.h"
+
+#include "scheduler_common.h"
+#include "waiters.h"
+#include "arena_slot.h"
+#include "arena.h"
+#include "thread_data.h"
+#include "mailbox.h"
+#include "itt_notify.h"
+#include "concurrent_monitor.h"
+
+#include <atomic>
+
+#if !__TBB_CPU_CTL_ENV_PRESENT
+#include <fenv.h>
+#endif
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+inline d1::task* get_self_recall_task(arena_slot& slot) {
+ suppress_unused_warning(slot);
+ d1::task* t = nullptr;
+#if __TBB_RESUMABLE_TASKS
+ suspend_point_type* sp = slot.default_task_dispatcher().m_suspend_point;
+ if (sp && sp->m_is_owner_recalled.load(std::memory_order_acquire)) {
+ t = &sp->m_resume_task;
+ __TBB_ASSERT(sp->m_resume_task.m_target.m_thread_data == nullptr, nullptr);
+ }
+#endif /* __TBB_RESUMABLE_TASKS */
+ return t;
+}
+
+// Defined in exception.cpp
+/*[[noreturn]]*/void do_throw_noexcept(void (*throw_exception)()) noexcept;
+
+//------------------------------------------------------------------------
+// Suspend point
+//------------------------------------------------------------------------
+#if __TBB_RESUMABLE_TASKS
+
+inline d1::task* suspend_point_type::resume_task::execute(d1::execution_data& ed) {
+ execution_data_ext& ed_ext = static_cast<execution_data_ext&>(ed);
+
+ if (ed_ext.wait_ctx) {
+ extended_concurrent_monitor::resume_context monitor_node{{std::uintptr_t(ed_ext.wait_ctx), nullptr}, ed_ext, m_target};
+ // The wait_ctx is present only in external_waiter. In that case we leave the current stack
+ // in the abandoned state to resume when waiting completes.
+ thread_data* td = ed_ext.task_disp->m_thread_data;
+ td->set_post_resume_action(thread_data::post_resume_action::register_waiter, &monitor_node);
+
+ extended_concurrent_monitor& wait_list = td->my_arena->my_market->get_wait_list();
+
+ if (wait_list.wait([&] { return !ed_ext.wait_ctx->continue_execution(); }, monitor_node)) {
+ return nullptr;
+ }
+
+ td->clear_post_resume_action();
+ td->set_post_resume_action(thread_data::post_resume_action::resume, ed_ext.task_disp->get_suspend_point());
+ } else {
+        // If wait_ctx is null, this can only be a worker thread at the outermost level, because
+        // coroutine_waiter interrupts the bypass loop before the resume_task is executed.
+ ed_ext.task_disp->m_thread_data->set_post_resume_action(thread_data::post_resume_action::notify,
+ &ed_ext.task_disp->get_suspend_point()->m_is_owner_recalled);
+ }
+ // Do not access this task because it might be destroyed
+ ed_ext.task_disp->resume(m_target);
+ return nullptr;
+}
+
+inline suspend_point_type::suspend_point_type(arena* a, size_t stack_size, task_dispatcher& task_disp)
+ : m_arena(a)
+ , m_random(this)
+ , m_co_context(stack_size, &task_disp)
+ , m_resume_task(task_disp)
+{
+ assert_pointer_valid(m_arena);
+ assert_pointer_valid(m_arena->my_default_ctx);
+ task_accessor::context(m_resume_task) = m_arena->my_default_ctx;
+ task_accessor::isolation(m_resume_task) = no_isolation;
+ // Initialize the itt_caller for the context of the resume task.
+ // It will be bound to the stack of the first suspend call.
+ task_group_context_impl::bind_to(*task_accessor::context(m_resume_task), task_disp.m_thread_data);
+}
+
+#endif /* __TBB_RESUMABLE_TASKS */
+
+//------------------------------------------------------------------------
+// Task Dispatcher
+//------------------------------------------------------------------------
+inline task_dispatcher::task_dispatcher(arena* a) {
+ m_execute_data_ext.context = a->my_default_ctx;
+ m_execute_data_ext.task_disp = this;
+}
+
+inline bool task_dispatcher::can_steal() {
+ __TBB_ASSERT(m_stealing_threshold != 0, nullptr);
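+    // The address of a local variable approximates the current stack position. Since the stack
+    // grows downwards, stealing is allowed only while this address stays above the precomputed
+    // threshold, i.e. while enough stack space remains to process a stolen task.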
+ stack_anchor_type anchor{};
+ return reinterpret_cast<std::uintptr_t>(&anchor) > m_stealing_threshold;
+}
+
+inline d1::task* task_dispatcher::get_inbox_or_critical_task(
+ execution_data_ext& ed, mail_inbox& inbox, isolation_type isolation, bool critical_allowed)
+{
+ if (inbox.empty())
+ return nullptr;
+ d1::task* result = get_critical_task(nullptr, ed, isolation, critical_allowed);
+ if (result)
+ return result;
+ // Check if there are tasks mailed to this thread via task-to-thread affinity mechanism.
+ result = get_mailbox_task(inbox, ed, isolation);
+ // There is a race with a thread adding a new task (possibly with suitable isolation)
+ // to our mailbox, so the below conditions might result in a false positive.
+ // Then set_is_idle(false) allows that task to be stolen; it's OK.
+ if (isolation != no_isolation && !result && !inbox.empty() && inbox.is_idle_state(true)) {
+        // We have proxy tasks in our mailbox, but the isolation blocks their execution.
+        // So publish the proxy tasks in the mailbox to make them available for stealing from the owner's task pool.
+ inbox.set_is_idle( false );
+ }
+ return result;
+}
+
+inline d1::task* task_dispatcher::get_stream_or_critical_task(
+ execution_data_ext& ed, arena& a, task_stream<front_accessor>& stream, unsigned& hint,
+ isolation_type isolation, bool critical_allowed)
+{
+ if (stream.empty())
+ return nullptr;
+ d1::task* result = get_critical_task(nullptr, ed, isolation, critical_allowed);
+ if (result)
+ return result;
+ return a.get_stream_task(stream, hint);
+}
+
+inline d1::task* task_dispatcher::steal_or_get_critical(
+ execution_data_ext& ed, arena& a, unsigned arena_index, FastRandom& random,
+ isolation_type isolation, bool critical_allowed)
+{
+ if (d1::task* t = a.steal_task(arena_index, random, ed, isolation)) {
+ ed.context = task_accessor::context(*t);
+ ed.isolation = task_accessor::isolation(*t);
+ return get_critical_task(t, ed, isolation, critical_allowed);
+ }
+ return nullptr;
+}
+
+template <bool ITTPossible, typename Waiter>
+d1::task* task_dispatcher::receive_or_steal_task(
+ thread_data& tls, execution_data_ext& ed, Waiter& waiter, isolation_type isolation,
+ bool fifo_allowed, bool critical_allowed)
+{
+ __TBB_ASSERT(governor::is_thread_data_set(&tls), NULL);
+ // Task to return
+ d1::task* t = nullptr;
+ // Get tls data (again)
+ arena& a = *tls.my_arena;
+ arena_slot& slot = *tls.my_arena_slot;
+ unsigned arena_index = tls.my_arena_index;
+ mail_inbox& inbox = tls.my_inbox;
+ task_stream<front_accessor>& resume_stream = a.my_resume_task_stream;
+ unsigned& resume_hint = slot.hint_for_resume_stream;
+ task_stream<front_accessor>& fifo_stream = a.my_fifo_task_stream;
+ unsigned& fifo_hint = slot.hint_for_fifo_stream;
+
+ waiter.reset_wait();
+ // Thread is in idle state now
+ inbox.set_is_idle(true);
+
+ bool stealing_is_allowed = can_steal();
+
+ // Stealing loop mailbox/enqueue/other_slots
+ for (;;) {
+ __TBB_ASSERT(t == nullptr, nullptr);
+ // Check if the resource manager requires our arena to relinquish some threads
+ // For the external thread restore idle state to true after dispatch loop
+ if (!waiter.continue_execution(slot, t)) {
+ __TBB_ASSERT(t == nullptr, nullptr);
+ break;
+ }
+ // Start searching
+ if (t != nullptr) {
+ // continue_execution returned a task
+ }
+ else if ((t = get_inbox_or_critical_task(ed, inbox, isolation, critical_allowed))) {
+ // Successfully got the task from mailbox or critical task
+ }
+ else if ((t = get_stream_or_critical_task(ed, a, resume_stream, resume_hint, isolation, critical_allowed))) {
+ // Successfully got the resume or critical task
+ }
+ else if (fifo_allowed && isolation == no_isolation
+ && (t = get_stream_or_critical_task(ed, a, fifo_stream, fifo_hint, isolation, critical_allowed))) {
+            // Successfully got a task from the starvation-resistant stream. Allowed only at the outermost dispatch level without isolation.
+ }
+ else if (stealing_is_allowed
+ && (t = steal_or_get_critical(ed, a, arena_index, tls.my_random, isolation, critical_allowed))) {
+ // Stole a task from a random arena slot
+ }
+ else {
+ t = get_critical_task(t, ed, isolation, critical_allowed);
+ }
+
+ if (t != nullptr) {
+ ed.context = task_accessor::context(*t);
+ ed.isolation = task_accessor::isolation(*t);
+ a.my_observers.notify_entry_observers(tls.my_last_observer, tls.my_is_worker);
+ break; // Stealing success, end of stealing attempt
+ }
+ // Nothing to do, pause a little.
+ waiter.pause(slot);
+ } // end of nonlocal task retrieval loop
+ if (inbox.is_idle_state(true)) {
+ inbox.set_is_idle(false);
+ }
+ return t;
+}
+
+template <bool ITTPossible, typename Waiter>
+d1::task* task_dispatcher::local_wait_for_all(d1::task* t, Waiter& waiter ) {
+ assert_pointer_valid(m_thread_data);
+ __TBB_ASSERT(m_thread_data->my_task_dispatcher == this, nullptr);
+
+ // Guard an outer/default execution state
+ struct dispatch_loop_guard {
+ task_dispatcher& task_disp;
+ execution_data_ext old_execute_data_ext;
+ properties old_properties;
+
+ ~dispatch_loop_guard() {
+ task_disp.m_execute_data_ext = old_execute_data_ext;
+ task_disp.m_properties = old_properties;
+
+ __TBB_ASSERT(task_disp.m_thread_data && governor::is_thread_data_set(task_disp.m_thread_data), nullptr);
+ __TBB_ASSERT(task_disp.m_thread_data->my_task_dispatcher == &task_disp, nullptr);
+ }
+ } dl_guard{ *this, m_execute_data_ext, m_properties };
+
+ // The context guard to track fp setting and itt tasks.
+ context_guard_helper</*report_tasks=*/ITTPossible> context_guard;
+
+ // Current isolation context
+ const isolation_type isolation = dl_guard.old_execute_data_ext.isolation;
+
+    // Critical work inflection point. Once it turns false, the current execution context has taken
+    // a critical task on a previous stack frame and cannot take more until that critical path is
+    // finished.
+ bool critical_allowed = dl_guard.old_properties.critical_task_allowed;
+
+ // Extended execution data that is used for dispatching.
+ // Base version is passed to the task::execute method.
+ execution_data_ext& ed = m_execute_data_ext;
+ ed.context = t ? task_accessor::context(*t) : nullptr;
+ ed.original_slot = m_thread_data->my_arena_index;
+ ed.affinity_slot = d1::no_slot;
+ ed.task_disp = this;
+ ed.wait_ctx = waiter.wait_ctx();
+
+ m_properties.outermost = false;
+ m_properties.fifo_tasks_allowed = false;
+
+ t = get_critical_task(t, ed, isolation, critical_allowed);
+
+ // Infinite exception loop
+ for (;;) {
+ try {
+ // Main execution loop
+ do {
+ // We assume that bypass tasks are from the same task group.
+ context_guard.set_ctx(ed.context);
+ // Inner level evaluates tasks coming from nesting loops and those returned
+ // by just executed tasks (bypassing spawn or enqueue calls).
+ while (t != nullptr) {
+ assert_task_valid(t);
+ assert_pointer_valid</*alignment = */alignof(void*)>(ed.context);
+ __TBB_ASSERT(ed.context->my_lifetime_state > d1::task_group_context::lifetime_state::locked &&
+ ed.context->my_lifetime_state < d1::task_group_context::lifetime_state::dying, nullptr);
+ __TBB_ASSERT(m_thread_data->my_inbox.is_idle_state(false), nullptr);
+ __TBB_ASSERT(task_accessor::is_resume_task(*t) || isolation == no_isolation || isolation == ed.isolation, nullptr);
+ // Check premature leave
+ if (Waiter::postpone_execution(*t)) {
+ __TBB_ASSERT(task_accessor::is_resume_task(*t) && dl_guard.old_properties.outermost,
+ "Currently, the bypass loop can be interrupted only for resume task on outermost level");
+ return t;
+ }
+ // Copy itt_caller to a stack because the context might be destroyed after t->execute.
+ void* itt_caller = ed.context->my_itt_caller;
+ suppress_unused_warning(itt_caller);
+
+ ITT_CALLEE_ENTER(ITTPossible, t, itt_caller);
+
+ if (ed.context->is_group_execution_cancelled()) {
+ t = t->cancel(ed);
+ } else {
+ t = t->execute(ed);
+ }
+
+ ITT_CALLEE_LEAVE(ITTPossible, itt_caller);
+
+ // The task affinity in execution data is set for affinitized tasks.
+ // So drop it after the task execution.
+ ed.affinity_slot = d1::no_slot;
+ // Reset task owner id for bypassed task
+ ed.original_slot = m_thread_data->my_arena_index;
+ t = get_critical_task(t, ed, isolation, critical_allowed);
+ }
+ __TBB_ASSERT(m_thread_data && governor::is_thread_data_set(m_thread_data), nullptr);
+ __TBB_ASSERT(m_thread_data->my_task_dispatcher == this, nullptr);
+ // When refactoring, pay attention that m_thread_data can be changed after t->execute()
+ __TBB_ASSERT(m_thread_data->my_arena_slot != nullptr, nullptr);
+ arena_slot& slot = *m_thread_data->my_arena_slot;
+ if (!waiter.continue_execution(slot, t)) {
+ break;
+ }
+ // Retrieve the task from local task pool
+ if (t || (slot.is_task_pool_published() && (t = slot.get_task(ed, isolation)))) {
+ __TBB_ASSERT(ed.original_slot == m_thread_data->my_arena_index, NULL);
+ ed.context = task_accessor::context(*t);
+ ed.isolation = task_accessor::isolation(*t);
+ continue;
+ }
+ // Retrieve the task from global sources
+ t = receive_or_steal_task<ITTPossible>(
+ *m_thread_data, ed, waiter, isolation, dl_guard.old_properties.fifo_tasks_allowed,
+ critical_allowed
+ );
+ } while (t != nullptr); // main dispatch loop
+ break; // Exit exception loop;
+ } catch (...) {
+ if (global_control::active_value(global_control::terminate_on_exception) == 1) {
+ do_throw_noexcept([] { throw; });
+ }
+ if (ed.context->cancel_group_execution()) {
+ /* We are the first to signal cancellation, so store the exception that caused it. */
+ ed.context->my_exception = tbb_exception_ptr::allocate();
+ }
+ }
+ } // Infinite exception loop
+ __TBB_ASSERT(t == nullptr, nullptr);
+
+
+#if __TBB_RESUMABLE_TASKS
+ if (dl_guard.old_properties.outermost) {
+ recall_point();
+ }
+#endif /* __TBB_RESUMABLE_TASKS */
+
+ return nullptr;
+}
+
+#if __TBB_RESUMABLE_TASKS
+inline void task_dispatcher::recall_point() {
+ if (this != &m_thread_data->my_arena_slot->default_task_dispatcher()) {
+ __TBB_ASSERT(m_suspend_point != nullptr, nullptr);
+ __TBB_ASSERT(m_suspend_point->m_is_owner_recalled.load(std::memory_order_relaxed) == false, nullptr);
+ d1::suspend([](suspend_point_type* sp) {
+ sp->m_is_owner_recalled.store(true, std::memory_order_release);
+ auto is_related_suspend_point = [sp] (extended_context context) {
+ std::uintptr_t sp_addr = std::uintptr_t(sp);
+ return sp_addr == context.my_uniq_addr;
+ };
+ sp->m_arena->my_market->get_wait_list().notify(is_related_suspend_point);
+ });
+
+ if (m_thread_data->my_inbox.is_idle_state(true)) {
+ m_thread_data->my_inbox.set_is_idle(false);
+ }
+ }
+}
+#endif /* __TBB_RESUMABLE_TASKS */
+
+#if __TBB_PREVIEW_CRITICAL_TASKS
+inline d1::task* task_dispatcher::get_critical_task(d1::task* t, execution_data_ext& ed, isolation_type isolation, bool critical_allowed) {
+ __TBB_ASSERT( critical_allowed || !m_properties.critical_task_allowed, nullptr );
+
+ if (!critical_allowed) {
+        // The stack is already in the process of critical path execution. It cannot take more
+        // critical work until the current critical task is finished.
+ __TBB_ASSERT(!m_properties.critical_task_allowed, nullptr);
+ return t;
+ }
+
+ assert_pointers_valid(m_thread_data, m_thread_data->my_arena, m_thread_data->my_arena_slot);
+ thread_data& td = *m_thread_data;
+ arena& a = *td.my_arena;
+ arena_slot& slot = *td.my_arena_slot;
+
+ d1::task* crit_t = a.get_critical_task(slot.hint_for_critical_stream, isolation);
+ if (crit_t != nullptr) {
+ assert_task_valid(crit_t);
+ if (t != nullptr) {
+ assert_pointer_valid</*alignment = */alignof(void*)>(ed.context);
+ r1::spawn(*t, *ed.context);
+ }
+ ed.context = task_accessor::context(*crit_t);
+ ed.isolation = task_accessor::isolation(*crit_t);
+
+ // We cannot execute more than one critical task on the same stack.
+ // In other words, we prevent nested critical tasks.
+ m_properties.critical_task_allowed = false;
+
+ // TODO: add a test that the observer is called when critical task is taken.
+ a.my_observers.notify_entry_observers(td.my_last_observer, td.my_is_worker);
+ t = crit_t;
+ } else {
+        // Was unable to find critical work in the queue. Allow inspecting the queue in nested
+        // invocations. This handles the case when a critical task has just been completed.
+ m_properties.critical_task_allowed = true;
+ }
+ return t;
+}
+#else
+inline d1::task* task_dispatcher::get_critical_task(d1::task* t, execution_data_ext&, isolation_type, bool /*critical_allowed*/) {
+ return t;
+}
+#endif
+
+inline d1::task* task_dispatcher::get_mailbox_task(mail_inbox& my_inbox, execution_data_ext& ed, isolation_type isolation) {
+ while (task_proxy* const tp = my_inbox.pop(isolation)) {
+ if (d1::task* result = tp->extract_task<task_proxy::mailbox_bit>()) {
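+            // The original slot is set to a value that cannot match any real arena slot; this
+            // appears to act as a sentinel marking the task as obtained through the mailbox
+            // (affinity) mechanism rather than from a task pool (assumption).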
+ ed.original_slot = (unsigned short)(-2);
+ ed.affinity_slot = ed.task_disp->m_thread_data->my_arena_index;
+ return result;
+ }
+ // We have exclusive access to the proxy, and can destroy it.
+ tp->allocator.delete_object(tp, ed);
+ }
+ return NULL;
+}
+
+template <typename Waiter>
+d1::task* task_dispatcher::local_wait_for_all(d1::task* t, Waiter& waiter) {
+ if (governor::is_itt_present()) {
+ return local_wait_for_all</*ITTPossible = */ true>(t, waiter);
+ } else {
+ return local_wait_for_all</*ITTPossible = */ false>(t, waiter);
+ }
+}
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
+
+#endif // _TBB_task_dispatcher_H
+
diff --git a/contrib/libs/tbb/src/tbb/task_group_context.cpp b/contrib/libs/tbb/src/tbb/task_group_context.cpp
new file mode 100644
index 0000000000..3c296648ec
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/task_group_context.cpp
@@ -0,0 +1,493 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "oneapi/tbb/detail/_config.h"
+#include "oneapi/tbb/tbb_allocator.h"
+#include "oneapi/tbb/task_group.h"
+#include "governor.h"
+#include "thread_data.h"
+#include "scheduler_common.h"
+#include "itt_notify.h"
+#include "task_dispatcher.h"
+
+#include <type_traits>
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+//------------------------------------------------------------------------
+// tbb_exception_ptr
+//------------------------------------------------------------------------
+tbb_exception_ptr* tbb_exception_ptr::allocate() noexcept {
+ tbb_exception_ptr* eptr = (tbb_exception_ptr*)allocate_memory(sizeof(tbb_exception_ptr));
+ return eptr ? new (eptr) tbb_exception_ptr(std::current_exception()) : nullptr;
+}
+
+void tbb_exception_ptr::destroy() noexcept {
+ this->~tbb_exception_ptr();
+ deallocate_memory(this);
+}
+
+void tbb_exception_ptr::throw_self() {
+ if (governor::rethrow_exception_broken()) fix_broken_rethrow();
+ std::rethrow_exception(my_ptr);
+}
+
+//------------------------------------------------------------------------
+// task_group_context
+//------------------------------------------------------------------------
+
+void task_group_context_impl::destroy(d1::task_group_context& ctx) {
+ __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
+
+ auto ctx_lifetime_state = ctx.my_lifetime_state.load(std::memory_order_relaxed);
+ __TBB_ASSERT(ctx_lifetime_state != d1::task_group_context::lifetime_state::locked, nullptr);
+
+ if (ctx_lifetime_state == d1::task_group_context::lifetime_state::bound) {
+ // The owner can be destroyed at any moment. Access the associate data with caution.
+ thread_data* owner = ctx.my_owner.load(std::memory_order_relaxed);
+ if (governor::is_thread_data_set(owner)) {
+ thread_data::context_list_state& cls = owner->my_context_list_state;
+ // We are the owner, so cls is valid.
+ // Local update of the context list
+ std::uintptr_t local_count_snapshot = cls.epoch.load(std::memory_order_relaxed);
+            // The sequentially consistent store prevents the load of the nonlocal update flag
+            // from being hoisted before the store to the local update flag.
+ cls.local_update = 1;
+ if (cls.nonlocal_update.load(std::memory_order_relaxed)) {
+ spin_mutex::scoped_lock lock(cls.mutex);
+ ctx.my_node.remove_relaxed();
+ cls.local_update.store(0, std::memory_order_relaxed);
+ } else {
+ ctx.my_node.remove_relaxed();
+                // A release fence is necessary so that the update of our neighbors in
+                // the context list is committed before a possible concurrent destroyer
+                // proceeds once the local update flag is reset by the following store.
+ cls.local_update.store(0, std::memory_order_release);
+ if (local_count_snapshot != the_context_state_propagation_epoch.load(std::memory_order_relaxed)) {
+ // Another thread was propagating cancellation request when we removed
+ // ourselves from the list. We must ensure that it is not accessing us
+ // when this destructor finishes. We'll be able to acquire the lock
+ // below only after the other thread finishes with us.
+ spin_mutex::scoped_lock lock(cls.mutex);
+ }
+ }
+ } else {
+ d1::task_group_context::lifetime_state expected = d1::task_group_context::lifetime_state::bound;
+ if (
+#if defined(__INTEL_COMPILER) && __INTEL_COMPILER <= 1910
+ !((std::atomic<typename std::underlying_type<d1::task_group_context::lifetime_state>::type>&)ctx.my_lifetime_state).compare_exchange_strong(
+ (typename std::underlying_type<d1::task_group_context::lifetime_state>::type&)expected,
+ (typename std::underlying_type<d1::task_group_context::lifetime_state>::type)d1::task_group_context::lifetime_state::locked)
+#else
+ !ctx.my_lifetime_state.compare_exchange_strong(expected, d1::task_group_context::lifetime_state::locked)
+#endif
+ ) {
+ __TBB_ASSERT(expected == d1::task_group_context::lifetime_state::detached, nullptr);
+ // The "owner" local variable can be a dangling pointer here. Do not access it.
+ owner = nullptr;
+ spin_wait_until_eq(ctx.my_owner, nullptr);
+ // It is unsafe to remove the node because its neighbors might be already destroyed.
+ // TODO: reconsider the logic.
+ // ctx.my_node.remove_relaxed();
+ }
+ else {
+ __TBB_ASSERT(expected == d1::task_group_context::lifetime_state::bound, nullptr);
+ __TBB_ASSERT(ctx.my_owner.load(std::memory_order_relaxed) != nullptr, nullptr);
+ thread_data::context_list_state& cls = owner->my_context_list_state;
+ __TBB_ASSERT(is_alive(cls.nonlocal_update.load(std::memory_order_relaxed)), "The owner should be alive.");
+
+ ++cls.nonlocal_update;
+ ctx.my_lifetime_state.store(d1::task_group_context::lifetime_state::dying, std::memory_order_release);
+ spin_wait_until_eq(cls.local_update, 0u);
+ {
+ spin_mutex::scoped_lock lock(cls.mutex);
+ ctx.my_node.remove_relaxed();
+ }
+ --cls.nonlocal_update;
+ }
+ }
+ }
+
+ if (ctx_lifetime_state == d1::task_group_context::lifetime_state::detached) {
+ spin_wait_until_eq(ctx.my_owner, nullptr);
+ }
+
+ d1::cpu_ctl_env* ctl = reinterpret_cast<d1::cpu_ctl_env*>(&ctx.my_cpu_ctl_env);
+#if _MSC_VER && _MSC_VER <= 1900 && !__INTEL_COMPILER
+ suppress_unused_warning(ctl);
+#endif
+ ctl->~cpu_ctl_env();
+
+ if (ctx.my_exception)
+ ctx.my_exception->destroy();
+ ITT_STACK_DESTROY(ctx.my_itt_caller);
+
+    poison_pointer(ctx.my_parent);
+ poison_pointer(ctx.my_owner);
+ poison_pointer(ctx.my_node.next);
+ poison_pointer(ctx.my_node.prev);
+ poison_pointer(ctx.my_exception);
+ poison_pointer(ctx.my_itt_caller);
+}
+
+void task_group_context_impl::initialize(d1::task_group_context& ctx) {
+ ITT_TASK_GROUP(&ctx, ctx.my_name, nullptr);
+
+ ctx.my_cpu_ctl_env = 0;
+ ctx.my_cancellation_requested = 0;
+ ctx.my_state.store(0, std::memory_order_relaxed);
+ // Set the created state to bound at the first usage.
+ ctx.my_lifetime_state.store(d1::task_group_context::lifetime_state::created, std::memory_order_relaxed);
+ ctx.my_parent = nullptr;
+ ctx.my_owner = nullptr;
+    ctx.my_node.next.store(nullptr, std::memory_order_relaxed);
+    ctx.my_node.prev.store(nullptr, std::memory_order_relaxed);
+ ctx.my_exception = nullptr;
+ ctx.my_itt_caller = nullptr;
+
+ static_assert(sizeof(d1::cpu_ctl_env) <= sizeof(ctx.my_cpu_ctl_env), "FPU settings storage does not fit to uint64_t");
+ d1::cpu_ctl_env* ctl = new (&ctx.my_cpu_ctl_env) d1::cpu_ctl_env;
+ if (ctx.my_traits.fp_settings)
+ ctl->get_env();
+}
+
+void task_group_context_impl::register_with(d1::task_group_context& ctx, thread_data* td) {
+ __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
+ __TBB_ASSERT(td, NULL);
+ ctx.my_owner.store(td, std::memory_order_relaxed);
+ thread_data::context_list_state& cls = td->my_context_list_state;
+    // State propagation logic assumes new contexts are bound to the head of the list.
+ ctx.my_node.prev.store(&cls.head, std::memory_order_relaxed);
+ // Notify threads that may be concurrently destroying contexts registered
+ // in this scheduler's list that local list update is underway.
+ // Prevent load of global propagation epoch counter from being hoisted before
+ // speculative stores above, as well as load of nonlocal update flag from
+ // being hoisted before the store to local update flag.
+ cls.local_update = 1;
+ // Finalize local context list update
+ if (cls.nonlocal_update.load(std::memory_order_relaxed)) {
+ spin_mutex::scoped_lock lock(cls.mutex);
+ d1::context_list_node* head_next = cls.head.next.load(std::memory_order_relaxed);
+ head_next->prev.store(&ctx.my_node, std::memory_order_relaxed);
+ ctx.my_node.next.store(head_next, std::memory_order_relaxed);
+ cls.local_update.store(0, std::memory_order_relaxed);
+ cls.head.next.store(&ctx.my_node, std::memory_order_relaxed);
+ } else {
+ d1::context_list_node* head_next = cls.head.next.load(std::memory_order_relaxed);
+ head_next->prev.store(&ctx.my_node, std::memory_order_relaxed);
+ ctx.my_node.next.store(head_next, std::memory_order_relaxed);
+ cls.local_update.store(0, std::memory_order_release);
+ // Thread-local list of contexts allows concurrent traversal by another thread
+ // while propagating state change. To ensure visibility of ctx.my_node's members
+ // to the concurrently traversing thread, the list's head is updated by means
+ // of store-with-release.
+ cls.head.next.store(&ctx.my_node, std::memory_order_release);
+ }
+}
+
+void task_group_context_impl::bind_to_impl(d1::task_group_context& ctx, thread_data* td) {
+ __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
+ __TBB_ASSERT(ctx.my_lifetime_state.load(std::memory_order_relaxed) == d1::task_group_context::lifetime_state::locked, "The context can be bound only under the lock.");
+ __TBB_ASSERT(!ctx.my_parent, "Parent is set before initial binding");
+
+ ctx.my_parent = td->my_task_dispatcher->m_execute_data_ext.context;
+ __TBB_ASSERT(ctx.my_parent, NULL);
+
+ // Inherit FPU settings only if the context has not captured FPU settings yet.
+ if (!ctx.my_traits.fp_settings)
+ copy_fp_settings(ctx, *ctx.my_parent);
+
+    // The condition below prevents unnecessary thrashing of the parent context's cache line
+ if (ctx.my_parent->my_state.load(std::memory_order_relaxed) != d1::task_group_context::may_have_children) {
+ ctx.my_parent->my_state.store(d1::task_group_context::may_have_children, std::memory_order_relaxed); // full fence is below
+ }
+ if (ctx.my_parent->my_parent) {
+ // Even if this context were made accessible for state change propagation
+ // (by placing store_with_release(td->my_context_list_state.head.my_next, &ctx.my_node)
+ // above), it still could be missed if state propagation from a grand-ancestor
+ // was underway concurrently with binding.
+        // Speculative propagation from the parent, together with epoch counters that
+        // detect the possibility of such a race, allows us to avoid taking locks when
+        // there is no contention.
+
+ // Acquire fence is necessary to prevent reordering subsequent speculative
+ // loads of parent state data out of the scope where epoch counters comparison
+ // can reliably validate it.
+ uintptr_t local_count_snapshot = ctx.my_parent->my_owner.load(std::memory_order_relaxed)->my_context_list_state.epoch.load(std::memory_order_acquire);
+ // Speculative propagation of parent's state. The speculation will be
+ // validated by the epoch counters check further on.
+ ctx.my_cancellation_requested.store(ctx.my_parent->my_cancellation_requested.load(std::memory_order_relaxed), std::memory_order_relaxed);
+ register_with(ctx, td); // Issues full fence
+
+ // If no state propagation was detected by the following condition, the above
+ // full fence guarantees that the parent had correct state during speculative
+ // propagation before the fence. Otherwise the propagation from parent is
+ // repeated under the lock.
+ if (local_count_snapshot != the_context_state_propagation_epoch.load(std::memory_order_relaxed)) {
+ // Another thread may be propagating state change right now. So resort to lock.
+ context_state_propagation_mutex_type::scoped_lock lock(the_context_state_propagation_mutex);
+ ctx.my_cancellation_requested.store(ctx.my_parent->my_cancellation_requested.load(std::memory_order_relaxed), std::memory_order_relaxed);
+ }
+ } else {
+ register_with(ctx, td); // Issues full fence
+ // As we do not have grand-ancestors, concurrent state propagation (if any)
+ // may originate only from the parent context, and thus it is safe to directly
+ // copy the state from it.
+ ctx.my_cancellation_requested.store(ctx.my_parent->my_cancellation_requested.load(std::memory_order_relaxed), std::memory_order_relaxed);
+ }
+
+ ctx.my_lifetime_state.store(d1::task_group_context::lifetime_state::bound, std::memory_order_release);
+}
+
+void task_group_context_impl::bind_to(d1::task_group_context& ctx, thread_data* td) {
+ __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
+ d1::task_group_context::lifetime_state state = ctx.my_lifetime_state.load(std::memory_order_acquire);
+ if (state <= d1::task_group_context::lifetime_state::locked) {
+ if (state == d1::task_group_context::lifetime_state::created &&
+#if defined(__INTEL_COMPILER) && __INTEL_COMPILER <= 1910
+ ((std::atomic<typename std::underlying_type<d1::task_group_context::lifetime_state>::type>&)ctx.my_lifetime_state).compare_exchange_strong(
+ (typename std::underlying_type<d1::task_group_context::lifetime_state>::type&)state,
+ (typename std::underlying_type<d1::task_group_context::lifetime_state>::type)d1::task_group_context::lifetime_state::locked)
+#else
+ ctx.my_lifetime_state.compare_exchange_strong(state, d1::task_group_context::lifetime_state::locked)
+#endif
+ ) {
+ // If we are in the outermost task dispatch loop of an external thread, then
+ // there is nothing to bind this context to, and we skip the binding part
+ // treating the context as isolated.
+ __TBB_ASSERT(td->my_task_dispatcher->m_execute_data_ext.context != nullptr, nullptr);
+ if (td->my_task_dispatcher->m_execute_data_ext.context == td->my_arena->my_default_ctx || !ctx.my_traits.bound) {
+ if (!ctx.my_traits.fp_settings) {
+ copy_fp_settings(ctx, *td->my_arena->my_default_ctx);
+ }
+ ctx.my_lifetime_state.store(d1::task_group_context::lifetime_state::isolated, std::memory_order_release);
+ } else {
+ bind_to_impl(ctx, td);
+ }
+ ITT_STACK_CREATE(ctx.my_itt_caller);
+ }
+ spin_wait_while_eq(ctx.my_lifetime_state, d1::task_group_context::lifetime_state::locked);
+ }
+ __TBB_ASSERT(ctx.my_lifetime_state.load(std::memory_order_relaxed) != d1::task_group_context::lifetime_state::created, NULL);
+ __TBB_ASSERT(ctx.my_lifetime_state.load(std::memory_order_relaxed) != d1::task_group_context::lifetime_state::locked, NULL);
+}
+
+template <typename T>
+void task_group_context_impl::propagate_task_group_state(d1::task_group_context& ctx, std::atomic<T> d1::task_group_context::* mptr_state, d1::task_group_context& src, T new_state) {
+ __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
+ if ((ctx.*mptr_state).load(std::memory_order_relaxed) == new_state) {
+ // Nothing to do, whether descending from "src" or not, so no need to scan.
+ // Hopefully this happens often thanks to earlier invocations.
+ // This optimization is enabled by LIFO order in the context lists:
+ // - new contexts are bound to the beginning of lists;
+ // - descendants are newer than ancestors;
+ // - earlier invocations are therefore likely to "paint" long chains.
+ } else if (&ctx == &src) {
+ // This clause is disjunct from the traversal below, which skips src entirely.
+ // Note that src.*mptr_state is not necessarily still equal to new_state (another thread may have changed it again).
+ // Such interference is probably not frequent enough to aim for optimisation by writing new_state again (to make the other thread back down).
+ // Letting the other thread prevail may also be fairer.
+ } else {
+ for (d1::task_group_context* ancestor = ctx.my_parent; ancestor != NULL; ancestor = ancestor->my_parent) {
+ if (ancestor == &src) {
+ for (d1::task_group_context* c = &ctx; c != ancestor; c = c->my_parent)
+ (c->*mptr_state).store(new_state, std::memory_order_relaxed);
+ break;
+ }
+ }
+ }
+}
+
+bool task_group_context_impl::cancel_group_execution(d1::task_group_context& ctx) {
+ __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
+ __TBB_ASSERT(ctx.my_cancellation_requested.load(std::memory_order_relaxed) <= 1, "The cancellation state can be either 0 or 1");
+ if (ctx.my_cancellation_requested.load(std::memory_order_relaxed) || ctx.my_cancellation_requested.exchange(1)) {
+ // This task group and any descendants have already been canceled.
+ // (A newly added descendant would inherit its parent's ctx.my_cancellation_requested,
+ // not missing out on any cancellation still being propagated, and a context cannot be uncanceled.)
+ return false;
+ }
+ governor::get_thread_data()->my_arena->my_market->propagate_task_group_state(&d1::task_group_context::my_cancellation_requested, ctx, uint32_t(1));
+ return true;
+}
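+
+/* A minimal usage sketch (not part of the library) of how this cancellation path is typically
+   reached from user code, assuming the public tbb::task_group_context interface; should_stop
+   is a hypothetical user predicate:
+
+       #include "oneapi/tbb/parallel_for.h"
+       #include "oneapi/tbb/task_group.h"
+
+       tbb::task_group_context ctx;
+       tbb::parallel_for(0, 1000000, [&ctx](int i) {
+           if (should_stop(i))
+               ctx.cancel_group_execution();   // the first successful caller returns true
+       }, ctx);
+       bool cancelled = ctx.is_group_execution_cancelled();
+
+   Cancellation is propagated to descendant contexts bound to ctx, which is what
+   market::propagate_task_group_state below implements. */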
+
+bool task_group_context_impl::is_group_execution_cancelled(const d1::task_group_context& ctx) {
+ return ctx.my_cancellation_requested.load(std::memory_order_relaxed) != 0;
+}
+
+// IMPORTANT: It is assumed that this method is not used concurrently!
+void task_group_context_impl::reset(d1::task_group_context& ctx) {
+ __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
+ //! TODO: Add assertion that this context does not have children
+ // No fences are necessary since this context can be accessed from another thread
+ // only after stealing happened (which means necessary fences were used).
+ if (ctx.my_exception) {
+ ctx.my_exception->destroy();
+ ctx.my_exception = NULL;
+ }
+ ctx.my_cancellation_requested = 0;
+}
+
+// IMPORTANT: It is assumed that this method is not used concurrently!
+void task_group_context_impl::capture_fp_settings(d1::task_group_context& ctx) {
+ __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
+ //! TODO: Add assertion that this context does not have children
+ // No fences are necessary since this context can be accessed from another thread
+ // only after stealing happened (which means necessary fences were used).
+ d1::cpu_ctl_env* ctl = reinterpret_cast<d1::cpu_ctl_env*>(&ctx.my_cpu_ctl_env);
+ if (!ctx.my_traits.fp_settings) {
+ ctl = new (&ctx.my_cpu_ctl_env) d1::cpu_ctl_env;
+ ctx.my_traits.fp_settings = true;
+ }
+ ctl->get_env();
+}
+
+void task_group_context_impl::copy_fp_settings(d1::task_group_context& ctx, const d1::task_group_context& src) {
+ __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
+ __TBB_ASSERT(!ctx.my_traits.fp_settings, "The context already has FPU settings.");
+ __TBB_ASSERT(src.my_traits.fp_settings, "The source context does not have FPU settings.");
+
+ const d1::cpu_ctl_env* src_ctl = reinterpret_cast<const d1::cpu_ctl_env*>(&src.my_cpu_ctl_env);
+ new (&ctx.my_cpu_ctl_env) d1::cpu_ctl_env(*src_ctl);
+ ctx.my_traits.fp_settings = true;
+}
+
+template <typename T>
+void thread_data::propagate_task_group_state(std::atomic<T> d1::task_group_context::* mptr_state, d1::task_group_context& src, T new_state) {
+ spin_mutex::scoped_lock lock(my_context_list_state.mutex);
+ // Acquire fence is necessary to ensure that the subsequent node->my_next load
+ // returned the correct value in case it was just inserted in another thread.
+ // The fence also ensures visibility of the correct ctx.my_parent value.
+ d1::context_list_node* node = my_context_list_state.head.next.load(std::memory_order_acquire);
+ while (node != &my_context_list_state.head) {
+ d1::task_group_context& ctx = __TBB_get_object_ref(d1::task_group_context, my_node, node);
+ if ((ctx.*mptr_state).load(std::memory_order_relaxed) != new_state)
+ task_group_context_impl::propagate_task_group_state(ctx, mptr_state, src, new_state);
+ node = node->next.load(std::memory_order_relaxed);
+ }
+ // Sync up local propagation epoch with the global one. Release fence prevents
+ // reordering of possible store to *mptr_state after the sync point.
+ my_context_list_state.epoch.store(the_context_state_propagation_epoch.load(std::memory_order_relaxed), std::memory_order_release);
+}
+
+template <typename T>
+bool market::propagate_task_group_state(std::atomic<T> d1::task_group_context::* mptr_state, d1::task_group_context& src, T new_state) {
+ if (src.my_state.load(std::memory_order_relaxed) != d1::task_group_context::may_have_children)
+ return true;
+ // The whole propagation algorithm is under the lock in order to ensure correctness
+ // in case of concurrent state changes at the different levels of the context tree.
+    // See the comment at the bottom of this file.
+ context_state_propagation_mutex_type::scoped_lock lock(the_context_state_propagation_mutex);
+ if ((src.*mptr_state).load(std::memory_order_relaxed) != new_state)
+ // Another thread has concurrently changed the state. Back down.
+ return false;
+ // Advance global state propagation epoch
+ ++the_context_state_propagation_epoch;
+ // Propagate to all workers and external threads and sync up their local epochs with the global one
+ unsigned num_workers = my_first_unused_worker_idx;
+ for (unsigned i = 0; i < num_workers; ++i) {
+ thread_data* td = my_workers[i];
+ // If the worker is only about to be registered, skip it.
+ if (td)
+ td->propagate_task_group_state(mptr_state, src, new_state);
+ }
+ // Propagate to all external threads
+ // The whole propagation sequence is locked, thus no contention is expected
+ for (thread_data_list_type::iterator it = my_masters.begin(); it != my_masters.end(); it++)
+ it->propagate_task_group_state(mptr_state, src, new_state);
+ return true;
+}
+
+/*
+ Comments:
+
+1. The premise of the cancellation support implementation is that cancellations are
+ not part of the hot path of the program execution. Therefore all changes in its
+ implementation in order to reduce the overhead of the cancellation control flow
+ should be done only in ways that do not increase overhead of the normal execution.
+
+ In general, contexts are used by all threads and their descendants are created in
+ different threads as well. In order to minimize impact of the cross-thread tree
+ maintenance (first of all because of the synchronization), the tree of contexts
+ is split into pieces, each of which is handled by a single thread. Such pieces
+ are represented as lists of contexts, members of which are contexts that were
+ bound to their parents in the given thread.
+
+ The context tree maintenance and cancellation propagation algorithms are designed
+ in such a manner that cross-thread access to a context list will take place only
+ when cancellation signal is sent (by user or when an exception happens), and
+ synchronization is necessary only then. Thus the normal execution flow (without
+ exceptions and cancellation) remains free from any synchronization done on
+ behalf of exception handling and cancellation support.
+
+2. Consider parallel cancellations at the different levels of the context tree:
+
+ Ctx1 <- Cancelled by Thread1 |- Thread2 started processing
+ | |
+ Ctx2 |- Thread1 started processing
+ | T1 |- Thread2 finishes and syncs up local counters
+ Ctx3 <- Cancelled by Thread2 |
+ | |- Ctx5 is bound to Ctx2
+ Ctx4 |
+ T2 |- Thread1 reaches Ctx2
+
+    The thread propagating each cancellation increments the global counter. However, the
+    thread propagating the cancellation from the outermost context (Thread1) may be the
+    last to finish, which means that the local counters may be synchronized earlier (by
+    Thread2, at Time1) than the cancellation is propagated into Ctx2 (at Time2). If a new
+    context (Ctx5) is created and bound to Ctx2 between Time1 and Time2, checking only its
+    parent (Ctx2) may result in the cancellation request being lost.
+
+ This issue is solved by doing the whole propagation under the lock.
+
+ If we need more concurrency while processing parallel cancellations, we could try
+ the following modification of the propagation algorithm:
+
+ advance global counter and remember it
+ for each thread:
+ scan thread's list of contexts
+ for each thread:
+ sync up its local counter only if the global counter has not been changed
+
+ However this version of the algorithm requires more analysis and verification.
+*/
+
+void __TBB_EXPORTED_FUNC initialize(d1::task_group_context& ctx) {
+ task_group_context_impl::initialize(ctx);
+}
+void __TBB_EXPORTED_FUNC destroy(d1::task_group_context& ctx) {
+ task_group_context_impl::destroy(ctx);
+}
+void __TBB_EXPORTED_FUNC reset(d1::task_group_context& ctx) {
+ task_group_context_impl::reset(ctx);
+}
+bool __TBB_EXPORTED_FUNC cancel_group_execution(d1::task_group_context& ctx) {
+ return task_group_context_impl::cancel_group_execution(ctx);
+}
+bool __TBB_EXPORTED_FUNC is_group_execution_cancelled(d1::task_group_context& ctx) {
+ return task_group_context_impl::is_group_execution_cancelled(ctx);
+}
+void __TBB_EXPORTED_FUNC capture_fp_settings(d1::task_group_context& ctx) {
+ task_group_context_impl::capture_fp_settings(ctx);
+}
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
+
diff --git a/contrib/libs/tbb/src/tbb/task_stream.h b/contrib/libs/tbb/src/tbb/task_stream.h
new file mode 100644
index 0000000000..f32ef94e80
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/task_stream.h
@@ -0,0 +1,288 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef _TBB_task_stream_H
+#define _TBB_task_stream_H
+
+//! This file is a possible future replacement for the task_stream class implemented in
+//! task_stream.h. It refactors the code and extends task_stream capabilities by moving lane
+//! management to the caller side of the operations. Although the new implementation is not
+//! expected to affect the performance of the original task stream, no analysis of this was
+//! made at the time it was developed. In addition, it is not yet clear that this container
+//! would be suitable for critical tasks, given the linear time complexity of its operations.
+
+#include "oneapi/tbb/detail/_utils.h"
+
+#include "oneapi/tbb/spin_mutex.h"
+#include "oneapi/tbb/cache_aligned_allocator.h"
+
+#include "scheduler_common.h"
+#include "misc.h" // for FastRandom
+
+#include <deque>
+#include <climits>
+#include <atomic>
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+//! Essentially, this is just a pair of a queue and a mutex to protect the queue.
+/** The reason std::pair is not used is that the code would look less clean
+ if field names were replaced with 'first' and 'second'. **/
+template< typename T, typename mutex_t >
+struct alignas(max_nfs_size) queue_and_mutex {
+ typedef std::deque< T, cache_aligned_allocator<T> > queue_base_t;
+
+ queue_base_t my_queue{};
+ mutex_t my_mutex{};
+};
+
+using population_t = uintptr_t;
+const population_t one = 1;
+
+inline void set_one_bit( std::atomic<population_t>& dest, int pos ) {
+ __TBB_ASSERT( pos>=0, NULL );
+ __TBB_ASSERT( pos<int(sizeof(population_t)*CHAR_BIT), NULL );
+ dest.fetch_or( one<<pos );
+}
+
+inline void clear_one_bit( std::atomic<population_t>& dest, int pos ) {
+ __TBB_ASSERT( pos>=0, NULL );
+ __TBB_ASSERT( pos<int(sizeof(population_t)*CHAR_BIT), NULL );
+ dest.fetch_and( ~(one<<pos) );
+}
+
+inline bool is_bit_set( population_t val, int pos ) {
+ __TBB_ASSERT( pos>=0, NULL );
+ __TBB_ASSERT( pos<int(sizeof(population_t)*CHAR_BIT), NULL );
+ return (val & (one<<pos)) != 0;
+}
+
+struct random_lane_selector :
+#if __INTEL_COMPILER == 1110 || __INTEL_COMPILER == 1500
+ no_assign
+#else
+ no_copy
+#endif
+{
+ random_lane_selector( FastRandom& random ) : my_random( random ) {}
+ unsigned operator()( unsigned out_of ) const {
+ __TBB_ASSERT( ((out_of-1) & out_of) == 0, "number of lanes is not a power of two." );
+ return my_random.get() & (out_of-1);
+ }
+private:
+ FastRandom& my_random;
+};
+
+struct lane_selector_base :
+#if __INTEL_COMPILER == 1110 || __INTEL_COMPILER == 1500
+ no_assign
+#else
+ no_copy
+#endif
+{
+ unsigned& my_previous;
+ lane_selector_base( unsigned& previous ) : my_previous( previous ) {}
+};
+
+struct subsequent_lane_selector : lane_selector_base {
+ subsequent_lane_selector( unsigned& previous ) : lane_selector_base( previous ) {}
+ unsigned operator()( unsigned out_of ) const {
+ __TBB_ASSERT( ((out_of-1) & out_of) == 0, "number of lanes is not a power of two." );
+ return (++my_previous &= out_of-1);
+ }
+};
+
+struct preceding_lane_selector : lane_selector_base {
+ preceding_lane_selector( unsigned& previous ) : lane_selector_base( previous ) {}
+ unsigned operator()( unsigned out_of ) const {
+ __TBB_ASSERT( ((out_of-1) & out_of) == 0, "number of lanes is not a power of two." );
+ return (--my_previous &= (out_of-1));
+ }
+};
+
+//! Specifies from which side of the underlying container elements are retrieved. The method must be
+//! called with the corresponding mutex locked.
+template<task_stream_accessor_type accessor>
+class task_stream_accessor : no_copy {
+protected:
+ using lane_t = queue_and_mutex <d1::task*, spin_mutex>;
+ d1::task* get_item( lane_t::queue_base_t& queue ) {
+ d1::task* result = queue.front();
+ queue.pop_front();
+ return result;
+ }
+};
+
+template<>
+class task_stream_accessor< back_nonnull_accessor > : no_copy {
+protected:
+ using lane_t = queue_and_mutex <d1::task*, spin_mutex>;
+ d1::task* get_item( lane_t::queue_base_t& queue ) {
+ d1::task* result = nullptr;
+ __TBB_ASSERT(!queue.empty(), nullptr);
+ // An isolated task can put zeros in the queue; see look_specific.
+ do {
+ result = queue.back();
+ queue.pop_back();
+ } while ( !result && !queue.empty() );
+
+ __TBB_ASSERT_RELEASE(result, nullptr);
+ return result;
+ }
+};
+
+//! The container for "fairness-oriented" aka "enqueued" tasks.
+template<task_stream_accessor_type accessor>
+class task_stream : public task_stream_accessor< accessor > {
+ using lane_t = typename task_stream_accessor<accessor>::lane_t;
+ std::atomic<population_t> population{};
+ lane_t* lanes{nullptr};
+ unsigned N{};
+
+public:
+ task_stream() = default;
+
+ void initialize( unsigned n_lanes ) {
+ const unsigned max_lanes = sizeof(population_t) * CHAR_BIT;
+
+ N = n_lanes >= max_lanes ? max_lanes : n_lanes > 2 ? 1 << (tbb::detail::log2(n_lanes - 1) + 1) : 2;
+ __TBB_ASSERT( N == max_lanes || (N >= n_lanes && ((N - 1) & N) == 0), "number of lanes miscalculated" );
+ __TBB_ASSERT( N <= sizeof(population_t) * CHAR_BIT, NULL );
+ lanes = static_cast<lane_t*>(cache_aligned_allocate(sizeof(lane_t) * N));
+ for (unsigned i = 0; i < N; ++i) {
+ new (lanes + i) lane_t;
+ }
+ __TBB_ASSERT( !population.load(std::memory_order_relaxed), NULL );
+ }
+
+ ~task_stream() {
+ if (lanes) {
+ for (unsigned i = 0; i < N; ++i) {
+ lanes[i].~lane_t();
+ }
+ cache_aligned_deallocate(lanes);
+ }
+ }
+
+ //! Push a task into a lane. Lane selection is performed by the passed functor.
+ template<typename lane_selector_t>
+ void push(d1::task* source, const lane_selector_t& next_lane ) {
+ bool succeed = false;
+ unsigned lane = 0;
+ do {
+ lane = next_lane( /*out_of=*/N );
+ __TBB_ASSERT( lane < N, "Incorrect lane index." );
+ } while( ! (succeed = try_push( source, lane )) );
+ }
+
+ //! Try finding and popping a task using the passed functor for lane selection. The last used
+ //! lane is updated inside the lane selector.
+ template<typename lane_selector_t>
+ d1::task* pop( const lane_selector_t& next_lane ) {
+ d1::task* popped = NULL;
+ unsigned lane = 0;
+ do {
+ lane = next_lane( /*out_of=*/N );
+ __TBB_ASSERT( lane < N, "Incorrect lane index." );
+ } while( !empty() && !(popped = try_pop( lane )) );
+ return popped;
+ }
+
+ //! Try finding and popping a related task.
+ d1::task* pop_specific( unsigned& last_used_lane, isolation_type isolation ) {
+ d1::task* result = NULL;
+ // Lane selection is round-robin in backward direction.
+ unsigned idx = last_used_lane & (N-1);
+ do {
+ if( is_bit_set( population.load(std::memory_order_relaxed), idx ) ) {
+ lane_t& lane = lanes[idx];
+ spin_mutex::scoped_lock lock;
+ if( lock.try_acquire(lane.my_mutex) && !lane.my_queue.empty() ) {
+ result = look_specific( lane.my_queue, isolation );
+ if( lane.my_queue.empty() )
+ clear_one_bit( population, idx );
+ if( result )
+ break;
+ }
+ }
+ idx=(idx-1)&(N-1);
+ } while( !empty() && idx != last_used_lane );
+ last_used_lane = idx;
+ return result;
+ }
+
+ //! Checks existence of a task.
+ bool empty() {
+ return !population.load(std::memory_order_relaxed);
+ }
+
+private:
+ //! Returns true on successful push, otherwise - false.
+ bool try_push(d1::task* source, unsigned lane_idx ) {
+ spin_mutex::scoped_lock lock;
+ if( lock.try_acquire( lanes[lane_idx].my_mutex ) ) {
+ lanes[lane_idx].my_queue.push_back( source );
+ set_one_bit( population, lane_idx ); // TODO: avoid atomic op if the bit is already set
+ return true;
+ }
+ return false;
+ }
+
+ //! Returns pointer to task on successful pop, otherwise - NULL.
+ d1::task* try_pop( unsigned lane_idx ) {
+ if( !is_bit_set( population.load(std::memory_order_relaxed), lane_idx ) )
+ return NULL;
+ d1::task* result = NULL;
+ lane_t& lane = lanes[lane_idx];
+ spin_mutex::scoped_lock lock;
+ if( lock.try_acquire( lane.my_mutex ) && !lane.my_queue.empty() ) {
+ result = this->get_item( lane.my_queue );
+ if( lane.my_queue.empty() )
+ clear_one_bit( population, lane_idx );
+ }
+ return result;
+ }
+
+ // TODO: unify '*_specific' logic with 'pop' methods above
+ d1::task* look_specific( typename lane_t::queue_base_t& queue, isolation_type isolation ) {
+ __TBB_ASSERT( !queue.empty(), NULL );
+ // TODO: add a worst-case performance test and consider an alternative container with better
+ // performance for isolation search.
+ typename lane_t::queue_base_t::iterator curr = queue.end();
+ do {
+ // TODO: consider logic from get_task to simplify the code.
+ d1::task* result = *--curr;
+ if( result && task_accessor::isolation(*result) == isolation ) {
+ if( queue.end() - curr == 1 )
+ queue.pop_back(); // a little of housekeeping along the way
+ else
+ *curr = 0; // grabbing task with the same isolation
+ // TODO: move one of the container's ends instead if the task has been found there
+ return result;
+ }
+ } while( curr != queue.begin() );
+ return NULL;
+ }
+
+}; // task_stream
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
+
+#endif /* _TBB_task_stream_H */
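To show how a caller is expected to drive this container, here is a hedged usage sketch (not library code). It assumes that front_accessor is the other enumerator of task_stream_accessor_type declared alongside back_nonnull_accessor in scheduler_common.h, that FastRandom can be seeded from a pointer (as thread_data does with my_random{this}), and that some_task is an existing d1::task*.

    // Sketch only; 'some_task' is a placeholder for a real d1::task*.
    task_stream<front_accessor> stream;
    stream.initialize(6);                       // 6 lanes round up to the next power of two: N == 8

    FastRandom random{&stream};                 // pointer-seeded, mirroring thread_data's my_random{this}
    random_lane_selector push_selector{random};
    stream.push(some_task, push_selector);      // retries random lanes until try_push succeeds

    unsigned last_lane = 0;
    preceding_lane_selector pop_selector{last_lane};
    d1::task* t = stream.pop(pop_selector);     // walks lanes backwards until a task is found or the stream is empty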
diff --git a/contrib/libs/tbb/src/tbb/thread_data.h b/contrib/libs/tbb/src/tbb/thread_data.h
new file mode 100644
index 0000000000..41d4a0cf60
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/thread_data.h
@@ -0,0 +1,273 @@
+/*
+ Copyright (c) 2020-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef __TBB_thread_data_H
+#define __TBB_thread_data_H
+
+#include "oneapi/tbb/detail/_task.h"
+#include "oneapi/tbb/task.h"
+
+#include "rml_base.h" // rml::job
+
+#include "scheduler_common.h"
+#include "arena.h"
+#include "concurrent_monitor.h"
+#include "mailbox.h"
+#include "misc.h" // FastRandom
+#include "small_object_pool_impl.h"
+
+#include <atomic>
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+class task;
+class arena_slot;
+class task_group_context;
+class task_dispatcher;
+
+//------------------------------------------------------------------------
+// Thread Data
+//------------------------------------------------------------------------
+class thread_data : public ::rml::job
+ , public intrusive_list_node
+ , no_copy {
+public:
+ thread_data(unsigned short index, bool is_worker)
+ : my_arena_index{ index }
+ , my_is_worker{ is_worker }
+ , my_task_dispatcher{ nullptr }
+ , my_arena{}
+ , my_arena_slot{}
+ , my_inbox{}
+ , my_random{ this }
+ , my_last_observer{ nullptr }
+ , my_small_object_pool{new (cache_aligned_allocate(sizeof(small_object_pool_impl))) small_object_pool_impl{}}
+ , my_context_list_state{}
+#if __TBB_RESUMABLE_TASKS
+ , my_post_resume_action{ post_resume_action::none }
+ , my_post_resume_arg{nullptr}
+#endif /* __TBB_RESUMABLE_TASKS */
+ {
+ ITT_SYNC_CREATE(&my_context_list_state.mutex, SyncType_Scheduler, SyncObj_ContextsList);
+ my_context_list_state.head.next.store(&my_context_list_state.head, std::memory_order_relaxed);
+ my_context_list_state.head.prev.store(&my_context_list_state.head, std::memory_order_relaxed);
+ }
+
+ ~thread_data() {
+ context_list_cleanup();
+ my_small_object_pool->destroy();
+ poison_pointer(my_task_dispatcher);
+ poison_pointer(my_arena);
+ poison_pointer(my_arena_slot);
+ poison_pointer(my_last_observer);
+ poison_pointer(my_small_object_pool);
+#if __TBB_RESUMABLE_TASKS
+ poison_pointer(my_post_resume_arg);
+#endif /* __TBB_RESUMABLE_TASKS */
+ poison_value(my_context_list_state.epoch);
+ poison_value(my_context_list_state.local_update);
+ poison_value(my_context_list_state.nonlocal_update);
+ }
+
+ void attach_arena(arena& a, std::size_t index);
+ bool is_attached_to(arena*);
+ void attach_task_dispatcher(task_dispatcher&);
+ void detach_task_dispatcher();
+ void context_list_cleanup();
+ template <typename T>
+ void propagate_task_group_state(std::atomic<T> d1::task_group_context::* mptr_state, d1::task_group_context& src, T new_state);
+
+ //! Index of the arena slot the scheduler occupies now, or occupied last time
+ unsigned short my_arena_index;
+
+ //! Indicates if the thread is created by RML
+ const bool my_is_worker;
+
+ //! The current task dispatcher
+ task_dispatcher* my_task_dispatcher;
+
+ //! The arena that I own (if external thread) or am servicing at the moment (if worker)
+ arena* my_arena;
+
+ //! Pointer to the slot in the arena we own at the moment
+ arena_slot* my_arena_slot;
+
+ //! The mailbox (affinity mechanism) the current thread is attached to
+ mail_inbox my_inbox;
+
+ //! The random generator
+ FastRandom my_random;
+
+ //! Last observer in the observers list processed on this slot
+ observer_proxy* my_last_observer;
+
+ //! Pool of small objects for fast task allocation
+ small_object_pool_impl* my_small_object_pool;
+
+ struct context_list_state {
+ //! Head of the thread specific list of task group contexts.
+ d1::context_list_node head{};
+
+ //! Mutex protecting access to the list of task group contexts.
+ // TODO: check whether it can be deadly preempted and replace by spinning/sleeping mutex
+ spin_mutex mutex{};
+
+ //! Last state propagation epoch known to this thread
+ /** Together with the_context_state_propagation_epoch, constitutes the synchronization protocol
+ that keeps the hot path of task group context construction/destruction mostly
+ lock-free.
+ When the local epoch equals the global one, the state of the task group contexts
+ registered with this thread is consistent with that of the task group trees
+ they belong to. **/
+ std::atomic<std::uintptr_t> epoch{};
+
+ //! Flag indicating that a context is being destructed by its owner thread
+ /** Together with nonlocal_update, constitutes the synchronization protocol
+ that keeps the hot path of context destruction (by the owner thread) mostly
+ lock-free. **/
+ std::atomic<std::uintptr_t> local_update{};
+
+ //! Flag indicating that a context is being destructed by a non-owner thread.
+ /** See also local_update. **/
+ std::atomic<std::uintptr_t> nonlocal_update{};
+ } my_context_list_state;
+
+#if __TBB_RESUMABLE_TASKS
+ //! The list of possible post resume actions.
+ enum class post_resume_action {
+ invalid,
+ register_waiter,
+ resume,
+ callback,
+ cleanup,
+ notify,
+ none
+ };
+
+ //! A wrapper that invokes the user callback passed to tbb::suspend.
+ struct suspend_callback_wrapper {
+ suspend_callback_type suspend_callback;
+ void* user_callback;
+ suspend_point_type* tag;
+
+ void operator()() {
+ __TBB_ASSERT(suspend_callback && user_callback && tag, nullptr);
+ suspend_callback(user_callback, tag);
+ }
+ };
+
+ //! Suspends the current coroutine (task_dispatcher).
+ void suspend(void* suspend_callback, void* user_callback);
+
+ //! Resumes the target task_dispatcher.
+ void resume(task_dispatcher& target);
+
+ //! Set post resume action to perform after resume.
+ void set_post_resume_action(post_resume_action pra, void* arg) {
+ __TBB_ASSERT(my_post_resume_action == post_resume_action::none, "The Post resume action must not be set");
+ __TBB_ASSERT(!my_post_resume_arg, "The post resume action must not have an argument");
+ my_post_resume_action = pra;
+ my_post_resume_arg = arg;
+ }
+
+ void clear_post_resume_action() {
+ my_post_resume_action = thread_data::post_resume_action::none;
+ my_post_resume_arg = nullptr;
+ }
+
+ //! Performs post resume action.
+ void do_post_resume_action();
+
+ //! The post resume action requested after the swap contexts.
+ post_resume_action my_post_resume_action;
+
+ //! The post resume action argument.
+ void* my_post_resume_arg;
+#endif /* __TBB_RESUMABLE_TASKS */
+
+ //! The default context
+ // TODO: consider using a common default context because it is used only to simplify
+ // the cancellation check.
+ d1::task_group_context my_default_context;
+};
+
+inline void thread_data::attach_arena(arena& a, std::size_t index) {
+ my_arena = &a;
+ my_arena_index = static_cast<unsigned short>(index);
+ my_arena_slot = a.my_slots + index;
+ // Read the current slot mail_outbox and attach it to the mail_inbox (remove inbox later maybe)
+ my_inbox.attach(my_arena->mailbox(index));
+}
+
+inline bool thread_data::is_attached_to(arena* a) { return my_arena == a; }
+
+inline void thread_data::context_list_cleanup() {
+ // Detach contexts remaining in the local list.
+ {
+ spin_mutex::scoped_lock lock(my_context_list_state.mutex);
+ d1::context_list_node* node = my_context_list_state.head.next.load(std::memory_order_relaxed);
+ while (node != &my_context_list_state.head) {
+ using state_t = d1::task_group_context::lifetime_state;
+
+ d1::task_group_context& ctx = __TBB_get_object_ref(d1::task_group_context, my_node, node);
+ std::atomic<state_t>& state = ctx.my_lifetime_state;
+
+ node = node->next.load(std::memory_order_relaxed);
+
+ __TBB_ASSERT(ctx.my_owner == this, "The context should belong to the current thread.");
+ state_t expected = state_t::bound;
+ if (
+#if defined(__INTEL_COMPILER) && __INTEL_COMPILER <= 1910
+ !((std::atomic<typename std::underlying_type<state_t>::type>&)state).compare_exchange_strong(
+ (typename std::underlying_type<state_t>::type&)expected,
+ (typename std::underlying_type<state_t>::type)state_t::detached)
+#else
+ !state.compare_exchange_strong(expected, state_t::detached)
+#endif
+ ) {
+ __TBB_ASSERT(expected == state_t::locked || expected == state_t::dying, nullptr);
+ spin_wait_until_eq(state, state_t::dying);
+ } else {
+ __TBB_ASSERT(expected == state_t::bound, nullptr);
+ ctx.my_owner.store(nullptr, std::memory_order_release);
+ }
+ }
+ }
+ spin_wait_until_eq(my_context_list_state.nonlocal_update, 0u);
+}
+
+inline void thread_data::attach_task_dispatcher(task_dispatcher& task_disp) {
+ __TBB_ASSERT(my_task_dispatcher == nullptr, nullptr);
+ __TBB_ASSERT(task_disp.m_thread_data == nullptr, nullptr);
+ task_disp.m_thread_data = this;
+ my_task_dispatcher = &task_disp;
+}
+
+inline void thread_data::detach_task_dispatcher() {
+ __TBB_ASSERT(my_task_dispatcher != nullptr, nullptr);
+ __TBB_ASSERT(my_task_dispatcher->m_thread_data == this, nullptr);
+ my_task_dispatcher->m_thread_data = nullptr;
+ my_task_dispatcher = nullptr;
+}
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
+
+#endif // __TBB_thread_data_H
+
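The epoch fields above are what keep the hot path lock-free: a thread only needs to revisit its context list when its local epoch lags the global one. A hedged sketch of that check follows; the extern declaration and the helper name are assumptions for illustration, since the global counter is only referred to by name in the comments above.

    #include <atomic>
    #include <cstdint>

    // Assumed to exist elsewhere in the scheduler; declared here only for the sketch.
    extern std::atomic<std::uintptr_t> the_context_state_propagation_epoch;

    // Hot path: one relaxed load and a comparison, no locking.
    inline bool context_state_needs_refresh(const thread_data& td) {
        return td.my_context_list_state.epoch.load(std::memory_order_relaxed)
               != the_context_state_propagation_epoch.load(std::memory_order_relaxed);
    }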
diff --git a/contrib/libs/tbb/src/tbb/tls.h b/contrib/libs/tbb/src/tbb/tls.h
new file mode 100644
index 0000000000..5d28ca4dae
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/tls.h
@@ -0,0 +1,93 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef _TBB_tls_H
+#define _TBB_tls_H
+
+#include "oneapi/tbb/detail/_config.h"
+
+#if __TBB_USE_POSIX
+#include <pthread.h>
+#else /* assume __TBB_USE_WINAPI */
+#include <windows.h>
+#endif
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+typedef void (*tls_dtor_t)(void*);
+
+//! Basic cross-platform wrapper class for TLS operations.
+template <typename T>
+class basic_tls {
+#if __TBB_USE_POSIX
+ typedef pthread_key_t tls_key_t;
+public:
+ int create( tls_dtor_t dtor = NULL ) {
+ return pthread_key_create(&my_key, dtor);
+ }
+ int destroy() { return pthread_key_delete(my_key); }
+ void set( T value ) { pthread_setspecific(my_key, (void*)value); }
+ T get() { return (T)pthread_getspecific(my_key); }
+#else /* __TBB_USE_WINAPI */
+ typedef DWORD tls_key_t;
+public:
+#if !__TBB_WIN8UI_SUPPORT
+ int create() {
+ tls_key_t tmp = TlsAlloc();
+ if( tmp==TLS_OUT_OF_INDEXES )
+ return TLS_OUT_OF_INDEXES;
+ my_key = tmp;
+ return 0;
+ }
+ int destroy() { TlsFree(my_key); my_key=0; return 0; }
+ void set( T value ) { TlsSetValue(my_key, (LPVOID)value); }
+ T get() { return (T)TlsGetValue(my_key); }
+#else /*!__TBB_WIN8UI_SUPPORT*/
+ int create() {
+ tls_key_t tmp = FlsAlloc(NULL);
+ if( tmp== (DWORD)0xFFFFFFFF )
+ return (DWORD)0xFFFFFFFF;
+ my_key = tmp;
+ return 0;
+ }
+ int destroy() { FlsFree(my_key); my_key=0; return 0; }
+ void set( T value ) { FlsSetValue(my_key, (LPVOID)value); }
+ T get() { return (T)FlsGetValue(my_key); }
+#endif /* !__TBB_WIN8UI_SUPPORT */
+#endif /* __TBB_USE_WINAPI */
+private:
+ tls_key_t my_key;
+};
+
+//! More advanced TLS support template class.
+/** It supports RAII and to some extent mimics __declspec(thread) variables. */
+template <typename T>
+class tls : public basic_tls<T> {
+ typedef basic_tls<T> base;
+public:
+ tls() { base::create(); }
+ ~tls() { base::destroy(); }
+ T operator=(T value) { base::set(value); return value; }
+ operator T() { return base::get(); }
+};
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
+
+#endif /* _TBB_tls_H */
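A brief usage sketch for the wrappers above (not part of the library). The stored value must fit in a pointer, since set()/get() cast through void*.

    // RAII form: the key is created in the constructor and destroyed in the destructor.
    tls<int*> current_buffer;
    int scratch = 0;
    current_buffer = &scratch;        // operator= forwards to basic_tls::set
    int* p = current_buffer;          // operator T() forwards to basic_tls::get

    // Manual form.
    basic_tls<void*> raw;
    raw.create();
    raw.set(&scratch);
    void* v = raw.get();
    raw.destroy();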
diff --git a/contrib/libs/tbb/src/tbb/tools_api/disable_warnings.h b/contrib/libs/tbb/src/tbb/tools_api/disable_warnings.h
new file mode 100644
index 0000000000..e1ba837404
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/tools_api/disable_warnings.h
@@ -0,0 +1,35 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "ittnotify_config.h"
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+
+#pragma warning (disable: 593) /* parameter "XXXX" was set but never used */
+#pragma warning (disable: 344) /* typedef name has already been declared (with same type) */
+#pragma warning (disable: 174) /* expression has no effect */
+#pragma warning (disable: 4127) /* conditional expression is constant */
+#pragma warning (disable: 4306) /* conversion from '?' to '?' of greater size */
+
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+#if defined __INTEL_COMPILER
+
+#pragma warning (disable: 869) /* parameter "XXXXX" was never referenced */
+#pragma warning (disable: 1418) /* external function definition with no prior declaration */
+#pragma warning (disable: 1419) /* external declaration in primary source file */
+
+#endif /* __INTEL_COMPILER */
diff --git a/contrib/libs/tbb/src/tbb/tools_api/ittnotify.h b/contrib/libs/tbb/src/tbb/tools_api/ittnotify.h
new file mode 100644
index 0000000000..993b7b0bfd
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/tools_api/ittnotify.h
@@ -0,0 +1,4165 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef _ITTNOTIFY_H_
+#define _ITTNOTIFY_H_
+
+/**
+@file
+@brief Public User API functions and types
+@mainpage
+
+The Instrumentation and Tracing Technology API (ITT API) is used to
+annotate a user's program with additional information
+that can be used by correctness and performance tools. The user inserts
+calls in their program. Those calls generate information that is collected
+at runtime, and used by Intel(R) Threading Tools.
+
+@section API Concepts
+The following general concepts are used throughout the API.
+
+@subsection Unicode Support
+Many API functions take character string arguments. On Windows, there
+are two versions of each such function. The function name is suffixed
+by W if Unicode support is enabled, and by A otherwise. Any API function
+that takes a character string argument adheres to this convention.
+
+@subsection Conditional Compilation
+Many users prefer having an option to modify ITT API code when linking it
+inside their runtimes. The ITT API header file provides a mechanism to replace
+ITT API function names inside your code with empty strings. To do this,
+define the macro INTEL_NO_ITTNOTIFY_API during compilation and remove the
+static library from the linker script.
+
+@subsection Domains
+[see domains]
+Domains provide a way to separate notification for different modules or
+libraries in a program. Domains are specified by dotted character strings,
+e.g. TBB.Internal.Control.
+
+A mechanism (to be specified) is provided to enable and disable
+domains. By default, all domains are enabled.
+@subsection Named Entities and Instances
+Named entities (frames, regions, tasks, and markers) communicate
+information about the program to the analysis tools. A named entity often
+refers to a section of program code, or to some set of logical concepts
+that the programmer wants to group together.
+
+Named entities relate to the programmer's static view of the program. When
+the program actually executes, many instances of a given named entity
+may be created.
+
+The API annotations denote instances of named entities. The actual
+named entities are displayed using the analysis tools. In other words,
+the named entities come into existence when instances are created.
+
+Instances of named entities may have instance identifiers (IDs). Some
+API calls use instance identifiers to create relationships between
+different instances of named entities. Other API calls associate data
+with instances of named entities.
+
+Some named entities must always have instance IDs. In particular, regions
+and frames always have IDs. Task and markers need IDs only if the ID is
+needed in another API call (such as adding a relation or metadata).
+
+The lifetime of instance IDs is distinct from the lifetime of
+instances. This allows various relationships to be specified separate
+from the actual execution of instances. This flexibility comes at the
+expense of extra API calls.
+
+The same ID may not be reused for different instances, unless a previous
+[ref] __itt_id_destroy call for that ID has been issued.
+*/
+
+/** @cond exclude_from_documentation */
+#ifndef ITT_OS_WIN
+# define ITT_OS_WIN 1
+#endif /* ITT_OS_WIN */
+
+#ifndef ITT_OS_LINUX
+# define ITT_OS_LINUX 2
+#endif /* ITT_OS_LINUX */
+
+#ifndef ITT_OS_MAC
+# define ITT_OS_MAC 3
+#endif /* ITT_OS_MAC */
+
+#ifndef ITT_OS_FREEBSD
+# define ITT_OS_FREEBSD 4
+#endif /* ITT_OS_FREEBSD */
+
+#ifndef ITT_OS
+# if defined WIN32 || defined _WIN32
+# define ITT_OS ITT_OS_WIN
+# elif defined( __APPLE__ ) && defined( __MACH__ )
+# define ITT_OS ITT_OS_MAC
+# elif defined( __FreeBSD__ )
+# define ITT_OS ITT_OS_FREEBSD
+# else
+# define ITT_OS ITT_OS_LINUX
+# endif
+#endif /* ITT_OS */
+
+#ifndef ITT_PLATFORM_WIN
+# define ITT_PLATFORM_WIN 1
+#endif /* ITT_PLATFORM_WIN */
+
+#ifndef ITT_PLATFORM_POSIX
+# define ITT_PLATFORM_POSIX 2
+#endif /* ITT_PLATFORM_POSIX */
+
+#ifndef ITT_PLATFORM_MAC
+# define ITT_PLATFORM_MAC 3
+#endif /* ITT_PLATFORM_MAC */
+
+#ifndef ITT_PLATFORM_FREEBSD
+# define ITT_PLATFORM_FREEBSD 4
+#endif /* ITT_PLATFORM_FREEBSD */
+
+#ifndef ITT_PLATFORM
+# if ITT_OS==ITT_OS_WIN
+# define ITT_PLATFORM ITT_PLATFORM_WIN
+# elif ITT_OS==ITT_OS_MAC
+# define ITT_PLATFORM ITT_PLATFORM_MAC
+# elif ITT_OS==ITT_OS_FREEBSD
+# define ITT_PLATFORM ITT_PLATFORM_FREEBSD
+# else
+# define ITT_PLATFORM ITT_PLATFORM_POSIX
+# endif
+#endif /* ITT_PLATFORM */
+
+#if defined(_UNICODE) && !defined(UNICODE)
+#define UNICODE
+#endif
+
+#include <stddef.h>
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#include <tchar.h>
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#include <stdint.h>
+#if defined(UNICODE) || defined(_UNICODE)
+#include <wchar.h>
+#endif /* UNICODE || _UNICODE */
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+#ifndef ITTAPI_CDECL
+# if ITT_PLATFORM==ITT_PLATFORM_WIN
+# define ITTAPI_CDECL __cdecl
+# else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+# if defined _M_IX86 || defined __i386__
+# define ITTAPI_CDECL __attribute__ ((cdecl))
+# else /* _M_IX86 || __i386__ */
+# define ITTAPI_CDECL /* supported only on x86 platform */
+# endif /* _M_IX86 || __i386__ */
+# endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* ITTAPI_CDECL */
+
+#ifndef STDCALL
+# if ITT_PLATFORM==ITT_PLATFORM_WIN
+# define STDCALL __stdcall
+# else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+# if defined _M_IX86 || defined __i386__
+# define STDCALL __attribute__ ((stdcall))
+# else /* _M_IX86 || __i386__ */
+# define STDCALL /* supported only on x86 platform */
+# endif /* _M_IX86 || __i386__ */
+# endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* STDCALL */
+
+#define ITTAPI ITTAPI_CDECL
+#define LIBITTAPI ITTAPI_CDECL
+
+/* TODO: Temporary for compatibility! */
+#define ITTAPI_CALL ITTAPI_CDECL
+#define LIBITTAPI_CALL ITTAPI_CDECL
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+/* use __forceinline (VC++ specific) */
+#define ITT_INLINE __forceinline
+#define ITT_INLINE_ATTRIBUTE /* nothing */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+/*
+ * Generally, functions are not inlined unless optimization is specified.
+ * For functions declared inline, this attribute inlines the function even
+ * if no optimization level was specified.
+ */
+#ifdef __STRICT_ANSI__
+#define ITT_INLINE static
+#define ITT_INLINE_ATTRIBUTE __attribute__((unused))
+#else /* __STRICT_ANSI__ */
+#define ITT_INLINE static inline
+#define ITT_INLINE_ATTRIBUTE __attribute__((always_inline, unused))
+#endif /* __STRICT_ANSI__ */
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+/** @endcond */
+
+#ifdef INTEL_ITTNOTIFY_ENABLE_LEGACY
+# if ITT_PLATFORM==ITT_PLATFORM_WIN
+# pragma message("WARNING!!! Deprecated API is used. Please undefine INTEL_ITTNOTIFY_ENABLE_LEGACY macro")
+# else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+// #warning usage leads to ICC's compilation error
+// # warning "Deprecated API is used. Please undefine INTEL_ITTNOTIFY_ENABLE_LEGACY macro"
+# endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+# include "legacy/ittnotify.h"
+#endif /* INTEL_ITTNOTIFY_ENABLE_LEGACY */
+
+/** @cond exclude_from_documentation */
+/* Helper macro for joining tokens */
+#define ITT_JOIN_AUX(p,n) p##n
+#define ITT_JOIN(p,n) ITT_JOIN_AUX(p,n)
+
+#ifdef ITT_MAJOR
+#undef ITT_MAJOR
+#endif
+#ifdef ITT_MINOR
+#undef ITT_MINOR
+#endif
+#define ITT_MAJOR 3
+#define ITT_MINOR 0
+
+/* Standard versioning of a token with major and minor version numbers */
+#define ITT_VERSIONIZE(x) \
+ ITT_JOIN(x, \
+ ITT_JOIN(_, \
+ ITT_JOIN(ITT_MAJOR, \
+ ITT_JOIN(_, ITT_MINOR))))
+
+#ifndef INTEL_ITTNOTIFY_PREFIX
+# define INTEL_ITTNOTIFY_PREFIX __itt_
+#endif /* INTEL_ITTNOTIFY_PREFIX */
+#ifndef INTEL_ITTNOTIFY_POSTFIX
+# define INTEL_ITTNOTIFY_POSTFIX _ptr_
+#endif /* INTEL_ITTNOTIFY_POSTFIX */
+
+#define ITTNOTIFY_NAME_AUX(n) ITT_JOIN(INTEL_ITTNOTIFY_PREFIX,n)
+#define ITTNOTIFY_NAME(n) ITT_VERSIONIZE(ITTNOTIFY_NAME_AUX(ITT_JOIN(n,INTEL_ITTNOTIFY_POSTFIX)))
+
+#define ITTNOTIFY_VOID(n) (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)
+#define ITTNOTIFY_DATA(n) (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)
+
+#define ITTNOTIFY_VOID_D0(n,d) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d)
+#define ITTNOTIFY_VOID_D1(n,d,x) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x)
+#define ITTNOTIFY_VOID_D2(n,d,x,y) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y)
+#define ITTNOTIFY_VOID_D3(n,d,x,y,z) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z)
+#define ITTNOTIFY_VOID_D4(n,d,x,y,z,a) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a)
+#define ITTNOTIFY_VOID_D5(n,d,x,y,z,a,b) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b)
+#define ITTNOTIFY_VOID_D6(n,d,x,y,z,a,b,c) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b,c)
+#define ITTNOTIFY_DATA_D0(n,d) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d)
+#define ITTNOTIFY_DATA_D1(n,d,x) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x)
+#define ITTNOTIFY_DATA_D2(n,d,x,y) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y)
+#define ITTNOTIFY_DATA_D3(n,d,x,y,z) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y,z)
+#define ITTNOTIFY_DATA_D4(n,d,x,y,z,a) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y,z,a)
+#define ITTNOTIFY_DATA_D5(n,d,x,y,z,a,b) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b)
+#define ITTNOTIFY_DATA_D6(n,d,x,y,z,a,b,c) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b,c)
+
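To make the token pasting above concrete, here is the approximate expansion for the pause entry point, assuming the default prefix/postfix and the 3.0 version macros defined above (shown as a comment, not as library code):

    // ITTNOTIFY_NAME(pause)
    //   -> ITT_VERSIONIZE(ITTNOTIFY_NAME_AUX(ITT_JOIN(pause, _ptr_)))
    //   -> ITT_VERSIONIZE(__itt_pause_ptr_)
    //   -> __itt_pause_ptr__3_0                 // the versioned function pointer
    //
    // ITTNOTIFY_VOID(pause), used below as the body of __itt_pause, therefore becomes roughly:
    //   (!__itt_pause_ptr__3_0) ? (void)0 : __itt_pause_ptr__3_0
    // so a call site __itt_pause() collapses to a no-op when the pointer is unset.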
+#ifdef ITT_STUB
+#undef ITT_STUB
+#endif
+#ifdef ITT_STUBV
+#undef ITT_STUBV
+#endif
+#define ITT_STUBV(api,type,name,args) \
+ typedef type (api* ITT_JOIN(ITTNOTIFY_NAME(name),_t)) args; \
+ extern ITT_JOIN(ITTNOTIFY_NAME(name),_t) ITTNOTIFY_NAME(name);
+#define ITT_STUB ITT_STUBV
+/** @endcond */
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/** @cond exclude_from_gpa_documentation */
+/**
+ * @defgroup public Public API
+ * @{
+ * @}
+ */
+
+/**
+ * @defgroup control Collection Control
+ * @ingroup public
+ * General behavior: application continues to run, but no profiling information is being collected
+ *
+ * Pausing occurs not only for the current thread but for the whole process as well as spawned processes
+ * - Intel(R) Parallel Inspector and Intel(R) Inspector XE:
+ * - Does not analyze or report errors that involve memory access.
+ * - Other errors are reported as usual. Pausing data collection in
+ * Intel(R) Parallel Inspector and Intel(R) Inspector XE
+ * only pauses tracing and analyzing memory access.
+ * It does not pause tracing or analyzing threading APIs.
+ * .
+ * - Intel(R) Parallel Amplifier and Intel(R) VTune(TM) Amplifier XE:
+ * - Does continue to record when new threads are started.
+ * .
+ * - Other effects:
+ * - Possible reduction of runtime overhead.
+ * .
+ * @{
+ */
+/** @brief Pause collection */
+void ITTAPI __itt_pause(void);
+/** @brief Resume collection */
+void ITTAPI __itt_resume(void);
+/** @brief Detach collection */
+void ITTAPI __itt_detach(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, pause, (void))
+ITT_STUBV(ITTAPI, void, resume, (void))
+ITT_STUBV(ITTAPI, void, detach, (void))
+#define __itt_pause ITTNOTIFY_VOID(pause)
+#define __itt_pause_ptr ITTNOTIFY_NAME(pause)
+#define __itt_resume ITTNOTIFY_VOID(resume)
+#define __itt_resume_ptr ITTNOTIFY_NAME(resume)
+#define __itt_detach ITTNOTIFY_VOID(detach)
+#define __itt_detach_ptr ITTNOTIFY_NAME(detach)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_pause()
+#define __itt_pause_ptr 0
+#define __itt_resume()
+#define __itt_resume_ptr 0
+#define __itt_detach()
+#define __itt_detach_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_pause_ptr 0
+#define __itt_resume_ptr 0
+#define __itt_detach_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} control group */
+/** @endcond */
+
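A minimal usage sketch of the collection-control calls declared above; uninteresting_setup() and analyzed_work() are placeholders, not part of the ITT API.

    void run_with_focused_collection() {
        __itt_pause();           // stop collecting around setup we do not want profiled
        uninteresting_setup();
        __itt_resume();          // collect data only for the region of interest
        analyzed_work();
    }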
+/**
+ * @defgroup Intel Processor Trace control
+ * The API in this group provides control over the collection and analysis of Intel Processor Trace (Intel PT) data.
+ * Information about Intel Processor Trace technology can be found here (Volume 3 chapter 35):
+ * https://software.intel.com/sites/default/files/managed/39/c5/325462-sdm-vol-1-2abcd-3abcd.pdf
+ * Use this API to mark particular code regions for loading detailed performance statistics.
+ * This mode makes your analysis faster and more accurate.
+ * @{
+*/
+typedef unsigned char __itt_pt_region;
+
+/**
+ * @brief Saves a region name marked with the Intel PT API and returns a region id.
+ * Only 7 names can be registered; attempts to register more names are ignored, and a region id with an automatic name is returned.
+ * For automatic naming of regions, pass NULL as the function parameter.
+*/
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+__itt_pt_region ITTAPI __itt_pt_region_createA(const char *name);
+__itt_pt_region ITTAPI __itt_pt_region_createW(const wchar_t *name);
+#if defined(UNICODE) || defined(_UNICODE)
+# define __itt_pt_region_create __itt_pt_region_createW
+#else /* UNICODE */
+# define __itt_pt_region_create __itt_pt_region_createA
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+__itt_pt_region ITTAPI __itt_pt_region_create(const char *name);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_pt_region, pt_region_createA, (const char *name))
+ITT_STUB(ITTAPI, __itt_pt_region, pt_region_createW, (const wchar_t *name))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_pt_region, pt_region_create, (const char *name))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_pt_region_createA ITTNOTIFY_DATA(pt_region_createA)
+#define __itt_pt_region_createA_ptr ITTNOTIFY_NAME(pt_region_createA)
+#define __itt_pt_region_createW ITTNOTIFY_DATA(pt_region_createW)
+#define __itt_pt_region_createW_ptr ITTNOTIFY_NAME(pt_region_createW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_pt_region_create ITTNOTIFY_DATA(pt_region_create)
+#define __itt_pt_region_create_ptr ITTNOTIFY_NAME(pt_region_create)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_pt_region_createA(name) (__itt_pt_region)0
+#define __itt_pt_region_createA_ptr 0
+#define __itt_pt_region_createW(name) (__itt_pt_region)0
+#define __itt_pt_region_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_pt_region_create(name) (__itt_pt_region)0
+#define __itt_pt_region_create_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_pt_region_createA_ptr 0
+#define __itt_pt_region_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_pt_region_create_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief The function contains a special code pattern identified at the post-processing stage and
+ * marks the beginning of a code region targeted for Intel PT analysis.
+ * @param[in] region - region id, 0 <= region < 8
+*/
+void __itt_mark_pt_region_begin(__itt_pt_region region);
+/**
+ * @brief The function contains a special code pattern identified at the post-processing stage and
+ * marks the end of a code region targeted for Intel PT analysis.
+ * @param[in] region - region id, 0 <= region < 8
+*/
+void __itt_mark_pt_region_end(__itt_pt_region region);
+/** @} Intel PT control group*/
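A short usage sketch of the Intel PT region API declared above (the region name is arbitrary and error handling is omitted):

    __itt_pt_region hot_loop = __itt_pt_region_create("hot_loop");
    __itt_mark_pt_region_begin(hot_loop);
    // ... code whose execution should be captured for Intel PT analysis ...
    __itt_mark_pt_region_end(hot_loop);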
+
+/**
+ * @defgroup threads Threads
+ * @ingroup public
+ * Give names to threads
+ * @{
+ */
+/**
+ * @brief Sets thread name of calling thread
+ * @param[in] name - name of thread
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+void ITTAPI __itt_thread_set_nameA(const char *name);
+void ITTAPI __itt_thread_set_nameW(const wchar_t *name);
+#if defined(UNICODE) || defined(_UNICODE)
+# define __itt_thread_set_name __itt_thread_set_nameW
+# define __itt_thread_set_name_ptr __itt_thread_set_nameW_ptr
+#else /* UNICODE */
+# define __itt_thread_set_name __itt_thread_set_nameA
+# define __itt_thread_set_name_ptr __itt_thread_set_nameA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+void ITTAPI __itt_thread_set_name(const char *name);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, thread_set_nameA, (const char *name))
+ITT_STUBV(ITTAPI, void, thread_set_nameW, (const wchar_t *name))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, thread_set_name, (const char *name))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_thread_set_nameA ITTNOTIFY_VOID(thread_set_nameA)
+#define __itt_thread_set_nameA_ptr ITTNOTIFY_NAME(thread_set_nameA)
+#define __itt_thread_set_nameW ITTNOTIFY_VOID(thread_set_nameW)
+#define __itt_thread_set_nameW_ptr ITTNOTIFY_NAME(thread_set_nameW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_thread_set_name ITTNOTIFY_VOID(thread_set_name)
+#define __itt_thread_set_name_ptr ITTNOTIFY_NAME(thread_set_name)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_thread_set_nameA(name)
+#define __itt_thread_set_nameA_ptr 0
+#define __itt_thread_set_nameW(name)
+#define __itt_thread_set_nameW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_thread_set_name(name)
+#define __itt_thread_set_name_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_thread_set_nameA_ptr 0
+#define __itt_thread_set_nameW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_thread_set_name_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/** @cond exclude_from_gpa_documentation */
+
+/**
+ * @brief Mark current thread as ignored from this point on, for the duration of its existence.
+ */
+void ITTAPI __itt_thread_ignore(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, thread_ignore, (void))
+#define __itt_thread_ignore ITTNOTIFY_VOID(thread_ignore)
+#define __itt_thread_ignore_ptr ITTNOTIFY_NAME(thread_ignore)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_thread_ignore()
+#define __itt_thread_ignore_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_thread_ignore_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} threads group */
+
+/**
+ * @defgroup suppress Error suppression
+ * @ingroup public
+ * General behavior: application continues to run, but errors are suppressed
+ *
+ * @{
+ */
+
+/*****************************************************************//**
+ * @name group of functions used for error suppression in correctness tools
+ *********************************************************************/
+/** @{ */
+/**
+ * @hideinitializer
+ * @brief possible value for suppression mask
+ */
+#define __itt_suppress_all_errors 0x7fffffff
+
+/**
+ * @hideinitializer
+ * @brief possible value for suppression mask (suppresses errors from threading analysis)
+ */
+#define __itt_suppress_threading_errors 0x000000ff
+
+/**
+ * @hideinitializer
+ * @brief possible value for suppression mask (suppresses errors from memory analysis)
+ */
+#define __itt_suppress_memory_errors 0x0000ff00
+
+/**
+ * @brief Start suppressing errors identified in mask on this thread
+ */
+void ITTAPI __itt_suppress_push(unsigned int mask);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, suppress_push, (unsigned int mask))
+#define __itt_suppress_push ITTNOTIFY_VOID(suppress_push)
+#define __itt_suppress_push_ptr ITTNOTIFY_NAME(suppress_push)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_suppress_push(mask)
+#define __itt_suppress_push_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_suppress_push_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Undo the effects of the matching call to __itt_suppress_push
+ */
+void ITTAPI __itt_suppress_pop(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, suppress_pop, (void))
+#define __itt_suppress_pop ITTNOTIFY_VOID(suppress_pop)
+#define __itt_suppress_pop_ptr ITTNOTIFY_NAME(suppress_pop)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_suppress_pop()
+#define __itt_suppress_pop_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_suppress_pop_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @enum __itt_model_disable
+ * @brief Enumerator for the disable methods
+ */
+typedef enum __itt_suppress_mode {
+ __itt_unsuppress_range,
+ __itt_suppress_range
+} __itt_suppress_mode_t;
+
+/**
+ * @brief Mark a range of memory for error suppression or unsuppression for error types included in mask
+ */
+void ITTAPI __itt_suppress_mark_range(__itt_suppress_mode_t mode, unsigned int mask, void * address, size_t size);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, suppress_mark_range, (__itt_suppress_mode_t mode, unsigned int mask, void * address, size_t size))
+#define __itt_suppress_mark_range ITTNOTIFY_VOID(suppress_mark_range)
+#define __itt_suppress_mark_range_ptr ITTNOTIFY_NAME(suppress_mark_range)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_suppress_mark_range(mask)
+#define __itt_suppress_mark_range_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_suppress_mark_range_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Undo the effect of a matching call to __itt_suppress_mark_range. If no matching
+ * call is found, nothing is changed.
+ */
+void ITTAPI __itt_suppress_clear_range(__itt_suppress_mode_t mode, unsigned int mask, void * address, size_t size);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, suppress_clear_range, (__itt_suppress_mode_t mode, unsigned int mask, void * address, size_t size))
+#define __itt_suppress_clear_range ITTNOTIFY_VOID(suppress_clear_range)
+#define __itt_suppress_clear_range_ptr ITTNOTIFY_NAME(suppress_clear_range)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_suppress_clear_range(mask)
+#define __itt_suppress_clear_range_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_suppress_clear_range_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} */
+/** @} suppress group */
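A hedged usage sketch of the suppression API above; tolerated_racy_read() is a placeholder for code whose memory-analysis reports should be silenced.

    __itt_suppress_push(__itt_suppress_memory_errors);   // suppress memory-analysis errors on this thread
    tolerated_racy_read();
    __itt_suppress_pop();                                 // restore normal reporting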
+
+/**
+ * @defgroup sync Synchronization
+ * @ingroup public
+ * Indicate user-written synchronization code
+ * @{
+ */
+/**
+ * @hideinitializer
+ * @brief possible value of attribute argument for sync object type
+ */
+#define __itt_attr_barrier 1
+
+/**
+ * @hideinitializer
+ * @brief possible value of attribute argument for sync object type
+ */
+#define __itt_attr_mutex 2
+
+/**
+@brief Name a synchronization object
+@param[in] addr Handle for the synchronization object. You should
+use a real address to uniquely identify the synchronization object.
+@param[in] objtype null-terminated object type string. If NULL is
+passed, the name will be "User Synchronization".
+@param[in] objname null-terminated object name string. If NULL,
+no name will be assigned to the object.
+@param[in] attribute one of [#__itt_attr_barrier, #__itt_attr_mutex]
+ */
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+void ITTAPI __itt_sync_createA(void *addr, const char *objtype, const char *objname, int attribute);
+void ITTAPI __itt_sync_createW(void *addr, const wchar_t *objtype, const wchar_t *objname, int attribute);
+#if defined(UNICODE) || defined(_UNICODE)
+# define __itt_sync_create __itt_sync_createW
+# define __itt_sync_create_ptr __itt_sync_createW_ptr
+#else /* UNICODE */
+# define __itt_sync_create __itt_sync_createA
+# define __itt_sync_create_ptr __itt_sync_createA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+void ITTAPI __itt_sync_create (void *addr, const char *objtype, const char *objname, int attribute);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, sync_createA, (void *addr, const char *objtype, const char *objname, int attribute))
+ITT_STUBV(ITTAPI, void, sync_createW, (void *addr, const wchar_t *objtype, const wchar_t *objname, int attribute))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, sync_create, (void *addr, const char* objtype, const char* objname, int attribute))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_sync_createA ITTNOTIFY_VOID(sync_createA)
+#define __itt_sync_createA_ptr ITTNOTIFY_NAME(sync_createA)
+#define __itt_sync_createW ITTNOTIFY_VOID(sync_createW)
+#define __itt_sync_createW_ptr ITTNOTIFY_NAME(sync_createW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_sync_create ITTNOTIFY_VOID(sync_create)
+#define __itt_sync_create_ptr ITTNOTIFY_NAME(sync_create)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_sync_createA(addr, objtype, objname, attribute)
+#define __itt_sync_createA_ptr 0
+#define __itt_sync_createW(addr, objtype, objname, attribute)
+#define __itt_sync_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_sync_create(addr, objtype, objname, attribute)
+#define __itt_sync_create_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_sync_createA_ptr 0
+#define __itt_sync_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_sync_create_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+@brief Rename a synchronization object
+
+You can use the rename call to assign or reassign a name to a given
+synchronization object.
+@param[in] addr handle for the synchronization object.
+@param[in] name null-terminated object name string.
+*/
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+void ITTAPI __itt_sync_renameA(void *addr, const char *name);
+void ITTAPI __itt_sync_renameW(void *addr, const wchar_t *name);
+#if defined(UNICODE) || defined(_UNICODE)
+# define __itt_sync_rename __itt_sync_renameW
+# define __itt_sync_rename_ptr __itt_sync_renameW_ptr
+#else /* UNICODE */
+# define __itt_sync_rename __itt_sync_renameA
+# define __itt_sync_rename_ptr __itt_sync_renameA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+void ITTAPI __itt_sync_rename(void *addr, const char *name);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, sync_renameA, (void *addr, const char *name))
+ITT_STUBV(ITTAPI, void, sync_renameW, (void *addr, const wchar_t *name))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, sync_rename, (void *addr, const char *name))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_sync_renameA ITTNOTIFY_VOID(sync_renameA)
+#define __itt_sync_renameA_ptr ITTNOTIFY_NAME(sync_renameA)
+#define __itt_sync_renameW ITTNOTIFY_VOID(sync_renameW)
+#define __itt_sync_renameW_ptr ITTNOTIFY_NAME(sync_renameW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_sync_rename ITTNOTIFY_VOID(sync_rename)
+#define __itt_sync_rename_ptr ITTNOTIFY_NAME(sync_rename)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_sync_renameA(addr, name)
+#define __itt_sync_renameA_ptr 0
+#define __itt_sync_renameW(addr, name)
+#define __itt_sync_renameW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_sync_rename(addr, name)
+#define __itt_sync_rename_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_sync_renameA_ptr 0
+#define __itt_sync_renameW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_sync_rename_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ @brief Destroy a synchronization object.
+ @param addr Handle for the synchronization object.
+ */
+void ITTAPI __itt_sync_destroy(void *addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, sync_destroy, (void *addr))
+#define __itt_sync_destroy ITTNOTIFY_VOID(sync_destroy)
+#define __itt_sync_destroy_ptr ITTNOTIFY_NAME(sync_destroy)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_sync_destroy(addr)
+#define __itt_sync_destroy_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_sync_destroy_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/*****************************************************************//**
+ * @name group of functions used for performance measurement tools
+ *********************************************************************/
+/** @{ */
+/**
+ * @brief Enter spin loop on user-defined sync object
+ */
+void ITTAPI __itt_sync_prepare(void* addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, sync_prepare, (void *addr))
+#define __itt_sync_prepare ITTNOTIFY_VOID(sync_prepare)
+#define __itt_sync_prepare_ptr ITTNOTIFY_NAME(sync_prepare)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_sync_prepare(addr)
+#define __itt_sync_prepare_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_sync_prepare_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Quit spin loop without acquiring spin object
+ */
+void ITTAPI __itt_sync_cancel(void *addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, sync_cancel, (void *addr))
+#define __itt_sync_cancel ITTNOTIFY_VOID(sync_cancel)
+#define __itt_sync_cancel_ptr ITTNOTIFY_NAME(sync_cancel)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_sync_cancel(addr)
+#define __itt_sync_cancel_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_sync_cancel_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Successful spin loop completion (sync object acquired)
+ */
+void ITTAPI __itt_sync_acquired(void *addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, sync_acquired, (void *addr))
+#define __itt_sync_acquired ITTNOTIFY_VOID(sync_acquired)
+#define __itt_sync_acquired_ptr ITTNOTIFY_NAME(sync_acquired)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_sync_acquired(addr)
+#define __itt_sync_acquired_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_sync_acquired_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Marks the start of the sync object releasing code. Called before the lock release call.
+ */
+void ITTAPI __itt_sync_releasing(void* addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, sync_releasing, (void *addr))
+#define __itt_sync_releasing ITTNOTIFY_VOID(sync_releasing)
+#define __itt_sync_releasing_ptr ITTNOTIFY_NAME(sync_releasing)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_sync_releasing(addr)
+#define __itt_sync_releasing_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_sync_releasing_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
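+
+/*
+ * Illustrative usage sketch (not part of the original header): annotating a
+ * hand-rolled test-and-set spin lock so that profiling tools can attribute
+ * the time spent spinning. `my_lock_t`, `my_try_lock` and `my_unlock` are
+ * hypothetical user code; only the __itt_sync_* calls come from this API.
+ *
+ *     void my_spin_lock(my_lock_t *lock)
+ *     {
+ *         __itt_sync_prepare(lock);           // entering the spin loop
+ *         while (!my_try_lock(lock))
+ *             ;                               // spin
+ *         __itt_sync_acquired(lock);          // spin finished, lock held
+ *     }
+ *
+ *     int my_spin_trylock(my_lock_t *lock)
+ *     {
+ *         __itt_sync_prepare(lock);
+ *         if (my_try_lock(lock)) { __itt_sync_acquired(lock); return 1; }
+ *         __itt_sync_cancel(lock);            // gave up without acquiring
+ *         return 0;
+ *     }
+ *
+ *     void my_spin_unlock(my_lock_t *lock)
+ *     {
+ *         __itt_sync_releasing(lock);         // about to release the lock
+ *         my_unlock(lock);
+ *     }
+ */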
+/** @} */
+
+/** @} sync group */
+
+/**************************************************************//**
+ * @name Group of functions used by correctness checking tools
+ ******************************************************************/
+/** @{ */
+/**
+ * @ingroup legacy
+ * @deprecated Legacy API
+ * @brief Fast synchronization which does not require spinning.
+ * - This special function is to be used by TBB and OpenMP libraries only when they know
+ * there is no spinning but they need to suppress TC warnings about shared variable modifications.
+ * - It only has corresponding pointers in the static library and has no corresponding function
+ * in the dynamic library.
+ * @see void __itt_sync_prepare(void* addr);
+ */
+void ITTAPI __itt_fsync_prepare(void* addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, fsync_prepare, (void *addr))
+#define __itt_fsync_prepare ITTNOTIFY_VOID(fsync_prepare)
+#define __itt_fsync_prepare_ptr ITTNOTIFY_NAME(fsync_prepare)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_fsync_prepare(addr)
+#define __itt_fsync_prepare_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_fsync_prepare_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @ingroup legacy
+ * @deprecated Legacy API
+ * @brief Fast synchronization which does not require spinning.
+ * - This special function is to be used by TBB and OpenMP libraries only when they know
+ * there is no spinning but they need to suppress TC warnings about shared variable modifications.
+ * - It only has corresponding pointers in the static library and has no corresponding function
+ * in the dynamic library.
+ * @see void __itt_sync_cancel(void *addr);
+ */
+void ITTAPI __itt_fsync_cancel(void *addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, fsync_cancel, (void *addr))
+#define __itt_fsync_cancel ITTNOTIFY_VOID(fsync_cancel)
+#define __itt_fsync_cancel_ptr ITTNOTIFY_NAME(fsync_cancel)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_fsync_cancel(addr)
+#define __itt_fsync_cancel_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_fsync_cancel_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @ingroup legacy
+ * @deprecated Legacy API
+ * @brief Fast synchronization which does not require spinning.
+ * - This special function is to be used by TBB and OpenMP libraries only when they know
+ * there is no spinning but they need to suppress TC warnings about shared variable modifications.
+ * - It only has corresponding pointers in the static library and has no corresponding function
+ * in the dynamic library.
+ * @see void __itt_sync_acquired(void *addr);
+ */
+void ITTAPI __itt_fsync_acquired(void *addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, fsync_acquired, (void *addr))
+#define __itt_fsync_acquired ITTNOTIFY_VOID(fsync_acquired)
+#define __itt_fsync_acquired_ptr ITTNOTIFY_NAME(fsync_acquired)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_fsync_acquired(addr)
+#define __itt_fsync_acquired_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_fsync_acquired_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @ingroup legacy
+ * @deprecated Legacy API
+ * @brief Fast synchronization which does not require spinning.
+ * - This special function is to be used by TBB and OpenMP libraries only when they know
+ * there is no spinning but they need to suppress TC warnings about shared variable modifications.
+ * - It only has corresponding pointers in the static library and has no corresponding function
+ * in the dynamic library.
+ * @see void __itt_sync_releasing(void* addr);
+ */
+void ITTAPI __itt_fsync_releasing(void* addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, fsync_releasing, (void *addr))
+#define __itt_fsync_releasing ITTNOTIFY_VOID(fsync_releasing)
+#define __itt_fsync_releasing_ptr ITTNOTIFY_NAME(fsync_releasing)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_fsync_releasing(addr)
+#define __itt_fsync_releasing_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_fsync_releasing_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} */
+
+/**
+ * @defgroup model Modeling by Intel(R) Parallel Advisor
+ * @ingroup public
+ * This is the subset of itt used for modeling by Intel(R) Parallel Advisor.
+ * This API is called ONLY using annotate.h, by "Annotation" macros
+ * the user places in their sources during the parallelism modeling steps.
+ *
+ * site_begin/end and task_begin/end take the address of handle variables,
+ * which are writeable by the API. Handles must be 0 initialized prior
+ * to the first call to begin; otherwise a run-time failure may occur.
+ * The handles are initialized in a multi-thread safe way by the API if
+ * the handle is 0. The commonly expected idiom is one static handle to
+ * identify a site or task. If a site or task of the same name has already
+ * been started during this collection, the same handle MAY be returned,
+ * but is not required to be - it is unspecified if data merging is done
+ * based on name. These routines also take an instance variable. Like
+ * the lexical instance, these must be 0 initialized. Unlike the lexical
+ * instance, this is used to track a single dynamic instance.
+ *
+ * API used by the Intel(R) Parallel Advisor to describe potential concurrency
+ * and related activities. User-added source annotations expand to calls
+ * to these procedures to enable modeling of a hypothetical concurrent
+ * execution serially.
+ * @{
+ */
+#if !defined(_ADVISOR_ANNOTATE_H_) || defined(ANNOTATE_EXPAND_NULL)
+
+typedef void* __itt_model_site; /*!< @brief handle for lexical site */
+typedef void* __itt_model_site_instance; /*!< @brief handle for dynamic instance */
+typedef void* __itt_model_task; /*!< @brief handle for lexical task */
+typedef void* __itt_model_task_instance; /*!< @brief handle for dynamic instance */
+
+/**
+ * @enum __itt_model_disable
+ * @brief Enumerator for the disable methods
+ */
+typedef enum {
+ __itt_model_disable_observation,
+ __itt_model_disable_collection
+} __itt_model_disable;
+
+#endif /* !_ADVISOR_ANNOTATE_H_ || ANNOTATE_EXPAND_NULL */
+
+/**
+ * @brief ANNOTATE_SITE_BEGIN/ANNOTATE_SITE_END support.
+ *
+ * site_begin/end model a potential concurrency site.
+ * site instances may be recursively nested with themselves.
+ * site_end exits the most recently started but unended site for the current
+ * thread. The handle passed to end may be used to validate structure.
+ * Instances of a site encountered on different threads concurrently
+ * are considered completely distinct. If the site name for two different
+ * lexical sites match, it is unspecified whether they are treated as the
+ * same or different for data presentation.
+ */
+void ITTAPI __itt_model_site_begin(__itt_model_site *site, __itt_model_site_instance *instance, const char *name);
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+void ITTAPI __itt_model_site_beginW(const wchar_t *name);
+#endif
+void ITTAPI __itt_model_site_beginA(const char *name);
+void ITTAPI __itt_model_site_beginAL(const char *name, size_t siteNameLen);
+void ITTAPI __itt_model_site_end (__itt_model_site *site, __itt_model_site_instance *instance);
+void ITTAPI __itt_model_site_end_2(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, model_site_begin, (__itt_model_site *site, __itt_model_site_instance *instance, const char *name))
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, model_site_beginW, (const wchar_t *name))
+#endif
+ITT_STUBV(ITTAPI, void, model_site_beginA, (const char *name))
+ITT_STUBV(ITTAPI, void, model_site_beginAL, (const char *name, size_t siteNameLen))
+ITT_STUBV(ITTAPI, void, model_site_end, (__itt_model_site *site, __itt_model_site_instance *instance))
+ITT_STUBV(ITTAPI, void, model_site_end_2, (void))
+#define __itt_model_site_begin ITTNOTIFY_VOID(model_site_begin)
+#define __itt_model_site_begin_ptr ITTNOTIFY_NAME(model_site_begin)
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_model_site_beginW ITTNOTIFY_VOID(model_site_beginW)
+#define __itt_model_site_beginW_ptr ITTNOTIFY_NAME(model_site_beginW)
+#endif
+#define __itt_model_site_beginA ITTNOTIFY_VOID(model_site_beginA)
+#define __itt_model_site_beginA_ptr ITTNOTIFY_NAME(model_site_beginA)
+#define __itt_model_site_beginAL ITTNOTIFY_VOID(model_site_beginAL)
+#define __itt_model_site_beginAL_ptr ITTNOTIFY_NAME(model_site_beginAL)
+#define __itt_model_site_end ITTNOTIFY_VOID(model_site_end)
+#define __itt_model_site_end_ptr ITTNOTIFY_NAME(model_site_end)
+#define __itt_model_site_end_2 ITTNOTIFY_VOID(model_site_end_2)
+#define __itt_model_site_end_2_ptr ITTNOTIFY_NAME(model_site_end_2)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_model_site_begin(site, instance, name)
+#define __itt_model_site_begin_ptr 0
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_model_site_beginW(name)
+#define __itt_model_site_beginW_ptr 0
+#endif
+#define __itt_model_site_beginA(name)
+#define __itt_model_site_beginA_ptr 0
+#define __itt_model_site_beginAL(name, siteNameLen)
+#define __itt_model_site_beginAL_ptr 0
+#define __itt_model_site_end(site, instance)
+#define __itt_model_site_end_ptr 0
+#define __itt_model_site_end_2()
+#define __itt_model_site_end_2_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_model_site_begin_ptr 0
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_model_site_beginW_ptr 0
+#endif
+#define __itt_model_site_beginA_ptr 0
+#define __itt_model_site_beginAL_ptr 0
+#define __itt_model_site_end_ptr 0
+#define __itt_model_site_end_2_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
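+
+/*
+ * Illustrative usage sketch (not part of the original header): modeling a
+ * loop as a potential parallel site. As described above, the lexical and
+ * instance handles are zero-initialized statics that the API fills in on
+ * first use; `n` and `process(i)` are hypothetical user code.
+ *
+ *     static __itt_model_site          site_handle;
+ *     static __itt_model_site_instance site_instance;
+ *
+ *     __itt_model_site_begin(&site_handle, &site_instance, "process_loop");
+ *     for (int i = 0; i < n; ++i)
+ *         process(i);
+ *     __itt_model_site_end(&site_handle, &site_instance);
+ */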
+
+/**
+ * @brief ANNOTATE_TASK_BEGIN/ANNOTATE_TASK_END support
+ *
+ * task_begin/end model a potential task, which is contained within the most
+ * closely enclosing dynamic site. task_end exits the most recently started
+ * but unended task. The handle passed to end may be used to validate
+ * structure. It is unspecified if bad dynamic nesting is detected. If it
+ * is, it should be encoded in the resulting data collection. The collector
+ * should not fail due to construct nesting issues, nor attempt to directly
+ * indicate the problem.
+ */
+void ITTAPI __itt_model_task_begin(__itt_model_task *task, __itt_model_task_instance *instance, const char *name);
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+void ITTAPI __itt_model_task_beginW(const wchar_t *name);
+void ITTAPI __itt_model_iteration_taskW(const wchar_t *name);
+#endif
+void ITTAPI __itt_model_task_beginA(const char *name);
+void ITTAPI __itt_model_task_beginAL(const char *name, size_t taskNameLen);
+void ITTAPI __itt_model_iteration_taskA(const char *name);
+void ITTAPI __itt_model_iteration_taskAL(const char *name, size_t taskNameLen);
+void ITTAPI __itt_model_task_end (__itt_model_task *task, __itt_model_task_instance *instance);
+void ITTAPI __itt_model_task_end_2(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, model_task_begin, (__itt_model_task *task, __itt_model_task_instance *instance, const char *name))
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, model_task_beginW, (const wchar_t *name))
+ITT_STUBV(ITTAPI, void, model_iteration_taskW, (const wchar_t *name))
+#endif
+ITT_STUBV(ITTAPI, void, model_task_beginA, (const char *name))
+ITT_STUBV(ITTAPI, void, model_task_beginAL, (const char *name, size_t taskNameLen))
+ITT_STUBV(ITTAPI, void, model_iteration_taskA, (const char *name))
+ITT_STUBV(ITTAPI, void, model_iteration_taskAL, (const char *name, size_t taskNameLen))
+ITT_STUBV(ITTAPI, void, model_task_end, (__itt_model_task *task, __itt_model_task_instance *instance))
+ITT_STUBV(ITTAPI, void, model_task_end_2, (void))
+#define __itt_model_task_begin ITTNOTIFY_VOID(model_task_begin)
+#define __itt_model_task_begin_ptr ITTNOTIFY_NAME(model_task_begin)
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_model_task_beginW ITTNOTIFY_VOID(model_task_beginW)
+#define __itt_model_task_beginW_ptr ITTNOTIFY_NAME(model_task_beginW)
+#define __itt_model_iteration_taskW ITTNOTIFY_VOID(model_iteration_taskW)
+#define __itt_model_iteration_taskW_ptr ITTNOTIFY_NAME(model_iteration_taskW)
+#endif
+#define __itt_model_task_beginA ITTNOTIFY_VOID(model_task_beginA)
+#define __itt_model_task_beginA_ptr ITTNOTIFY_NAME(model_task_beginA)
+#define __itt_model_task_beginAL ITTNOTIFY_VOID(model_task_beginAL)
+#define __itt_model_task_beginAL_ptr ITTNOTIFY_NAME(model_task_beginAL)
+#define __itt_model_iteration_taskA ITTNOTIFY_VOID(model_iteration_taskA)
+#define __itt_model_iteration_taskA_ptr ITTNOTIFY_NAME(model_iteration_taskA)
+#define __itt_model_iteration_taskAL ITTNOTIFY_VOID(model_iteration_taskAL)
+#define __itt_model_iteration_taskAL_ptr ITTNOTIFY_NAME(model_iteration_taskAL)
+#define __itt_model_task_end ITTNOTIFY_VOID(model_task_end)
+#define __itt_model_task_end_ptr ITTNOTIFY_NAME(model_task_end)
+#define __itt_model_task_end_2 ITTNOTIFY_VOID(model_task_end_2)
+#define __itt_model_task_end_2_ptr ITTNOTIFY_NAME(model_task_end_2)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_model_task_begin(task, instance, name)
+#define __itt_model_task_begin_ptr 0
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_model_task_beginW(name)
+#define __itt_model_task_beginW_ptr 0
+#endif
+#define __itt_model_task_beginA(name)
+#define __itt_model_task_beginA_ptr 0
+#define __itt_model_task_beginAL(name, siteNameLen)
+#define __itt_model_task_beginAL_ptr 0
+#define __itt_model_iteration_taskA(name)
+#define __itt_model_iteration_taskA_ptr 0
+#define __itt_model_iteration_taskAL(name, siteNameLen)
+#define __itt_model_iteration_taskAL_ptr 0
+#define __itt_model_task_end(task, instance)
+#define __itt_model_task_end_ptr 0
+#define __itt_model_task_end_2()
+#define __itt_model_task_end_2_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_model_task_begin_ptr 0
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_model_task_beginW_ptr 0
+#endif
+#define __itt_model_task_beginA_ptr 0
+#define __itt_model_task_beginAL_ptr 0
+#define __itt_model_iteration_taskA_ptr 0
+#define __itt_model_iteration_taskAL_ptr 0
+#define __itt_model_task_end_ptr 0
+#define __itt_model_task_end_2_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
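+
+/*
+ * Illustrative usage sketch (not part of the original header): marking each
+ * iteration of the site from the previous sketch as a potential task, so the
+ * modeling tool can evaluate running the iterations concurrently. The task
+ * handles are zero-initialized statics, as for sites; `process(i)` is a
+ * hypothetical user function.
+ *
+ *     static __itt_model_task          task_handle;
+ *     static __itt_model_task_instance task_instance;
+ *
+ *     __itt_model_site_begin(&site_handle, &site_instance, "process_loop");
+ *     for (int i = 0; i < n; ++i) {
+ *         __itt_model_task_begin(&task_handle, &task_instance, "process_iter");
+ *         process(i);
+ *         __itt_model_task_end(&task_handle, &task_instance);
+ *     }
+ *     __itt_model_site_end(&site_handle, &site_instance);
+ */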
+
+/**
+ * @brief ANNOTATE_LOCK_ACQUIRE/ANNOTATE_LOCK_RELEASE support
+ *
+ * lock_acquire/release model a potential lock for both lockset and
+ * performance modeling. Each unique address is modeled as a separate
+ * lock, with invalid addresses being valid lock IDs. Specifically:
+ * no storage is accessed by the API at the specified address - it is only
+ * used for lock identification. Lock acquires may be self-nested and are
+ * unlocked by a corresponding number of releases.
+ * (These closely correspond to __itt_sync_acquired/__itt_sync_releasing,
+ * but may not have identical semantics.)
+ */
+void ITTAPI __itt_model_lock_acquire(void *lock);
+void ITTAPI __itt_model_lock_acquire_2(void *lock);
+void ITTAPI __itt_model_lock_release(void *lock);
+void ITTAPI __itt_model_lock_release_2(void *lock);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, model_lock_acquire, (void *lock))
+ITT_STUBV(ITTAPI, void, model_lock_acquire_2, (void *lock))
+ITT_STUBV(ITTAPI, void, model_lock_release, (void *lock))
+ITT_STUBV(ITTAPI, void, model_lock_release_2, (void *lock))
+#define __itt_model_lock_acquire ITTNOTIFY_VOID(model_lock_acquire)
+#define __itt_model_lock_acquire_ptr ITTNOTIFY_NAME(model_lock_acquire)
+#define __itt_model_lock_acquire_2 ITTNOTIFY_VOID(model_lock_acquire_2)
+#define __itt_model_lock_acquire_2_ptr ITTNOTIFY_NAME(model_lock_acquire_2)
+#define __itt_model_lock_release ITTNOTIFY_VOID(model_lock_release)
+#define __itt_model_lock_release_ptr ITTNOTIFY_NAME(model_lock_release)
+#define __itt_model_lock_release_2 ITTNOTIFY_VOID(model_lock_release_2)
+#define __itt_model_lock_release_2_ptr ITTNOTIFY_NAME(model_lock_release_2)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_model_lock_acquire(lock)
+#define __itt_model_lock_acquire_ptr 0
+#define __itt_model_lock_acquire_2(lock)
+#define __itt_model_lock_acquire_2_ptr 0
+#define __itt_model_lock_release(lock)
+#define __itt_model_lock_release_ptr 0
+#define __itt_model_lock_release_2(lock)
+#define __itt_model_lock_release_2_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_model_lock_acquire_ptr 0
+#define __itt_model_lock_acquire_2_ptr 0
+#define __itt_model_lock_release_ptr 0
+#define __itt_model_lock_release_2_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
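+
+/*
+ * Illustrative usage sketch (not part of the original header): describing a
+ * potential lock to the modeling tool. Only the address identifies the lock;
+ * no storage at that address is touched by the API. The `account` object and
+ * its update are hypothetical user code.
+ *
+ *     __itt_model_lock_acquire(&account);     // a lock would be taken here
+ *     account.balance += amount;              // ...protecting this update
+ *     __itt_model_lock_release(&account);     // ...and released here
+ */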
+
+/**
+ * @brief ANNOTATE_RECORD_ALLOCATION/ANNOTATE_RECORD_DEALLOCATION support
+ *
+ * record_allocation/deallocation describe user-defined memory allocator
+ * behavior, which may be required for correctness modeling to understand
+ * when storage is not expected to be actually reused across threads.
+ */
+void ITTAPI __itt_model_record_allocation (void *addr, size_t size);
+void ITTAPI __itt_model_record_deallocation(void *addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, model_record_allocation, (void *addr, size_t size))
+ITT_STUBV(ITTAPI, void, model_record_deallocation, (void *addr))
+#define __itt_model_record_allocation ITTNOTIFY_VOID(model_record_allocation)
+#define __itt_model_record_allocation_ptr ITTNOTIFY_NAME(model_record_allocation)
+#define __itt_model_record_deallocation ITTNOTIFY_VOID(model_record_deallocation)
+#define __itt_model_record_deallocation_ptr ITTNOTIFY_NAME(model_record_deallocation)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_model_record_allocation(addr, size)
+#define __itt_model_record_allocation_ptr 0
+#define __itt_model_record_deallocation(addr)
+#define __itt_model_record_deallocation_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_model_record_allocation_ptr 0
+#define __itt_model_record_deallocation_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
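+
+/*
+ * Illustrative usage sketch (not part of the original header): a user pool
+ * allocator telling the correctness model when a block is handed out and
+ * returned, so recycled storage is not mistaken for a cross-thread conflict.
+ * `pool_take` and `pool_give_back` are hypothetical user functions.
+ *
+ *     void *block = pool_take(&pool, size);
+ *     __itt_model_record_allocation(block, size);
+ *     // ...use block...
+ *     __itt_model_record_deallocation(block);
+ *     pool_give_back(&pool, block);
+ */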
+
+/**
+ * @brief ANNOTATE_INDUCTION_USES support
+ *
+ * Note that particular storage is inductive through the end of the current site
+ */
+void ITTAPI __itt_model_induction_uses(void* addr, size_t size);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, model_induction_uses, (void *addr, size_t size))
+#define __itt_model_induction_uses ITTNOTIFY_VOID(model_induction_uses)
+#define __itt_model_induction_uses_ptr ITTNOTIFY_NAME(model_induction_uses)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_model_induction_uses(addr, size)
+#define __itt_model_induction_uses_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_model_induction_uses_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief ANNOTATE_REDUCTION_USES support
+ *
+ * Note that particular storage is used for reduction through the end
+ * of the current site
+ */
+void ITTAPI __itt_model_reduction_uses(void* addr, size_t size);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, model_reduction_uses, (void *addr, size_t size))
+#define __itt_model_reduction_uses ITTNOTIFY_VOID(model_reduction_uses)
+#define __itt_model_reduction_uses_ptr ITTNOTIFY_NAME(model_reduction_uses)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_model_reduction_uses(addr, size)
+#define __itt_model_reduction_uses_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_model_reduction_uses_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief ANNOTATE_OBSERVE_USES support
+ *
+ * Have correctness modeling record observations about uses of storage
+ * through the end of the current site
+ */
+void ITTAPI __itt_model_observe_uses(void* addr, size_t size);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, model_observe_uses, (void *addr, size_t size))
+#define __itt_model_observe_uses ITTNOTIFY_VOID(model_observe_uses)
+#define __itt_model_observe_uses_ptr ITTNOTIFY_NAME(model_observe_uses)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_model_observe_uses(addr, size)
+#define __itt_model_observe_uses_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_model_observe_uses_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief ANNOTATE_CLEAR_USES support
+ *
+ * Clear the special handling of a piece of storage related to induction,
+ * reduction or observe_uses
+ */
+void ITTAPI __itt_model_clear_uses(void* addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, model_clear_uses, (void *addr))
+#define __itt_model_clear_uses ITTNOTIFY_VOID(model_clear_uses)
+#define __itt_model_clear_uses_ptr ITTNOTIFY_NAME(model_clear_uses)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_model_clear_uses(addr)
+#define __itt_model_clear_uses_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_model_clear_uses_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
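+
+/*
+ * Illustrative usage sketch (not part of the original header): telling the
+ * correctness model that `sum` is a reduction inside the modeled site, and
+ * clearing that special handling afterwards. `sum`, `n` and `data` are
+ * hypothetical user variables.
+ *
+ *     double sum = 0.0;
+ *     __itt_model_reduction_uses(&sum, sizeof(sum));
+ *     for (int i = 0; i < n; ++i)
+ *         sum += data[i];               // would be a reduction when parallel
+ *     __itt_model_clear_uses(&sum);
+ */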
+
+/**
+ * @brief ANNOTATE_DISABLE_*_PUSH/ANNOTATE_DISABLE_*_POP support
+ *
+ * disable_push/disable_pop push and pop disabling based on a parameter.
+ * Disabling observations stops processing of memory references during
+ * correctness modeling, and all annotations that occur in the disabled
+ * region. This allows description of code that is expected to be handled
+ * specially during conversion to parallelism or that is not recognized
+ * by tools (e.g. some kinds of synchronization operations.)
+ * This mechanism causes all annotations in the disabled region, other
+ * than disable_push and disable_pop, to be ignored. (For example, this
+ * might validly be used to disable an entire parallel site and the contained
+ * tasks and locking in it for data collection purposes.)
+ * The disable for collection is a more expensive operation, but reduces
+ * collector overhead significantly. This applies to BOTH correctness data
+ * collection and performance data collection. For example, a site
+ * containing a task might only enable data collection for the first 10
+ * iterations. Both performance and correctness data should reflect this,
+ * and the program should run as close to full speed as possible when
+ * collection is disabled.
+ */
+void ITTAPI __itt_model_disable_push(__itt_model_disable x);
+void ITTAPI __itt_model_disable_pop(void);
+void ITTAPI __itt_model_aggregate_task(size_t x);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, model_disable_push, (__itt_model_disable x))
+ITT_STUBV(ITTAPI, void, model_disable_pop, (void))
+ITT_STUBV(ITTAPI, void, model_aggregate_task, (size_t x))
+#define __itt_model_disable_push ITTNOTIFY_VOID(model_disable_push)
+#define __itt_model_disable_push_ptr ITTNOTIFY_NAME(model_disable_push)
+#define __itt_model_disable_pop ITTNOTIFY_VOID(model_disable_pop)
+#define __itt_model_disable_pop_ptr ITTNOTIFY_NAME(model_disable_pop)
+#define __itt_model_aggregate_task ITTNOTIFY_VOID(model_aggregate_task)
+#define __itt_model_aggregate_task_ptr ITTNOTIFY_NAME(model_aggregate_task)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_model_disable_push(x)
+#define __itt_model_disable_push_ptr 0
+#define __itt_model_disable_pop()
+#define __itt_model_disable_pop_ptr 0
+#define __itt_model_aggregate_task(x)
+#define __itt_model_aggregate_task_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_model_disable_push_ptr 0
+#define __itt_model_disable_pop_ptr 0
+#define __itt_model_aggregate_task_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
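+
+/*
+ * Illustrative usage sketch (not part of the original header): following the
+ * example in the text above, collection is disabled after the first 10
+ * iterations so the rest of the loop runs at close to full speed. `n` and the
+ * annotated work are hypothetical.
+ *
+ *     for (int i = 0; i < n; ++i) {
+ *         if (i == 10)
+ *             __itt_model_disable_push(__itt_model_disable_collection);
+ *         // ...annotated site/task work...
+ *     }
+ *     if (n > 10)
+ *         __itt_model_disable_pop();
+ */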
+/** @} model group */
+
+/**
+ * @defgroup heap Heap
+ * @ingroup public
+ * Heap group
+ * @{
+ */
+
+typedef void* __itt_heap_function;
+
+/**
+ * @brief Create an identification for a heap function
+ * @return non-zero identifier or NULL
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+__itt_heap_function ITTAPI __itt_heap_function_createA(const char* name, const char* domain);
+__itt_heap_function ITTAPI __itt_heap_function_createW(const wchar_t* name, const wchar_t* domain);
+#if defined(UNICODE) || defined(_UNICODE)
+# define __itt_heap_function_create __itt_heap_function_createW
+# define __itt_heap_function_create_ptr __itt_heap_function_createW_ptr
+#else
+# define __itt_heap_function_create __itt_heap_function_createA
+# define __itt_heap_function_create_ptr __itt_heap_function_createA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+__itt_heap_function ITTAPI __itt_heap_function_create(const char* name, const char* domain);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_heap_function, heap_function_createA, (const char* name, const char* domain))
+ITT_STUB(ITTAPI, __itt_heap_function, heap_function_createW, (const wchar_t* name, const wchar_t* domain))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_heap_function, heap_function_create, (const char* name, const char* domain))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_heap_function_createA ITTNOTIFY_DATA(heap_function_createA)
+#define __itt_heap_function_createA_ptr ITTNOTIFY_NAME(heap_function_createA)
+#define __itt_heap_function_createW ITTNOTIFY_DATA(heap_function_createW)
+#define __itt_heap_function_createW_ptr ITTNOTIFY_NAME(heap_function_createW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_heap_function_create ITTNOTIFY_DATA(heap_function_create)
+#define __itt_heap_function_create_ptr ITTNOTIFY_NAME(heap_function_create)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_heap_function_createA(name, domain) (__itt_heap_function)0
+#define __itt_heap_function_createA_ptr 0
+#define __itt_heap_function_createW(name, domain) (__itt_heap_function)0
+#define __itt_heap_function_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_heap_function_create(name, domain) (__itt_heap_function)0
+#define __itt_heap_function_create_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_heap_function_createA_ptr 0
+#define __itt_heap_function_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_heap_function_create_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Record an allocation begin occurrence.
+ */
+void ITTAPI __itt_heap_allocate_begin(__itt_heap_function h, size_t size, int initialized);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, heap_allocate_begin, (__itt_heap_function h, size_t size, int initialized))
+#define __itt_heap_allocate_begin ITTNOTIFY_VOID(heap_allocate_begin)
+#define __itt_heap_allocate_begin_ptr ITTNOTIFY_NAME(heap_allocate_begin)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_heap_allocate_begin(h, size, initialized)
+#define __itt_heap_allocate_begin_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_allocate_begin_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Record an allocation end occurrence.
+ */
+void ITTAPI __itt_heap_allocate_end(__itt_heap_function h, void** addr, size_t size, int initialized);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, heap_allocate_end, (__itt_heap_function h, void** addr, size_t size, int initialized))
+#define __itt_heap_allocate_end ITTNOTIFY_VOID(heap_allocate_end)
+#define __itt_heap_allocate_end_ptr ITTNOTIFY_NAME(heap_allocate_end)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_heap_allocate_end(h, addr, size, initialized)
+#define __itt_heap_allocate_end_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_allocate_end_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Record a free begin occurrence.
+ */
+void ITTAPI __itt_heap_free_begin(__itt_heap_function h, void* addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, heap_free_begin, (__itt_heap_function h, void* addr))
+#define __itt_heap_free_begin ITTNOTIFY_VOID(heap_free_begin)
+#define __itt_heap_free_begin_ptr ITTNOTIFY_NAME(heap_free_begin)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_heap_free_begin(h, addr)
+#define __itt_heap_free_begin_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_free_begin_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Record a free end occurrence.
+ */
+void ITTAPI __itt_heap_free_end(__itt_heap_function h, void* addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, heap_free_end, (__itt_heap_function h, void* addr))
+#define __itt_heap_free_end ITTNOTIFY_VOID(heap_free_end)
+#define __itt_heap_free_end_ptr ITTNOTIFY_NAME(heap_free_end)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_heap_free_end(h, addr)
+#define __itt_heap_free_end_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_free_end_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
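+
+/*
+ * Illustrative usage sketch (not part of the original header): wrapping a
+ * custom allocator so heap analysis tools can track it like malloc. The heap
+ * function handle is created once; the names "my_malloc"/"my_allocator" and
+ * the `pool_alloc`/`pool_free` backing calls are hypothetical.
+ *
+ *     static __itt_heap_function h;          // created lazily below
+ *
+ *     void *my_malloc(size_t size)
+ *     {
+ *         if (h == NULL)
+ *             h = __itt_heap_function_create("my_malloc", "my_allocator");
+ *         __itt_heap_allocate_begin(h, size, 0);   // 0: memory not initialized
+ *         void *p = pool_alloc(size);
+ *         __itt_heap_allocate_end(h, &p, size, 0);
+ *         return p;
+ *     }
+ *
+ *     void my_free(void *p)
+ *     {
+ *         __itt_heap_free_begin(h, p);
+ *         pool_free(p);
+ *         __itt_heap_free_end(h, p);
+ *     }
+ */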
+
+/**
+ * @brief Record a reallocation begin occurrence.
+ */
+void ITTAPI __itt_heap_reallocate_begin(__itt_heap_function h, void* addr, size_t new_size, int initialized);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, heap_reallocate_begin, (__itt_heap_function h, void* addr, size_t new_size, int initialized))
+#define __itt_heap_reallocate_begin ITTNOTIFY_VOID(heap_reallocate_begin)
+#define __itt_heap_reallocate_begin_ptr ITTNOTIFY_NAME(heap_reallocate_begin)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_heap_reallocate_begin(h, addr, new_size, initialized)
+#define __itt_heap_reallocate_begin_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_reallocate_begin_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Record a reallocation end occurrence.
+ */
+void ITTAPI __itt_heap_reallocate_end(__itt_heap_function h, void* addr, void** new_addr, size_t new_size, int initialized);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, heap_reallocate_end, (__itt_heap_function h, void* addr, void** new_addr, size_t new_size, int initialized))
+#define __itt_heap_reallocate_end ITTNOTIFY_VOID(heap_reallocate_end)
+#define __itt_heap_reallocate_end_ptr ITTNOTIFY_NAME(heap_reallocate_end)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_heap_reallocate_end(h, addr, new_addr, new_size, initialized)
+#define __itt_heap_reallocate_end_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_reallocate_end_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/** @brief internal access begin */
+void ITTAPI __itt_heap_internal_access_begin(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, heap_internal_access_begin, (void))
+#define __itt_heap_internal_access_begin ITTNOTIFY_VOID(heap_internal_access_begin)
+#define __itt_heap_internal_access_begin_ptr ITTNOTIFY_NAME(heap_internal_access_begin)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_heap_internal_access_begin()
+#define __itt_heap_internal_access_begin_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_internal_access_begin_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/** @brief internal access end */
+void ITTAPI __itt_heap_internal_access_end(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, heap_internal_access_end, (void))
+#define __itt_heap_internal_access_end ITTNOTIFY_VOID(heap_internal_access_end)
+#define __itt_heap_internal_access_end_ptr ITTNOTIFY_NAME(heap_internal_access_end)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_heap_internal_access_end()
+#define __itt_heap_internal_access_end_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_internal_access_end_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/** @brief record memory growth begin */
+void ITTAPI __itt_heap_record_memory_growth_begin(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, heap_record_memory_growth_begin, (void))
+#define __itt_heap_record_memory_growth_begin ITTNOTIFY_VOID(heap_record_memory_growth_begin)
+#define __itt_heap_record_memory_growth_begin_ptr ITTNOTIFY_NAME(heap_record_memory_growth_begin)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_heap_record_memory_growth_begin()
+#define __itt_heap_record_memory_growth_begin_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_record_memory_growth_begin_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/** @brief record memory growth end */
+void ITTAPI __itt_heap_record_memory_growth_end(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, heap_record_memory_growth_end, (void))
+#define __itt_heap_record_memory_growth_end ITTNOTIFY_VOID(heap_record_memory_growth_end)
+#define __itt_heap_record_memory_growth_end_ptr ITTNOTIFY_NAME(heap_record_memory_growth_end)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_heap_record_memory_growth_end()
+#define __itt_heap_record_memory_growth_end_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_record_memory_growth_end_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Specify the type of heap detection/reporting to modify.
+ */
+/**
+ * @hideinitializer
+ * @brief Report on memory leaks.
+ */
+#define __itt_heap_leaks 0x00000001
+
+/**
+ * @hideinitializer
+ * @brief Report on memory growth.
+ */
+#define __itt_heap_growth 0x00000002
+
+
+/** @brief heap reset detection */
+void ITTAPI __itt_heap_reset_detection(unsigned int reset_mask);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, heap_reset_detection, (unsigned int reset_mask))
+#define __itt_heap_reset_detection ITTNOTIFY_VOID(heap_reset_detection)
+#define __itt_heap_reset_detection_ptr ITTNOTIFY_NAME(heap_reset_detection)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_heap_reset_detection(reset_mask)
+#define __itt_heap_reset_detection_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_reset_detection_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/** @brief Report heap state for the detection types selected in record_mask */
+void ITTAPI __itt_heap_record(unsigned int record_mask);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, heap_record, (unsigned int record_mask))
+#define __itt_heap_record ITTNOTIFY_VOID(heap_record)
+#define __itt_heap_record_ptr ITTNOTIFY_NAME(heap_record)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_heap_record(record_mask)
+#define __itt_heap_record_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_record_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
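+
+/*
+ * Illustrative usage sketch (not part of the original header): resetting leak
+ * and growth tracking before a phase of interest and asking for a report
+ * afterwards. `run_phase()` is a hypothetical workload; the exact reporting
+ * behaviour depends on the analysis tool.
+ *
+ *     __itt_heap_reset_detection(__itt_heap_leaks | __itt_heap_growth);
+ *     run_phase();
+ *     __itt_heap_record(__itt_heap_leaks | __itt_heap_growth);
+ */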
+
+/** @} heap group */
+/** @endcond */
+/* ========================================================================== */
+
+/**
+ * @defgroup domains Domains
+ * @ingroup public
+ * Domains group
+ * @{
+ */
+
+/** @cond exclude_from_documentation */
+#pragma pack(push, 8)
+
+typedef struct ___itt_domain
+{
+ volatile int flags; /*!< Zero if disabled, non-zero if enabled. The meaning of different non-zero values is reserved to the runtime */
+ const char* nameA; /*!< Copy of original name in ASCII. */
+#if defined(UNICODE) || defined(_UNICODE)
+ const wchar_t* nameW; /*!< Copy of original name in UNICODE. */
+#else /* UNICODE || _UNICODE */
+ void* nameW;
+#endif /* UNICODE || _UNICODE */
+ int extra1; /*!< Reserved to the runtime */
+ void* extra2; /*!< Reserved to the runtime */
+ struct ___itt_domain* next;
+} __itt_domain;
+
+#pragma pack(pop)
+/** @endcond */
+
+/**
+ * @ingroup domains
+ * @brief Create a domain.
+ * Create a domain using a domain name; the URI naming style is recommended.
+ * Because the set of domains is expected to be static over the application's
+ * execution time, there is no mechanism to destroy a domain.
+ * Any domain can be accessed by any thread in the process, regardless of
+ * which thread created the domain. This call is thread-safe.
+ * @param[in] name name of domain
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+__itt_domain* ITTAPI __itt_domain_createA(const char *name);
+__itt_domain* ITTAPI __itt_domain_createW(const wchar_t *name);
+#if defined(UNICODE) || defined(_UNICODE)
+# define __itt_domain_create __itt_domain_createW
+# define __itt_domain_create_ptr __itt_domain_createW_ptr
+#else /* UNICODE */
+# define __itt_domain_create __itt_domain_createA
+# define __itt_domain_create_ptr __itt_domain_createA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+__itt_domain* ITTAPI __itt_domain_create(const char *name);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_domain*, domain_createA, (const char *name))
+ITT_STUB(ITTAPI, __itt_domain*, domain_createW, (const wchar_t *name))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_domain*, domain_create, (const char *name))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_domain_createA ITTNOTIFY_DATA(domain_createA)
+#define __itt_domain_createA_ptr ITTNOTIFY_NAME(domain_createA)
+#define __itt_domain_createW ITTNOTIFY_DATA(domain_createW)
+#define __itt_domain_createW_ptr ITTNOTIFY_NAME(domain_createW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_domain_create ITTNOTIFY_DATA(domain_create)
+#define __itt_domain_create_ptr ITTNOTIFY_NAME(domain_create)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_domain_createA(name) (__itt_domain*)0
+#define __itt_domain_createA_ptr 0
+#define __itt_domain_createW(name) (__itt_domain*)0
+#define __itt_domain_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_domain_create(name) (__itt_domain*)0
+#define __itt_domain_create_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_domain_createA_ptr 0
+#define __itt_domain_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_domain_create_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
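+
+/*
+ * Illustrative usage sketch (not part of the original header): domains are
+ * typically created once and cached, since there is no way to destroy them.
+ * The URI-style name "com.example.renderer" is a hypothetical example.
+ *
+ *     static __itt_domain* domain;
+ *     if (domain == NULL)
+ *         domain = __itt_domain_create("com.example.renderer");
+ */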
+/** @} domains group */
+
+/**
+ * @defgroup ids IDs
+ * @ingroup public
+ * IDs group
+ * @{
+ */
+
+/** @cond exclude_from_documentation */
+#pragma pack(push, 8)
+
+typedef struct ___itt_id
+{
+ unsigned long long d1, d2, d3;
+} __itt_id;
+
+#pragma pack(pop)
+/** @endcond */
+
+static const __itt_id __itt_null = { 0, 0, 0 };
+
+/**
+ * @ingroup ids
+ * @brief A convenience function to create an ID without domain control.
+ * This function initializes an __itt_id structure and does not affect the
+ * collector runtime in any way. After you make the ID with this function,
+ * you still must create it with the __itt_id_create function before using
+ * the ID to identify a named entity.
+ * @param[in] addr The address of the object; high QWORD of the ID value.
+ * @param[in] extra The extra data to uniquely identify the object; low QWORD of the ID value.
+ */
+
+ITT_INLINE __itt_id ITTAPI __itt_id_make(void* addr, unsigned long long extra) ITT_INLINE_ATTRIBUTE;
+ITT_INLINE __itt_id ITTAPI __itt_id_make(void* addr, unsigned long long extra)
+{
+ __itt_id id = __itt_null;
+ id.d1 = (unsigned long long)((uintptr_t)addr);
+ id.d2 = (unsigned long long)extra;
+ id.d3 = (unsigned long long)0; /* Reserved. Must be zero */
+ return id;
+}
+
+/**
+ * @ingroup ids
+ * @brief Create an instance of identifier.
+ * This establishes the beginning of the lifetime of an instance of
+ * the given ID in the trace. Once this lifetime starts, the ID
+ * can be used to tag named entity instances in calls such as
+ * __itt_task_begin, and to specify relationships among
+ * identified named entity instances, using the \ref relations APIs.
+ * Instance IDs are not domain specific!
+ * @param[in] domain The domain controlling the execution of this call.
+ * @param[in] id The ID to create.
+ */
+void ITTAPI __itt_id_create(const __itt_domain *domain, __itt_id id);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, id_create, (const __itt_domain *domain, __itt_id id))
+#define __itt_id_create(d,x) ITTNOTIFY_VOID_D1(id_create,d,x)
+#define __itt_id_create_ptr ITTNOTIFY_NAME(id_create)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_id_create(domain,id)
+#define __itt_id_create_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_id_create_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @ingroup ids
+ * @brief Destroy an instance of identifier.
+ * This ends the lifetime of the current instance of the given ID value in the trace.
+ * Any relationships that are established after this lifetime ends are invalid.
+ * This call must be performed before the given ID value can be reused for a different
+ * named entity instance.
+ * @param[in] domain The domain controlling the execution of this call.
+ * @param[in] id The ID to destroy.
+ */
+void ITTAPI __itt_id_destroy(const __itt_domain *domain, __itt_id id);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, id_destroy, (const __itt_domain *domain, __itt_id id))
+#define __itt_id_destroy(d,x) ITTNOTIFY_VOID_D1(id_destroy,d,x)
+#define __itt_id_destroy_ptr ITTNOTIFY_NAME(id_destroy)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_id_destroy(domain,id)
+#define __itt_id_destroy_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_id_destroy_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
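+
+/*
+ * Illustrative usage sketch (not part of the original header): giving an
+ * object instance an ID for the object's lifetime. `domain` is assumed to
+ * have been created with __itt_domain_create and `obj` is a hypothetical
+ * user object; its address is used for the ID and the extra field is left 0.
+ *
+ *     __itt_id id = __itt_id_make(obj, 0);
+ *     __itt_id_create(domain, id);        // ID lifetime begins
+ *     // ...tag tasks/relations with id...
+ *     __itt_id_destroy(domain, id);       // ID lifetime ends; id may be reused
+ */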
+/** @} ids group */
+
+/**
+ * @defgroup handles String Handles
+ * @ingroup public
+ * String Handles group
+ * @{
+ */
+
+/** @cond exclude_from_documentation */
+#pragma pack(push, 8)
+
+typedef struct ___itt_string_handle
+{
+ const char* strA; /*!< Copy of original string in ASCII. */
+#if defined(UNICODE) || defined(_UNICODE)
+ const wchar_t* strW; /*!< Copy of original string in UNICODE. */
+#else /* UNICODE || _UNICODE */
+ void* strW;
+#endif /* UNICODE || _UNICODE */
+ int extra1; /*!< Reserved. Must be zero */
+ void* extra2; /*!< Reserved. Must be zero */
+ struct ___itt_string_handle* next;
+} __itt_string_handle;
+
+#pragma pack(pop)
+/** @endcond */
+
+/**
+ * @ingroup handles
+ * @brief Create a string handle.
+ * Create and return handle value that can be associated with a string.
+ * Consecutive calls to __itt_string_handle_create with the same name
+ * return the same value. Because the set of string handles is expected to remain
+ * static during the application's execution time, there is no mechanism to destroy a string handle.
+ * Any string handle can be accessed by any thread in the process, regardless of which thread created
+ * the string handle. This call is thread-safe.
+ * @param[in] name The input string
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+__itt_string_handle* ITTAPI __itt_string_handle_createA(const char *name);
+__itt_string_handle* ITTAPI __itt_string_handle_createW(const wchar_t *name);
+#if defined(UNICODE) || defined(_UNICODE)
+# define __itt_string_handle_create __itt_string_handle_createW
+# define __itt_string_handle_create_ptr __itt_string_handle_createW_ptr
+#else /* UNICODE */
+# define __itt_string_handle_create __itt_string_handle_createA
+# define __itt_string_handle_create_ptr __itt_string_handle_createA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+__itt_string_handle* ITTAPI __itt_string_handle_create(const char *name);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_string_handle*, string_handle_createA, (const char *name))
+ITT_STUB(ITTAPI, __itt_string_handle*, string_handle_createW, (const wchar_t *name))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_string_handle*, string_handle_create, (const char *name))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_string_handle_createA ITTNOTIFY_DATA(string_handle_createA)
+#define __itt_string_handle_createA_ptr ITTNOTIFY_NAME(string_handle_createA)
+#define __itt_string_handle_createW ITTNOTIFY_DATA(string_handle_createW)
+#define __itt_string_handle_createW_ptr ITTNOTIFY_NAME(string_handle_createW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_string_handle_create ITTNOTIFY_DATA(string_handle_create)
+#define __itt_string_handle_create_ptr ITTNOTIFY_NAME(string_handle_create)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_string_handle_createA(name) (__itt_string_handle*)0
+#define __itt_string_handle_createA_ptr 0
+#define __itt_string_handle_createW(name) (__itt_string_handle*)0
+#define __itt_string_handle_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_string_handle_create(name) (__itt_string_handle*)0
+#define __itt_string_handle_create_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_string_handle_createA_ptr 0
+#define __itt_string_handle_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_string_handle_create_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
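+
+/*
+ * Illustrative usage sketch (not part of the original header): string handles
+ * are created once and reused; as described above, repeated calls with the
+ * same name return the same handle, so caching it in a static is the usual
+ * idiom. The name "compute" is a hypothetical example.
+ *
+ *     static __itt_string_handle* name;
+ *     if (name == NULL)
+ *         name = __itt_string_handle_create("compute");
+ */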
+/** @} handles group */
+
+/** @cond exclude_from_documentation */
+typedef unsigned long long __itt_timestamp;
+/** @endcond */
+
+#define __itt_timestamp_none ((__itt_timestamp)-1LL)
+
+/** @cond exclude_from_gpa_documentation */
+
+/**
+ * @ingroup timestamps
+ * @brief Return timestamp corresponding to the current moment.
+ * This returns the timestamp in the format that is the most relevant for the current
+ * host or platform (RDTSC, QPC, and others). You can use the "<" operator to
+ * compare __itt_timestamp values.
+ */
+__itt_timestamp ITTAPI __itt_get_timestamp(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(ITTAPI, __itt_timestamp, get_timestamp, (void))
+#define __itt_get_timestamp ITTNOTIFY_DATA(get_timestamp)
+#define __itt_get_timestamp_ptr ITTNOTIFY_NAME(get_timestamp)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_get_timestamp() ((__itt_timestamp)0)
+#define __itt_get_timestamp_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_get_timestamp_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} timestamps */
+/** @endcond */
+
+/** @cond exclude_from_gpa_documentation */
+
+/**
+ * @defgroup regions Regions
+ * @ingroup public
+ * Regions group
+ * @{
+ */
+/**
+ * @ingroup regions
+ * @brief Begin of region instance.
+ * Successive calls to __itt_region_begin with the same ID are ignored
+ * until a call to __itt_region_end with the same ID
+ * @param[in] domain The domain for this region instance
+ * @param[in] id The instance ID for this region instance. Must not be __itt_null
+ * @param[in] parentid The instance ID for the parent of this region instance, or __itt_null
+ * @param[in] name The name of this region
+ */
+void ITTAPI __itt_region_begin(const __itt_domain *domain, __itt_id id, __itt_id parentid, __itt_string_handle *name);
+
+/**
+ * @ingroup regions
+ * @brief End of region instance.
+ * The first call to __itt_region_end with a given ID ends the
+ * region. Successive calls with the same ID are ignored, as are
+ * calls that do not have a matching __itt_region_begin call.
+ * @param[in] domain The domain for this region instance
+ * @param[in] id The instance ID for this region instance
+ */
+void ITTAPI __itt_region_end(const __itt_domain *domain, __itt_id id);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, region_begin, (const __itt_domain *domain, __itt_id id, __itt_id parentid, __itt_string_handle *name))
+ITT_STUBV(ITTAPI, void, region_end, (const __itt_domain *domain, __itt_id id))
+#define __itt_region_begin(d,x,y,z) ITTNOTIFY_VOID_D3(region_begin,d,x,y,z)
+#define __itt_region_begin_ptr ITTNOTIFY_NAME(region_begin)
+#define __itt_region_end(d,x) ITTNOTIFY_VOID_D1(region_end,d,x)
+#define __itt_region_end_ptr ITTNOTIFY_NAME(region_end)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_region_begin(d,x,y,z)
+#define __itt_region_begin_ptr 0
+#define __itt_region_end(d,x)
+#define __itt_region_end_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_region_begin_ptr 0
+#define __itt_region_end_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
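+
+/*
+ * Illustrative usage sketch (not part of the original header): bracketing a
+ * long-running activity as a region instance. `domain` and `name` are
+ * assumed to have been created as in the domain and string-handle sketches
+ * above; `request` and `handle_request()` are hypothetical user code.
+ *
+ *     __itt_id region_id = __itt_id_make(&request, 0);
+ *     __itt_region_begin(domain, region_id, __itt_null, name);
+ *     handle_request(&request);
+ *     __itt_region_end(domain, region_id);
+ */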
+/** @} regions group */
+
+/**
+ * @defgroup frames Frames
+ * @ingroup public
+ * Frames are similar to regions, but are intended to be easier to use and to implement.
+ * In particular:
+ * - Frames always represent periods of elapsed time
+ * - By default, frames have no nesting relationships
+ * @{
+ */
+
+/**
+ * @ingroup frames
+ * @brief Begin a frame instance.
+ * Successive calls to __itt_frame_begin with the
+ * same ID are ignored until a call to __itt_frame_end with the same ID.
+ * @param[in] domain The domain for this frame instance
+ * @param[in] id The instance ID for this frame instance or NULL
+ */
+void ITTAPI __itt_frame_begin_v3(const __itt_domain *domain, __itt_id *id);
+
+/**
+ * @ingroup frames
+ * @brief End a frame instance.
+ * The first call to __itt_frame_end with a given ID
+ * ends the frame. Successive calls with the same ID are ignored, as are
+ * calls that do not have a matching __itt_frame_begin call.
+ * @param[in] domain The domain for this frame instance
+ * @param[in] id The instance ID for this frame instance or NULL for current
+ */
+void ITTAPI __itt_frame_end_v3(const __itt_domain *domain, __itt_id *id);
+
+/**
+ * @ingroup frames
+ * @brief Submits a frame instance.
+ * Successive calls to __itt_frame_begin or __itt_frame_submit with the
+ * same ID are ignored until a call to __itt_frame_end or __itt_frame_submit
+ * with the same ID.
+ * Passing the special __itt_timestamp_none value as the "end" argument means
+ * that the current timestamp is taken as the end timestamp.
+ * @param[in] domain The domain for this frame instance
+ * @param[in] id The instance ID for this frame instance or NULL
+ * @param[in] begin Timestamp of the beginning of the frame
+ * @param[in] end Timestamp of the end of the frame
+ */
+void ITTAPI __itt_frame_submit_v3(const __itt_domain *domain, __itt_id *id,
+ __itt_timestamp begin, __itt_timestamp end);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, frame_begin_v3, (const __itt_domain *domain, __itt_id *id))
+ITT_STUBV(ITTAPI, void, frame_end_v3, (const __itt_domain *domain, __itt_id *id))
+ITT_STUBV(ITTAPI, void, frame_submit_v3, (const __itt_domain *domain, __itt_id *id, __itt_timestamp begin, __itt_timestamp end))
+#define __itt_frame_begin_v3(d,x) ITTNOTIFY_VOID_D1(frame_begin_v3,d,x)
+#define __itt_frame_begin_v3_ptr ITTNOTIFY_NAME(frame_begin_v3)
+#define __itt_frame_end_v3(d,x) ITTNOTIFY_VOID_D1(frame_end_v3,d,x)
+#define __itt_frame_end_v3_ptr ITTNOTIFY_NAME(frame_end_v3)
+#define __itt_frame_submit_v3(d,x,b,e) ITTNOTIFY_VOID_D3(frame_submit_v3,d,x,b,e)
+#define __itt_frame_submit_v3_ptr ITTNOTIFY_NAME(frame_submit_v3)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_frame_begin_v3(domain,id)
+#define __itt_frame_begin_v3_ptr 0
+#define __itt_frame_end_v3(domain,id)
+#define __itt_frame_end_v3_ptr 0
+#define __itt_frame_submit_v3(domain,id,begin,end)
+#define __itt_frame_submit_v3_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_frame_begin_v3_ptr 0
+#define __itt_frame_end_v3_ptr 0
+#define __itt_frame_submit_v3_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
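+
+/*
+ * Illustrative usage sketch (not part of the original header): two ways to
+ * mark one frame of a render loop. `domain` is assumed to exist and
+ * `render()` is a hypothetical user function; the parameter descriptions
+ * above allow NULL for the frame instance ID.
+ *
+ *     // Variant 1: bracket the work directly.
+ *     __itt_frame_begin_v3(domain, NULL);
+ *     render();
+ *     __itt_frame_end_v3(domain, NULL);
+ *
+ *     // Variant 2: measure first, submit afterwards.
+ *     __itt_timestamp t0 = __itt_get_timestamp();
+ *     render();
+ *     __itt_frame_submit_v3(domain, NULL, t0, __itt_get_timestamp());
+ */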
+/** @} frames group */
+/** @endcond */
+
+/**
+ * @defgroup taskgroup Task Group
+ * @ingroup public
+ * Task Group
+ * @{
+ */
+/**
+ * @ingroup taskgroup
+ * @brief Denotes a task_group instance.
+ * Successive calls to __itt_task_group with the same ID are ignored.
+ * @param[in] domain The domain for this task_group instance
+ * @param[in] id The instance ID for this task_group instance. Must not be __itt_null.
+ * @param[in] parentid The instance ID for the parent of this task_group instance, or __itt_null.
+ * @param[in] name The name of this task_group
+ */
+void ITTAPI __itt_task_group(const __itt_domain *domain, __itt_id id, __itt_id parentid, __itt_string_handle *name);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, task_group, (const __itt_domain *domain, __itt_id id, __itt_id parentid, __itt_string_handle *name))
+#define __itt_task_group(d,x,y,z) ITTNOTIFY_VOID_D3(task_group,d,x,y,z)
+#define __itt_task_group_ptr ITTNOTIFY_NAME(task_group)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_task_group(d,x,y,z)
+#define __itt_task_group_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_task_group_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} taskgroup group */
+
+/**
+ * @defgroup tasks Tasks
+ * @ingroup public
+ * A task instance represents a piece of work performed by a particular
+ * thread for a period of time. A call to __itt_task_begin creates a
+ * task instance. This becomes the current instance for that task on that
+ * thread. A following call to __itt_task_end on the same thread ends the
+ * instance. There may be multiple simultaneous instances of tasks with the
+ * same name on different threads. If an ID is specified, the task instance
+ * receives that ID. Nested tasks are allowed.
+ *
+ * Note: The task is defined by the bracketing of __itt_task_begin and
+ * __itt_task_end on the same thread. If some scheduling mechanism causes
+ * task switching (the thread executes a different user task) or thread
+ * switching (the user task switches to a different thread), then this breaks
+ * the notion of current instance. Additional API calls are required to
+ * deal with that possibility.
+ * @{
+ */
+
+/**
+ * @ingroup tasks
+ * @brief Begin a task instance.
+ * @param[in] domain The domain for this task
+ * @param[in] taskid The instance ID for this task instance, or __itt_null
+ * @param[in] parentid The parent instance to which this task instance belongs, or __itt_null
+ * @param[in] name The name of this task
+ */
+void ITTAPI __itt_task_begin(const __itt_domain *domain, __itt_id taskid, __itt_id parentid, __itt_string_handle *name);
+
+/**
+ * @ingroup tasks
+ * @brief Begin a task instance.
+ * @param[in] domain The domain for this task
+ * @param[in] taskid The identifier for this task instance (may be 0)
+ * @param[in] parentid The parent of this task (may be 0)
+ * @param[in] fn The pointer to the function you are tracing
+ */
+void ITTAPI __itt_task_begin_fn(const __itt_domain *domain, __itt_id taskid, __itt_id parentid, void* fn);
+
+/**
+ * @ingroup tasks
+ * @brief End the current task instance.
+ * @param[in] domain The domain for this task
+ */
+void ITTAPI __itt_task_end(const __itt_domain *domain);
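+/*
+ * Example (illustrative sketch): a minimal task region on the current thread, assuming
+ * a domain and string handle created with __itt_domain_create() and
+ * __itt_string_handle_create() as declared earlier in this header; do_work() is a
+ * hypothetical user function.
+ *
+ *     __itt_domain* d = __itt_domain_create("Example.Domain");
+ *     __itt_string_handle* h = __itt_string_handle_create("MyTask");
+ *     __itt_task_begin(d, __itt_null, __itt_null, h);
+ *     do_work();
+ *     __itt_task_end(d);
+ */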
+
+/**
+ * @ingroup tasks
+ * @brief Begin an overlapped task instance.
+ * @param[in] domain The domain for this task.
+ * @param[in] taskid The identifier for this task instance, *cannot* be __itt_null.
+ * @param[in] parentid The parent of this task, or __itt_null.
+ * @param[in] name The name of this task.
+ */
+void ITTAPI __itt_task_begin_overlapped(const __itt_domain* domain, __itt_id taskid, __itt_id parentid, __itt_string_handle* name);
+
+/**
+ * @ingroup tasks
+ * @brief End an overlapped task instance.
+ * @param[in] domain The domain for this task
+ * @param[in] taskid Explicit ID of finished task
+ */
+void ITTAPI __itt_task_end_overlapped(const __itt_domain *domain, __itt_id taskid);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, task_begin, (const __itt_domain *domain, __itt_id id, __itt_id parentid, __itt_string_handle *name))
+ITT_STUBV(ITTAPI, void, task_begin_fn, (const __itt_domain *domain, __itt_id id, __itt_id parentid, void* fn))
+ITT_STUBV(ITTAPI, void, task_end, (const __itt_domain *domain))
+ITT_STUBV(ITTAPI, void, task_begin_overlapped, (const __itt_domain *domain, __itt_id taskid, __itt_id parentid, __itt_string_handle *name))
+ITT_STUBV(ITTAPI, void, task_end_overlapped, (const __itt_domain *domain, __itt_id taskid))
+#define __itt_task_begin(d,x,y,z) ITTNOTIFY_VOID_D3(task_begin,d,x,y,z)
+#define __itt_task_begin_ptr ITTNOTIFY_NAME(task_begin)
+#define __itt_task_begin_fn(d,x,y,z) ITTNOTIFY_VOID_D3(task_begin_fn,d,x,y,z)
+#define __itt_task_begin_fn_ptr ITTNOTIFY_NAME(task_begin_fn)
+#define __itt_task_end(d) ITTNOTIFY_VOID_D0(task_end,d)
+#define __itt_task_end_ptr ITTNOTIFY_NAME(task_end)
+#define __itt_task_begin_overlapped(d,x,y,z) ITTNOTIFY_VOID_D3(task_begin_overlapped,d,x,y,z)
+#define __itt_task_begin_overlapped_ptr ITTNOTIFY_NAME(task_begin_overlapped)
+#define __itt_task_end_overlapped(d,x) ITTNOTIFY_VOID_D1(task_end_overlapped,d,x)
+#define __itt_task_end_overlapped_ptr ITTNOTIFY_NAME(task_end_overlapped)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_task_begin(domain,id,parentid,name)
+#define __itt_task_begin_ptr 0
+#define __itt_task_begin_fn(domain,id,parentid,fn)
+#define __itt_task_begin_fn_ptr 0
+#define __itt_task_end(domain)
+#define __itt_task_end_ptr 0
+#define __itt_task_begin_overlapped(domain,taskid,parentid,name)
+#define __itt_task_begin_overlapped_ptr 0
+#define __itt_task_end_overlapped(domain,taskid)
+#define __itt_task_end_overlapped_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_task_begin_ptr 0
+#define __itt_task_begin_fn_ptr 0
+#define __itt_task_end_ptr 0
+#define __itt_task_begin_overlapped_ptr 0
+#define __itt_task_end_overlapped_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} tasks group */
+
+
+/**
+ * @defgroup markers Markers
+ * Markers represent a single discrete event in time. Markers have a scope,
+ * described by an enumerated type __itt_scope. Markers are created by
+ * the API call __itt_marker. A marker instance can be given an ID for use in
+ * adding metadata.
+ * @{
+ */
+
+/**
+ * @brief Describes the scope of an event object in the trace.
+ */
+typedef enum
+{
+ __itt_scope_unknown = 0,
+ __itt_scope_global,
+ __itt_scope_track_group,
+ __itt_scope_track,
+ __itt_scope_task,
+ __itt_scope_marker
+} __itt_scope;
+
+/** @cond exclude_from_documentation */
+#define __itt_marker_scope_unknown __itt_scope_unknown
+#define __itt_marker_scope_global __itt_scope_global
+#define __itt_marker_scope_process __itt_scope_track_group
+#define __itt_marker_scope_thread __itt_scope_track
+#define __itt_marker_scope_task __itt_scope_task
+/** @endcond */
+
+/**
+ * @ingroup markers
+ * @brief Create a marker instance
+ * @param[in] domain The domain for this marker
+ * @param[in] id The instance ID for this marker or __itt_null
+ * @param[in] name The name for this marker
+ * @param[in] scope The scope for this marker
+ */
+void ITTAPI __itt_marker(const __itt_domain *domain, __itt_id id, __itt_string_handle *name, __itt_scope scope);
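+/*
+ * Example (illustrative sketch): emitting a task-scoped marker, assuming the domain 'd'
+ * and string-handle helpers shown in the task example above.
+ *
+ *     __itt_marker(d, __itt_null, __itt_string_handle_create("Checkpoint"), __itt_scope_task);
+ */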
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, marker, (const __itt_domain *domain, __itt_id id, __itt_string_handle *name, __itt_scope scope))
+#define __itt_marker(d,x,y,z) ITTNOTIFY_VOID_D3(marker,d,x,y,z)
+#define __itt_marker_ptr ITTNOTIFY_NAME(marker)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_marker(domain,id,name,scope)
+#define __itt_marker_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_marker_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} markers group */
+
+/**
+ * @defgroup metadata Metadata
+ * The metadata API is used to attach extra information to named
+ * entities. Metadata can be attached to an identified named entity by ID,
+ * or to the current entity (which is always a task).
+ *
+ * Conceptually metadata has a type (what kind of metadata), a key (the
+ * name of the metadata), and a value (the actual data). The encoding of
+ * the value depends on the type of the metadata.
+ *
+ * The type of metadata is specified by the enumerated type __itt_metadata_type.
+ * @{
+ */
+
+/**
+ * @ingroup parameters
+ * @brief describes the type of metadata
+ */
+typedef enum {
+ __itt_metadata_unknown = 0,
+ __itt_metadata_u64, /**< Unsigned 64-bit integer */
+ __itt_metadata_s64, /**< Signed 64-bit integer */
+ __itt_metadata_u32, /**< Unsigned 32-bit integer */
+ __itt_metadata_s32, /**< Signed 32-bit integer */
+ __itt_metadata_u16, /**< Unsigned 16-bit integer */
+ __itt_metadata_s16, /**< Signed 16-bit integer */
+ __itt_metadata_float, /**< Signed 32-bit floating-point */
+    __itt_metadata_double /**< Signed 64-bit floating-point */
+} __itt_metadata_type;
+
+/**
+ * @ingroup parameters
+ * @brief Add metadata to an instance of a named entity.
+ * @param[in] domain The domain controlling the call
+ * @param[in] id The identifier of the instance to which the metadata is to be added, or __itt_null to add to the current task
+ * @param[in] key The name of the metadata
+ * @param[in] type The type of the metadata
+ * @param[in] count The number of elements of the given type. If count == 0, no metadata will be added.
+ * @param[in] data The metadata itself
+*/
+void ITTAPI __itt_metadata_add(const __itt_domain *domain, __itt_id id, __itt_string_handle *key, __itt_metadata_type type, size_t count, void *data);
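+/*
+ * Example (illustrative sketch): attaching an unsigned 64-bit value and a string to the
+ * current task, assuming a domain 'd' and string handles created earlier. The string
+ * variant __itt_metadata_str_add is declared further below in this header.
+ *
+ *     unsigned long long bytes = 4096ULL;
+ *     __itt_metadata_add(d, __itt_null, __itt_string_handle_create("bytes"),
+ *                        __itt_metadata_u64, 1, &bytes);
+ *     __itt_metadata_str_add(d, __itt_null, __itt_string_handle_create("stage"),
+ *                            "prefetch", 8);   // 8 = number of characters in "prefetch"
+ */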
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, metadata_add, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, __itt_metadata_type type, size_t count, void *data))
+#define __itt_metadata_add(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(metadata_add,d,x,y,z,a,b)
+#define __itt_metadata_add_ptr ITTNOTIFY_NAME(metadata_add)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_metadata_add(d,x,y,z,a,b)
+#define __itt_metadata_add_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_metadata_add_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @ingroup parameters
+ * @brief Add string metadata to an instance of a named entity.
+ * @param[in] domain The domain controlling the call
+ * @param[in] id The identifier of the instance to which the metadata is to be added, or __itt_null to add to the current task
+ * @param[in] key The name of the metadata
+ * @param[in] data The metadata itself
+ * @param[in] length The number of characters in the string, or -1 if the length is unknown but the string is null-terminated
+*/
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+void ITTAPI __itt_metadata_str_addA(const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const char *data, size_t length);
+void ITTAPI __itt_metadata_str_addW(const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const wchar_t *data, size_t length);
+#if defined(UNICODE) || defined(_UNICODE)
+# define __itt_metadata_str_add __itt_metadata_str_addW
+# define __itt_metadata_str_add_ptr __itt_metadata_str_addW_ptr
+#else /* UNICODE */
+# define __itt_metadata_str_add __itt_metadata_str_addA
+# define __itt_metadata_str_add_ptr __itt_metadata_str_addA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+void ITTAPI __itt_metadata_str_add(const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const char *data, size_t length);
+#endif
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, metadata_str_addA, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const char *data, size_t length))
+ITT_STUBV(ITTAPI, void, metadata_str_addW, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const wchar_t *data, size_t length))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, metadata_str_add, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const char *data, size_t length))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_metadata_str_addA(d,x,y,z,a) ITTNOTIFY_VOID_D4(metadata_str_addA,d,x,y,z,a)
+#define __itt_metadata_str_addA_ptr ITTNOTIFY_NAME(metadata_str_addA)
+#define __itt_metadata_str_addW(d,x,y,z,a) ITTNOTIFY_VOID_D4(metadata_str_addW,d,x,y,z,a)
+#define __itt_metadata_str_addW_ptr ITTNOTIFY_NAME(metadata_str_addW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_metadata_str_add(d,x,y,z,a) ITTNOTIFY_VOID_D4(metadata_str_add,d,x,y,z,a)
+#define __itt_metadata_str_add_ptr ITTNOTIFY_NAME(metadata_str_add)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_metadata_str_addA(d,x,y,z,a)
+#define __itt_metadata_str_addA_ptr 0
+#define __itt_metadata_str_addW(d,x,y,z,a)
+#define __itt_metadata_str_addW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_metadata_str_add(d,x,y,z,a)
+#define __itt_metadata_str_add_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_metadata_str_addA_ptr 0
+#define __itt_metadata_str_addW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_metadata_str_add_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @ingroup parameters
+ * @brief Add metadata to an instance of a named entity.
+ * @param[in] domain The domain controlling the call
+ * @param[in] scope The scope of the instance to which the metadata is to be added
+ * @param[in] key The name of the metadata
+ * @param[in] type The type of the metadata
+ * @param[in] count The number of elements of the given type. If count == 0, no metadata will be added.
+ * @param[in] data The metadata itself
+*/
+void ITTAPI __itt_metadata_add_with_scope(const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, __itt_metadata_type type, size_t count, void *data);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, metadata_add_with_scope, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, __itt_metadata_type type, size_t count, void *data))
+#define __itt_metadata_add_with_scope(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(metadata_add_with_scope,d,x,y,z,a,b)
+#define __itt_metadata_add_with_scope_ptr ITTNOTIFY_NAME(metadata_add_with_scope)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_metadata_add_with_scope(d,x,y,z,a,b)
+#define __itt_metadata_add_with_scope_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_metadata_add_with_scope_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @ingroup parameters
+ * @brief Add string metadata to an instance of a named entity.
+ * @param[in] domain The domain controlling the call
+ * @param[in] scope The scope of the instance to which the metadata is to be added
+ * @param[in] key The name of the metadata
+ * @param[in] data The metadata itself
+ * @param[in] length The number of characters in the string, or -1 if the length is unknown but the string is null-terminated
+*/
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+void ITTAPI __itt_metadata_str_add_with_scopeA(const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const char *data, size_t length);
+void ITTAPI __itt_metadata_str_add_with_scopeW(const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const wchar_t *data, size_t length);
+#if defined(UNICODE) || defined(_UNICODE)
+# define __itt_metadata_str_add_with_scope __itt_metadata_str_add_with_scopeW
+# define __itt_metadata_str_add_with_scope_ptr __itt_metadata_str_add_with_scopeW_ptr
+#else /* UNICODE */
+# define __itt_metadata_str_add_with_scope __itt_metadata_str_add_with_scopeA
+# define __itt_metadata_str_add_with_scope_ptr __itt_metadata_str_add_with_scopeA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+void ITTAPI __itt_metadata_str_add_with_scope(const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const char *data, size_t length);
+#endif
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, metadata_str_add_with_scopeA, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const char *data, size_t length))
+ITT_STUBV(ITTAPI, void, metadata_str_add_with_scopeW, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const wchar_t *data, size_t length))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, metadata_str_add_with_scope, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const char *data, size_t length))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_metadata_str_add_with_scopeA(d,x,y,z,a) ITTNOTIFY_VOID_D4(metadata_str_add_with_scopeA,d,x,y,z,a)
+#define __itt_metadata_str_add_with_scopeA_ptr ITTNOTIFY_NAME(metadata_str_add_with_scopeA)
+#define __itt_metadata_str_add_with_scopeW(d,x,y,z,a) ITTNOTIFY_VOID_D4(metadata_str_add_with_scopeW,d,x,y,z,a)
+#define __itt_metadata_str_add_with_scopeW_ptr ITTNOTIFY_NAME(metadata_str_add_with_scopeW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_metadata_str_add_with_scope(d,x,y,z,a) ITTNOTIFY_VOID_D4(metadata_str_add_with_scope,d,x,y,z,a)
+#define __itt_metadata_str_add_with_scope_ptr ITTNOTIFY_NAME(metadata_str_add_with_scope)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_metadata_str_add_with_scopeA(d,x,y,z,a)
+#define __itt_metadata_str_add_with_scopeA_ptr 0
+#define __itt_metadata_str_add_with_scopeW(d,x,y,z,a)
+#define __itt_metadata_str_add_with_scopeW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_metadata_str_add_with_scope(d,x,y,z,a)
+#define __itt_metadata_str_add_with_scope_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_metadata_str_add_with_scopeA_ptr 0
+#define __itt_metadata_str_add_with_scopeW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_metadata_str_add_with_scope_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/** @} metadata group */
+
+/**
+ * @defgroup relations Relations
+ * Instances of named entities can be explicitly associated with other
+ * instances using instance IDs and the relationship API calls.
+ *
+ * @{
+ */
+
+/**
+ * @ingroup relations
+ * @brief The kind of relation between two instances is specified by the enumerated type __itt_relation.
+ * Relations between instances can be added with an API call. The relation
+ * API uses instance IDs. Relations can be added before or after the actual
+ * instances are created and persist independently of the instances. This
+ * is the motivation for having different lifetimes for instance IDs and
+ * the actual instances.
+ */
+typedef enum
+{
+ __itt_relation_is_unknown = 0,
+ __itt_relation_is_dependent_on, /**< "A is dependent on B" means that A cannot start until B completes */
+ __itt_relation_is_sibling_of, /**< "A is sibling of B" means that A and B were created as a group */
+ __itt_relation_is_parent_of, /**< "A is parent of B" means that A created B */
+ __itt_relation_is_continuation_of, /**< "A is continuation of B" means that A assumes the dependencies of B */
+ __itt_relation_is_child_of, /**< "A is child of B" means that A was created by B (inverse of is_parent_of) */
+ __itt_relation_is_continued_by, /**< "A is continued by B" means that B assumes the dependencies of A (inverse of is_continuation_of) */
+ __itt_relation_is_predecessor_to /**< "A is predecessor to B" means that B cannot start until A completes (inverse of is_dependent_on) */
+} __itt_relation;
+
+/**
+ * @ingroup relations
+ * @brief Add a relation to the current task instance.
+ * The current task instance is the head of the relation.
+ * @param[in] domain The domain controlling this call
+ * @param[in] relation The kind of relation
+ * @param[in] tail The ID for the tail of the relation
+ */
+void ITTAPI __itt_relation_add_to_current(const __itt_domain *domain, __itt_relation relation, __itt_id tail);
+
+/**
+ * @ingroup relations
+ * @brief Add a relation between two instance identifiers.
+ * @param[in] domain The domain controlling this call
+ * @param[in] head The ID for the head of the relation
+ * @param[in] relation The kind of relation
+ * @param[in] tail The ID for the tail of the relation
+ */
+void ITTAPI __itt_relation_add(const __itt_domain *domain, __itt_id head, __itt_relation relation, __itt_id tail);
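+/*
+ * Example (illustrative sketch): recording that task A cannot start until task B
+ * completes, assuming a domain 'd' and IDs built with __itt_id_make() (declared earlier
+ * in this header); task_a and task_b are hypothetical user objects.
+ *
+ *     __itt_id id_a = __itt_id_make(&task_a, 0);
+ *     __itt_id id_b = __itt_id_make(&task_b, 0);
+ *     __itt_relation_add(d, id_a, __itt_relation_is_dependent_on, id_b);
+ */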
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, relation_add_to_current, (const __itt_domain *domain, __itt_relation relation, __itt_id tail))
+ITT_STUBV(ITTAPI, void, relation_add, (const __itt_domain *domain, __itt_id head, __itt_relation relation, __itt_id tail))
+#define __itt_relation_add_to_current(d,x,y) ITTNOTIFY_VOID_D2(relation_add_to_current,d,x,y)
+#define __itt_relation_add_to_current_ptr ITTNOTIFY_NAME(relation_add_to_current)
+#define __itt_relation_add(d,x,y,z) ITTNOTIFY_VOID_D3(relation_add,d,x,y,z)
+#define __itt_relation_add_ptr ITTNOTIFY_NAME(relation_add)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_relation_add_to_current(d,x,y)
+#define __itt_relation_add_to_current_ptr 0
+#define __itt_relation_add(d,x,y,z)
+#define __itt_relation_add_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_relation_add_to_current_ptr 0
+#define __itt_relation_add_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} relations group */
+
+/** @cond exclude_from_documentation */
+#pragma pack(push, 8)
+
+typedef struct ___itt_clock_info
+{
+ unsigned long long clock_freq; /*!< Clock domain frequency */
+ unsigned long long clock_base; /*!< Clock domain base timestamp */
+} __itt_clock_info;
+
+#pragma pack(pop)
+/** @endcond */
+
+/** @cond exclude_from_documentation */
+typedef void (ITTAPI *__itt_get_clock_info_fn)(__itt_clock_info* clock_info, void* data);
+/** @endcond */
+
+/** @cond exclude_from_documentation */
+#pragma pack(push, 8)
+
+typedef struct ___itt_clock_domain
+{
+ __itt_clock_info info; /*!< Most recent clock domain info */
+ __itt_get_clock_info_fn fn; /*!< Callback function pointer */
+ void* fn_data; /*!< Input argument for the callback function */
+ int extra1; /*!< Reserved. Must be zero */
+ void* extra2; /*!< Reserved. Must be zero */
+ struct ___itt_clock_domain* next;
+} __itt_clock_domain;
+
+#pragma pack(pop)
+/** @endcond */
+
+/**
+ * @ingroup clockdomains
+ * @brief Create a clock domain.
+ * Certain applications require the capability to trace their application using
+ * a clock domain different from that of the CPU, for instance when instrumenting
+ * events that occur on a GPU.
+ * Because the set of domains is expected to be static over the application's execution time,
+ * there is no mechanism to destroy a domain.
+ * Any domain can be accessed by any thread in the process, regardless of which thread created
+ * the domain. This call is thread-safe.
+ * @param[in] fn A pointer to a callback function which retrieves alternative CPU timestamps
+ * @param[in] fn_data Argument for a callback function; may be NULL
+ */
+__itt_clock_domain* ITTAPI __itt_clock_domain_create(__itt_get_clock_info_fn fn, void* fn_data);
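+/*
+ * Example (illustrative sketch): a clock-domain callback reporting a hypothetical
+ * device timer; the frequency and base values shown are placeholders that a real
+ * callback would query from the device.
+ *
+ *     static void ITTAPI example_clock_info(__itt_clock_info* info, void* data)
+ *     {
+ *         (void)data;
+ *         info->clock_freq = 1000000000ULL;  // ticks per second of the device clock
+ *         info->clock_base = 0ULL;           // device timestamp taken at this call
+ *     }
+ *
+ *     __itt_clock_domain* cd = __itt_clock_domain_create(example_clock_info, NULL);
+ */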
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(ITTAPI, __itt_clock_domain*, clock_domain_create, (__itt_get_clock_info_fn fn, void* fn_data))
+#define __itt_clock_domain_create ITTNOTIFY_DATA(clock_domain_create)
+#define __itt_clock_domain_create_ptr ITTNOTIFY_NAME(clock_domain_create)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_clock_domain_create(fn,fn_data) (__itt_clock_domain*)0
+#define __itt_clock_domain_create_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_clock_domain_create_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @ingroup clockdomains
+ * @brief Recalculate the frequencies and base timestamps of all clock domains.
+ */
+void ITTAPI __itt_clock_domain_reset(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, clock_domain_reset, (void))
+#define __itt_clock_domain_reset ITTNOTIFY_VOID(clock_domain_reset)
+#define __itt_clock_domain_reset_ptr ITTNOTIFY_NAME(clock_domain_reset)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_clock_domain_reset()
+#define __itt_clock_domain_reset_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_clock_domain_reset_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @ingroup clockdomain
+ * @brief Create an instance of an identifier. This establishes the beginning of the lifetime of
+ * an instance of the given ID in the trace. Once this lifetime starts, the ID can be used to
+ * tag named entity instances in calls such as __itt_task_begin, and to specify relationships among
+ * identified named entity instances, using the \ref relations APIs.
+ * @param[in] domain The domain controlling the execution of this call.
+ * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] timestamp The user defined timestamp.
+ * @param[in] id The ID to create.
+ */
+void ITTAPI __itt_id_create_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id);
+
+/**
+ * @ingroup clockdomain
+ * @brief Destroy an instance of an identifier. This ends the lifetime of the current instance of the
+ * given ID value in the trace. Any relationships that are established after this lifetime ends are
+ * invalid. This call must be performed before the given ID value can be reused for a different
+ * named entity instance.
+ * @param[in] domain The domain controlling the execution of this call.
+ * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] timestamp The user defined timestamp.
+ * @param[in] id The ID to destroy.
+ */
+void ITTAPI __itt_id_destroy_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, id_create_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id))
+ITT_STUBV(ITTAPI, void, id_destroy_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id))
+#define __itt_id_create_ex(d,x,y,z) ITTNOTIFY_VOID_D3(id_create_ex,d,x,y,z)
+#define __itt_id_create_ex_ptr ITTNOTIFY_NAME(id_create_ex)
+#define __itt_id_destroy_ex(d,x,y,z) ITTNOTIFY_VOID_D3(id_destroy_ex,d,x,y,z)
+#define __itt_id_destroy_ex_ptr ITTNOTIFY_NAME(id_destroy_ex)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_id_create_ex(domain,clock_domain,timestamp,id)
+#define __itt_id_create_ex_ptr 0
+#define __itt_id_destroy_ex(domain,clock_domain,timestamp,id)
+#define __itt_id_destroy_ex_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_id_create_ex_ptr 0
+#define __itt_id_destroy_ex_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @ingroup clockdomain
+ * @brief Begin a task instance.
+ * @param[in] domain The domain for this task
+ * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] timestamp The user defined timestamp.
+ * @param[in] taskid The instance ID for this task instance, or __itt_null
+ * @param[in] parentid The parent instance to which this task instance belongs, or __itt_null
+ * @param[in] name The name of this task
+ */
+void ITTAPI __itt_task_begin_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid, __itt_id parentid, __itt_string_handle* name);
+
+/**
+ * @ingroup clockdomain
+ * @brief Begin a task instance.
+ * @param[in] domain The domain for this task
+ * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] timestamp The user defined timestamp.
+ * @param[in] taskid The identifier for this task instance, or __itt_null
+ * @param[in] parentid The parent of this task, or __itt_null
+ * @param[in] fn The pointer to the function you are tracing
+ */
+void ITTAPI __itt_task_begin_fn_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid, __itt_id parentid, void* fn);
+
+/**
+ * @ingroup clockdomain
+ * @brief End the current task instance.
+ * @param[in] domain The domain for this task
+ * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] timestamp The user defined timestamp.
+ */
+void ITTAPI __itt_task_end_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp);
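+/*
+ * Example (illustrative sketch): replaying a device-side task with user-supplied
+ * timestamps, assuming the clock domain 'cd' from the example above; begin_ts and
+ * end_ts are hypothetical device timestamps obtained elsewhere.
+ *
+ *     __itt_task_begin_ex(d, cd, begin_ts, __itt_null, __itt_null,
+ *                         __itt_string_handle_create("KernelLaunch"));
+ *     __itt_task_end_ex(d, cd, end_ts);
+ */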
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, task_begin_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_id parentid, __itt_string_handle *name))
+ITT_STUBV(ITTAPI, void, task_begin_fn_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_id parentid, void* fn))
+ITT_STUBV(ITTAPI, void, task_end_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp))
+#define __itt_task_begin_ex(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(task_begin_ex,d,x,y,z,a,b)
+#define __itt_task_begin_ex_ptr ITTNOTIFY_NAME(task_begin_ex)
+#define __itt_task_begin_fn_ex(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(task_begin_fn_ex,d,x,y,z,a,b)
+#define __itt_task_begin_fn_ex_ptr ITTNOTIFY_NAME(task_begin_fn_ex)
+#define __itt_task_end_ex(d,x,y) ITTNOTIFY_VOID_D2(task_end_ex,d,x,y)
+#define __itt_task_end_ex_ptr ITTNOTIFY_NAME(task_end_ex)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_task_begin_ex(domain,clock_domain,timestamp,id,parentid,name)
+#define __itt_task_begin_ex_ptr 0
+#define __itt_task_begin_fn_ex(domain,clock_domain,timestamp,id,parentid,fn)
+#define __itt_task_begin_fn_ex_ptr 0
+#define __itt_task_end_ex(domain,clock_domain,timestamp)
+#define __itt_task_end_ex_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_task_begin_ex_ptr 0
+#define __itt_task_begin_fn_ex_ptr 0
+#define __itt_task_end_ex_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @defgroup counters Counters
+ * @ingroup public
+ * Counters are user-defined objects with a monotonically increasing
+ * value. Counter values are 64-bit unsigned integers.
+ * Counters have names that can be displayed in
+ * the tools.
+ * @{
+ */
+
+/**
+ * @brief opaque structure for counter identification
+ */
+/** @cond exclude_from_documentation */
+
+typedef struct ___itt_counter* __itt_counter;
+
+/**
+ * @brief Create an unsigned 64-bit integer counter with the given name/domain
+ *
+ * After __itt_counter_create() is called, __itt_counter_inc(id), __itt_counter_inc_delta(id, delta),
+ * __itt_counter_set_value(id, value_ptr) or __itt_counter_set_value_ex(id, clock_domain, timestamp, value_ptr)
+ * can be used to change the value of the counter, where value_ptr is a pointer to an unsigned 64-bit integer.
+ *
+ * This call is equivalent to __itt_counter_create_typed(name, domain, __itt_metadata_u64)
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+__itt_counter ITTAPI __itt_counter_createA(const char *name, const char *domain);
+__itt_counter ITTAPI __itt_counter_createW(const wchar_t *name, const wchar_t *domain);
+#if defined(UNICODE) || defined(_UNICODE)
+# define __itt_counter_create __itt_counter_createW
+# define __itt_counter_create_ptr __itt_counter_createW_ptr
+#else /* UNICODE */
+# define __itt_counter_create __itt_counter_createA
+# define __itt_counter_create_ptr __itt_counter_createA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+__itt_counter ITTAPI __itt_counter_create(const char *name, const char *domain);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
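+/*
+ * Example (illustrative sketch): a 64-bit counter tracking allocations; the name and
+ * domain strings are arbitrary, and on Windows builds with UNICODE defined the
+ * wide-character variant expects wchar_t strings. __itt_counter_inc,
+ * __itt_counter_inc_delta and __itt_counter_destroy are declared below.
+ *
+ *     __itt_counter allocs = __itt_counter_create("allocations", "Example.Domain");
+ *     __itt_counter_inc(allocs);
+ *     __itt_counter_inc_delta(allocs, 16);
+ *     __itt_counter_destroy(allocs);
+ */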
+
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_counter, counter_createA, (const char *name, const char *domain))
+ITT_STUB(ITTAPI, __itt_counter, counter_createW, (const wchar_t *name, const wchar_t *domain))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_counter, counter_create, (const char *name, const char *domain))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_counter_createA ITTNOTIFY_DATA(counter_createA)
+#define __itt_counter_createA_ptr ITTNOTIFY_NAME(counter_createA)
+#define __itt_counter_createW ITTNOTIFY_DATA(counter_createW)
+#define __itt_counter_createW_ptr ITTNOTIFY_NAME(counter_createW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_counter_create ITTNOTIFY_DATA(counter_create)
+#define __itt_counter_create_ptr ITTNOTIFY_NAME(counter_create)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_counter_createA(name, domain)
+#define __itt_counter_createA_ptr 0
+#define __itt_counter_createW(name, domain)
+#define __itt_counter_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_counter_create(name, domain)
+#define __itt_counter_create_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_counter_createA_ptr 0
+#define __itt_counter_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_counter_create_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Increment the unsigned 64-bit integer counter value
+ *
+ * Calling this function on counters that are not unsigned 64-bit integers has no effect
+ */
+void ITTAPI __itt_counter_inc(__itt_counter id);
+
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, counter_inc, (__itt_counter id))
+#define __itt_counter_inc ITTNOTIFY_VOID(counter_inc)
+#define __itt_counter_inc_ptr ITTNOTIFY_NAME(counter_inc)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_counter_inc(id)
+#define __itt_counter_inc_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_counter_inc_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/**
+ * @brief Increment the unsigned 64-bit integer counter value by the given delta
+ *
+ * Calling this function on counters that are not unsigned 64-bit integers has no effect
+ */
+void ITTAPI __itt_counter_inc_delta(__itt_counter id, unsigned long long value);
+
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, counter_inc_delta, (__itt_counter id, unsigned long long value))
+#define __itt_counter_inc_delta ITTNOTIFY_VOID(counter_inc_delta)
+#define __itt_counter_inc_delta_ptr ITTNOTIFY_NAME(counter_inc_delta)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_counter_inc_delta(id, value)
+#define __itt_counter_inc_delta_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_counter_inc_delta_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Decrement the unsigned 64-bit integer counter value
+ *
+ * Calling this function on counters that are not unsigned 64-bit integers has no effect
+ */
+void ITTAPI __itt_counter_dec(__itt_counter id);
+
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, counter_dec, (__itt_counter id))
+#define __itt_counter_dec ITTNOTIFY_VOID(counter_dec)
+#define __itt_counter_dec_ptr ITTNOTIFY_NAME(counter_dec)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_counter_dec(id)
+#define __itt_counter_dec_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_counter_dec_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/**
+ * @brief Decrement the unsigned 64-bit integer counter value by the given delta
+ *
+ * Calling this function on counters that are not unsigned 64-bit integers has no effect
+ */
+void ITTAPI __itt_counter_dec_delta(__itt_counter id, unsigned long long value);
+
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, counter_dec_delta, (__itt_counter id, unsigned long long value))
+#define __itt_counter_dec_delta ITTNOTIFY_VOID(counter_dec_delta)
+#define __itt_counter_dec_delta_ptr ITTNOTIFY_NAME(counter_dec_delta)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_counter_dec_delta(id, value)
+#define __itt_counter_dec_delta_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_counter_dec_delta_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @ingroup counters
+ * @brief Increment a counter by one.
+ * The first call with a given name creates a counter by that name and sets its
+ * value to zero. Successive calls increment the counter value.
+ * @param[in] domain The domain controlling the call. Counter names are not domain specific.
+ * The domain argument is used only to enable or disable the API calls.
+ * @param[in] name The name of the counter
+ */
+void ITTAPI __itt_counter_inc_v3(const __itt_domain *domain, __itt_string_handle *name);
+
+/**
+ * @ingroup counters
+ * @brief Increment a counter by the value specified in delta.
+ * @param[in] domain The domain controlling the call. Counter names are not domain specific.
+ * The domain argument is used only to enable or disable the API calls.
+ * @param[in] name The name of the counter
+ * @param[in] delta The amount by which to increment the counter
+ */
+void ITTAPI __itt_counter_inc_delta_v3(const __itt_domain *domain, __itt_string_handle *name, unsigned long long delta);
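+/*
+ * Example (illustrative sketch): domain-based counters identified by name only,
+ * assuming a domain 'd' created with __itt_domain_create().
+ *
+ *     __itt_string_handle* items = __itt_string_handle_create("items_processed");
+ *     __itt_counter_inc_v3(d, items);
+ *     __itt_counter_inc_delta_v3(d, items, 10);
+ */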
+
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, counter_inc_v3, (const __itt_domain *domain, __itt_string_handle *name))
+ITT_STUBV(ITTAPI, void, counter_inc_delta_v3, (const __itt_domain *domain, __itt_string_handle *name, unsigned long long delta))
+#define __itt_counter_inc_v3(d,x) ITTNOTIFY_VOID_D1(counter_inc_v3,d,x)
+#define __itt_counter_inc_v3_ptr ITTNOTIFY_NAME(counter_inc_v3)
+#define __itt_counter_inc_delta_v3(d,x,y) ITTNOTIFY_VOID_D2(counter_inc_delta_v3,d,x,y)
+#define __itt_counter_inc_delta_v3_ptr ITTNOTIFY_NAME(counter_inc_delta_v3)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_counter_inc_v3(domain,name)
+#define __itt_counter_inc_v3_ptr 0
+#define __itt_counter_inc_delta_v3(domain,name,delta)
+#define __itt_counter_inc_delta_v3_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_counter_inc_v3_ptr 0
+#define __itt_counter_inc_delta_v3_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+
+/**
+ * @ingroup counters
+ * @brief Decrement a counter by one.
+ * The first call with a given name creates a counter by that name and sets its
+ * value to zero. Successive calls decrement the counter value.
+ * @param[in] domain The domain controlling the call. Counter names are not domain specific.
+ * The domain argument is used only to enable or disable the API calls.
+ * @param[in] name The name of the counter
+ */
+void ITTAPI __itt_counter_dec_v3(const __itt_domain *domain, __itt_string_handle *name);
+
+/**
+ * @ingroup counters
+ * @brief Decrement a counter by the value specified in delta.
+ * @param[in] domain The domain controlling the call. Counter names are not domain specific.
+ * The domain argument is used only to enable or disable the API calls.
+ * @param[in] name The name of the counter
+ * @param[in] delta The amount by which to decrement the counter
+ */
+void ITTAPI __itt_counter_dec_delta_v3(const __itt_domain *domain, __itt_string_handle *name, unsigned long long delta);
+
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, counter_dec_v3, (const __itt_domain *domain, __itt_string_handle *name))
+ITT_STUBV(ITTAPI, void, counter_dec_delta_v3, (const __itt_domain *domain, __itt_string_handle *name, unsigned long long delta))
+#define __itt_counter_dec_v3(d,x) ITTNOTIFY_VOID_D1(counter_dec_v3,d,x)
+#define __itt_counter_dec_v3_ptr ITTNOTIFY_NAME(counter_dec_v3)
+#define __itt_counter_dec_delta_v3(d,x,y) ITTNOTIFY_VOID_D2(counter_dec_delta_v3,d,x,y)
+#define __itt_counter_dec_delta_v3_ptr ITTNOTIFY_NAME(counter_dec_delta_v3)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_counter_dec_v3(domain,name)
+#define __itt_counter_dec_v3_ptr 0
+#define __itt_counter_dec_delta_v3(domain,name,delta)
+#define __itt_counter_dec_delta_v3_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_counter_dec_v3_ptr 0
+#define __itt_counter_dec_delta_v3_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+
+
+/**
+ * @brief Set the counter value
+ */
+void ITTAPI __itt_counter_set_value(__itt_counter id, void *value_ptr);
+
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, counter_set_value, (__itt_counter id, void *value_ptr))
+#define __itt_counter_set_value ITTNOTIFY_VOID(counter_set_value)
+#define __itt_counter_set_value_ptr ITTNOTIFY_NAME(counter_set_value)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_counter_set_value(id, value_ptr)
+#define __itt_counter_set_value_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_counter_set_value_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Set the counter value using the given clock domain and timestamp
+ */
+void ITTAPI __itt_counter_set_value_ex(__itt_counter id, __itt_clock_domain *clock_domain, unsigned long long timestamp, void *value_ptr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, counter_set_value_ex, (__itt_counter id, __itt_clock_domain *clock_domain, unsigned long long timestamp, void *value_ptr))
+#define __itt_counter_set_value_ex ITTNOTIFY_VOID(counter_set_value_ex)
+#define __itt_counter_set_value_ex_ptr ITTNOTIFY_NAME(counter_set_value_ex)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_counter_set_value_ex(id, clock_domain, timestamp, value_ptr)
+#define __itt_counter_set_value_ex_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_counter_set_value_ex_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Create a typed counter with the given name/domain
+ *
+ * After __itt_counter_create_typed() is called, __itt_counter_inc(id), __itt_counter_inc_delta(id, delta),
+ * __itt_counter_set_value(id, value_ptr) or __itt_counter_set_value_ex(id, clock_domain, timestamp, value_ptr)
+ * can be used to change the value of the counter
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+__itt_counter ITTAPI __itt_counter_create_typedA(const char *name, const char *domain, __itt_metadata_type type);
+__itt_counter ITTAPI __itt_counter_create_typedW(const wchar_t *name, const wchar_t *domain, __itt_metadata_type type);
+#if defined(UNICODE) || defined(_UNICODE)
+# define __itt_counter_create_typed __itt_counter_create_typedW
+# define __itt_counter_create_typed_ptr __itt_counter_create_typedW_ptr
+#else /* UNICODE */
+# define __itt_counter_create_typed __itt_counter_create_typedA
+# define __itt_counter_create_typed_ptr __itt_counter_create_typedA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+__itt_counter ITTAPI __itt_counter_create_typed(const char *name, const char *domain, __itt_metadata_type type);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
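+/*
+ * Example (illustrative sketch): a double-typed counter updated through
+ * __itt_counter_set_value(), declared above; the name and domain strings are arbitrary.
+ *
+ *     __itt_counter temp = __itt_counter_create_typed("temperature", "Example.Domain",
+ *                                                     __itt_metadata_double);
+ *     double celsius = 36.6;
+ *     __itt_counter_set_value(temp, &celsius);
+ */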
+
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_counter, counter_create_typedA, (const char *name, const char *domain, __itt_metadata_type type))
+ITT_STUB(ITTAPI, __itt_counter, counter_create_typedW, (const wchar_t *name, const wchar_t *domain, __itt_metadata_type type))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_counter, counter_create_typed, (const char *name, const char *domain, __itt_metadata_type type))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_counter_create_typedA ITTNOTIFY_DATA(counter_create_typedA)
+#define __itt_counter_create_typedA_ptr ITTNOTIFY_NAME(counter_create_typedA)
+#define __itt_counter_create_typedW ITTNOTIFY_DATA(counter_create_typedW)
+#define __itt_counter_create_typedW_ptr ITTNOTIFY_NAME(counter_create_typedW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_counter_create_typed ITTNOTIFY_DATA(counter_create_typed)
+#define __itt_counter_create_typed_ptr ITTNOTIFY_NAME(counter_create_typed)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_counter_create_typedA(name, domain, type)
+#define __itt_counter_create_typedA_ptr 0
+#define __itt_counter_create_typedW(name, domain, type)
+#define __itt_counter_create_typedW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_counter_create_typed(name, domain, type)
+#define __itt_counter_create_typed_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_counter_create_typedA_ptr 0
+#define __itt_counter_create_typedW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_counter_create_typed_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Destroy the counter identified by the pointer previously returned by __itt_counter_create() or
+ * __itt_counter_create_typed()
+ */
+void ITTAPI __itt_counter_destroy(__itt_counter id);
+
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, counter_destroy, (__itt_counter id))
+#define __itt_counter_destroy ITTNOTIFY_VOID(counter_destroy)
+#define __itt_counter_destroy_ptr ITTNOTIFY_NAME(counter_destroy)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_counter_destroy(id)
+#define __itt_counter_destroy_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_counter_destroy_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} counters group */
+
+/**
+ * @ingroup markers
+ * @brief Create a marker instance.
+ * @param[in] domain The domain for this marker
+ * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] timestamp The user defined timestamp.
+ * @param[in] id The instance ID for this marker, or __itt_null
+ * @param[in] name The name for this marker
+ * @param[in] scope The scope for this marker
+ */
+void ITTAPI __itt_marker_ex(const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_string_handle *name, __itt_scope scope);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, marker_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_string_handle *name, __itt_scope scope))
+#define __itt_marker_ex(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(marker_ex,d,x,y,z,a,b)
+#define __itt_marker_ex_ptr ITTNOTIFY_NAME(marker_ex)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_marker_ex(domain,clock_domain,timestamp,id,name,scope)
+#define __itt_marker_ex_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_marker_ex_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @ingroup clockdomain
+ * @brief Add a relation to the current task instance.
+ * The current task instance is the head of the relation.
+ * @param[in] domain The domain controlling this call
+ * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] timestamp The user defined timestamp.
+ * @param[in] relation The kind of relation
+ * @param[in] tail The ID for the tail of the relation
+ */
+void ITTAPI __itt_relation_add_to_current_ex(const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_relation relation, __itt_id tail);
+
+/**
+ * @ingroup clockdomain
+ * @brief Add a relation between two instance identifiers.
+ * @param[in] domain The domain controlling this call
+ * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] timestamp The user defined timestamp.
+ * @param[in] head The ID for the head of the relation
+ * @param[in] relation The kind of relation
+ * @param[in] tail The ID for the tail of the relation
+ */
+void ITTAPI __itt_relation_add_ex(const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id head, __itt_relation relation, __itt_id tail);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, relation_add_to_current_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_relation relation, __itt_id tail))
+ITT_STUBV(ITTAPI, void, relation_add_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id head, __itt_relation relation, __itt_id tail))
+#define __itt_relation_add_to_current_ex(d,x,y,z,a) ITTNOTIFY_VOID_D4(relation_add_to_current_ex,d,x,y,z,a)
+#define __itt_relation_add_to_current_ex_ptr ITTNOTIFY_NAME(relation_add_to_current_ex)
+#define __itt_relation_add_ex(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(relation_add_ex,d,x,y,z,a,b)
+#define __itt_relation_add_ex_ptr ITTNOTIFY_NAME(relation_add_ex)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_relation_add_to_current_ex(domain,clock_domain,timestamp,relation,tail)
+#define __itt_relation_add_to_current_ex_ptr 0
+#define __itt_relation_add_ex(domain,clock_domain,timestamp,head,relation,tail)
+#define __itt_relation_add_ex_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_relation_add_to_current_ex_ptr 0
+#define __itt_relation_add_ex_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/** @cond exclude_from_documentation */
+typedef enum ___itt_track_group_type
+{
+ __itt_track_group_type_normal = 0
+} __itt_track_group_type;
+/** @endcond */
+
+/** @cond exclude_from_documentation */
+#pragma pack(push, 8)
+
+typedef struct ___itt_track_group
+{
+ __itt_string_handle* name; /*!< Name of the track group */
+ struct ___itt_track* track; /*!< List of child tracks */
+ __itt_track_group_type tgtype; /*!< Type of the track group */
+ int extra1; /*!< Reserved. Must be zero */
+ void* extra2; /*!< Reserved. Must be zero */
+ struct ___itt_track_group* next;
+} __itt_track_group;
+
+#pragma pack(pop)
+/** @endcond */
+
+/**
+ * @brief Placeholder for custom track types. Currently, "normal" custom track
+ * is the only available track type.
+ */
+typedef enum ___itt_track_type
+{
+ __itt_track_type_normal = 0
+#ifdef INTEL_ITTNOTIFY_API_PRIVATE
+ , __itt_track_type_queue
+#endif /* INTEL_ITTNOTIFY_API_PRIVATE */
+} __itt_track_type;
+
+/** @cond exclude_from_documentation */
+#pragma pack(push, 8)
+
+typedef struct ___itt_track
+{
+    __itt_string_handle* name; /*!< Name of the track */
+ __itt_track_group* group; /*!< Parent group to a track */
+ __itt_track_type ttype; /*!< Type of the track */
+ int extra1; /*!< Reserved. Must be zero */
+ void* extra2; /*!< Reserved. Must be zero */
+ struct ___itt_track* next;
+} __itt_track;
+
+#pragma pack(pop)
+/** @endcond */
+
+/**
+ * @brief Create logical track group.
+ */
+__itt_track_group* ITTAPI __itt_track_group_create(__itt_string_handle* name, __itt_track_group_type track_group_type);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(ITTAPI, __itt_track_group*, track_group_create, (__itt_string_handle* name, __itt_track_group_type track_group_type))
+#define __itt_track_group_create ITTNOTIFY_DATA(track_group_create)
+#define __itt_track_group_create_ptr ITTNOTIFY_NAME(track_group_create)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_track_group_create(name,track_group_type) (__itt_track_group*)0
+#define __itt_track_group_create_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_track_group_create_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Create logical track.
+ */
+__itt_track* ITTAPI __itt_track_create(__itt_track_group* track_group, __itt_string_handle* name, __itt_track_type track_type);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(ITTAPI, __itt_track*, track_create, (__itt_track_group* track_group,__itt_string_handle* name, __itt_track_type track_type))
+#define __itt_track_create ITTNOTIFY_DATA(track_create)
+#define __itt_track_create_ptr ITTNOTIFY_NAME(track_create)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_track_create(track_group,name,track_type) (__itt_track*)0
+#define __itt_track_create_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_track_create_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Set the logical track.
+ */
+void ITTAPI __itt_set_track(__itt_track* track);
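+/*
+ * Example (illustrative sketch): grouping subsequent events under a custom track,
+ * assuming string handles created with __itt_string_handle_create(); whether passing
+ * NULL to __itt_set_track() restores the default track is an assumption here.
+ *
+ *     __itt_track_group* g = __itt_track_group_create(__itt_string_handle_create("GPU"),
+ *                                                     __itt_track_group_type_normal);
+ *     __itt_track* t = __itt_track_create(g, __itt_string_handle_create("Queue 0"),
+ *                                         __itt_track_type_normal);
+ *     __itt_set_track(t);
+ *     // ... tasks and markers recorded here are attributed to the track ...
+ *     __itt_set_track(NULL);
+ */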
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, set_track, (__itt_track *track))
+#define __itt_set_track ITTNOTIFY_VOID(set_track)
+#define __itt_set_track_ptr ITTNOTIFY_NAME(set_track)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_set_track(track)
+#define __itt_set_track_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_set_track_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/* ========================================================================== */
+/** @cond exclude_from_gpa_documentation */
+/**
+ * @defgroup events Events
+ * @ingroup public
+ * Events group
+ * @{
+ */
+/** @brief user event type */
+typedef int __itt_event;
+
+/**
+ * @brief Create an event notification
+ * @note Fails if name is null, if namelen does not match the name length, or if the user event feature is not enabled
+ * @return non-zero event identifier upon success and __itt_err otherwise
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+__itt_event LIBITTAPI __itt_event_createA(const char *name, int namelen);
+__itt_event LIBITTAPI __itt_event_createW(const wchar_t *name, int namelen);
+#if defined(UNICODE) || defined(_UNICODE)
+# define __itt_event_create __itt_event_createW
+# define __itt_event_create_ptr __itt_event_createW_ptr
+#else
+# define __itt_event_create __itt_event_createA
+# define __itt_event_create_ptr __itt_event_createA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+__itt_event LIBITTAPI __itt_event_create(const char *name, int namelen);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(LIBITTAPI, __itt_event, event_createA, (const char *name, int namelen))
+ITT_STUB(LIBITTAPI, __itt_event, event_createW, (const wchar_t *name, int namelen))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(LIBITTAPI, __itt_event, event_create, (const char *name, int namelen))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_event_createA ITTNOTIFY_DATA(event_createA)
+#define __itt_event_createA_ptr ITTNOTIFY_NAME(event_createA)
+#define __itt_event_createW ITTNOTIFY_DATA(event_createW)
+#define __itt_event_createW_ptr ITTNOTIFY_NAME(event_createW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_event_create ITTNOTIFY_DATA(event_create)
+#define __itt_event_create_ptr ITTNOTIFY_NAME(event_create)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_event_createA(name, namelen) (__itt_event)0
+#define __itt_event_createA_ptr 0
+#define __itt_event_createW(name, namelen) (__itt_event)0
+#define __itt_event_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_event_create(name, namelen) (__itt_event)0
+#define __itt_event_create_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_event_createA_ptr 0
+#define __itt_event_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_event_create_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Record an event occurrence.
+ * @return __itt_err upon failure (invalid event id/user event feature not enabled)
+ */
+int LIBITTAPI __itt_event_start(__itt_event event);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(LIBITTAPI, int, event_start, (__itt_event event))
+#define __itt_event_start ITTNOTIFY_DATA(event_start)
+#define __itt_event_start_ptr ITTNOTIFY_NAME(event_start)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_event_start(event) (int)0
+#define __itt_event_start_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_event_start_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Record an event end occurrence.
+ * @note Calling __itt_event_end is optional if events do not have durations.
+ * @return __itt_err upon failure (invalid event id/user event feature not enabled)
+ */
+int LIBITTAPI __itt_event_end(__itt_event event);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(LIBITTAPI, int, event_end, (__itt_event event))
+#define __itt_event_end ITTNOTIFY_DATA(event_end)
+#define __itt_event_end_ptr ITTNOTIFY_NAME(event_end)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_event_end(event) (int)0
+#define __itt_event_end_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_event_end_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
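+
+/**
+ * A minimal usage sketch (illustrative only) of the user-event calls above,
+ * assuming a non-UNICODE build and a collector with the user event feature
+ * enabled; the event name "frame" is made up for the example.
+ * @code
+ *     __itt_event frame = __itt_event_create("frame", 5);
+ *     if (frame != __itt_err) {
+ *         __itt_event_start(frame);
+ *         // ... work attributed to the event ...
+ *         __itt_event_end(frame);
+ *     }
+ * @endcode
+ */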
+/** @} events group */
+
+
+/**
+ * @defgroup arrays Arrays Visualizer
+ * @ingroup public
+ * Visualize arrays
+ * @{
+ */
+
+/**
+ * @enum __itt_av_data_type
+ * @brief Defines the types of array data (for C/C++ intrinsic types)
+ */
+typedef enum
+{
+ __itt_e_first = 0,
+ __itt_e_char = 0, /* 1-byte integer */
+ __itt_e_uchar, /* 1-byte unsigned integer */
+ __itt_e_int16, /* 2-byte integer */
+ __itt_e_uint16, /* 2-byte unsigned integer */
+ __itt_e_int32, /* 4-byte integer */
+ __itt_e_uint32, /* 4-byte unsigned integer */
+ __itt_e_int64, /* 8-byte integer */
+ __itt_e_uint64, /* 8-byte unsigned integer */
+ __itt_e_float, /* 4-byte floating */
+ __itt_e_double, /* 8-byte floating */
+ __itt_e_last = __itt_e_double
+} __itt_av_data_type;
+
+/**
+ * @brief Save array data to a file.
+ * The output format is defined by the file extension. The csv and bmp formats are supported (bmp is for 2-dimensional arrays only).
+ * @param[in] data - pointer to the array data
+ * @param[in] rank - the rank of the array
+ * @param[in] dimensions - pointer to an array of integers that specifies the array dimensions.
+ * The size of dimensions must be equal to the rank
+ * @param[in] type - the type of the array, specified as one of the __itt_av_data_type values (for intrinsic types)
+ * @param[in] filePath - the file path; the output format is defined by the file extension
+ * @param[in] columnOrder - defines how the array is stored in linear memory.
+ * It should be 1 for column-major order (e.g. in FORTRAN) or 0 for row-major order (e.g. in C).
+ */
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+int ITTAPI __itt_av_saveA(void *data, int rank, const int *dimensions, int type, const char *filePath, int columnOrder);
+int ITTAPI __itt_av_saveW(void *data, int rank, const int *dimensions, int type, const wchar_t *filePath, int columnOrder);
+#if defined(UNICODE) || defined(_UNICODE)
+# define __itt_av_save __itt_av_saveW
+# define __itt_av_save_ptr __itt_av_saveW_ptr
+#else /* UNICODE */
+# define __itt_av_save __itt_av_saveA
+# define __itt_av_save_ptr __itt_av_saveA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+int ITTAPI __itt_av_save(void *data, int rank, const int *dimensions, int type, const char *filePath, int columnOrder);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, int, av_saveA, (void *data, int rank, const int *dimensions, int type, const char *filePath, int columnOrder))
+ITT_STUB(ITTAPI, int, av_saveW, (void *data, int rank, const int *dimensions, int type, const wchar_t *filePath, int columnOrder))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, int, av_save, (void *data, int rank, const int *dimensions, int type, const char *filePath, int columnOrder))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_av_saveA ITTNOTIFY_DATA(av_saveA)
+#define __itt_av_saveA_ptr ITTNOTIFY_NAME(av_saveA)
+#define __itt_av_saveW ITTNOTIFY_DATA(av_saveW)
+#define __itt_av_saveW_ptr ITTNOTIFY_NAME(av_saveW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_av_save ITTNOTIFY_DATA(av_save)
+#define __itt_av_save_ptr ITTNOTIFY_NAME(av_save)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_av_saveA(name)
+#define __itt_av_saveA_ptr 0
+#define __itt_av_saveW(name)
+#define __itt_av_saveW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_av_save(name)
+#define __itt_av_save_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_av_saveA_ptr 0
+#define __itt_av_saveW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_av_save_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
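+
+/**
+ * A minimal usage sketch (illustrative only) for __itt_av_save, assuming a
+ * non-UNICODE build: saving a hypothetical 2-dimensional float array stored
+ * in row-major (C) order; the output format follows from the ".csv" extension.
+ * @code
+ *     static float grid[64][64];        // filled elsewhere
+ *     int dims[2] = { 64, 64 };
+ *     __itt_av_save(grid, 2, dims, __itt_e_float, "grid.csv", 0);  // 0 = row-major
+ * @endcode
+ */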
+
+void ITTAPI __itt_enable_attach(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, enable_attach, (void))
+#define __itt_enable_attach ITTNOTIFY_VOID(enable_attach)
+#define __itt_enable_attach_ptr ITTNOTIFY_NAME(enable_attach)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_enable_attach()
+#define __itt_enable_attach_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_enable_attach_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/** @cond exclude_from_gpa_documentation */
+
+/** @} arrays group */
+
+/** @endcond */
+
+/**
+ * @brief Module load info
+ * This API is used to report the necessary information when a module is relocated
+ * @param[in] start_addr - relocated module start address
+ * @param[in] end_addr - relocated module end address
+ * @param[in] path - file system path to the module
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+void ITTAPI __itt_module_loadA(void *start_addr, void *end_addr, const char *path);
+void ITTAPI __itt_module_loadW(void *start_addr, void *end_addr, const wchar_t *path);
+#if defined(UNICODE) || defined(_UNICODE)
+# define __itt_module_load __itt_module_loadW
+# define __itt_module_load_ptr __itt_module_loadW_ptr
+#else /* UNICODE */
+# define __itt_module_load __itt_module_loadA
+# define __itt_module_load_ptr __itt_module_loadA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+void ITTAPI __itt_module_load(void *start_addr, void *end_addr, const char *path);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, void, module_loadA, (void *start_addr, void *end_addr, const char *path))
+ITT_STUB(ITTAPI, void, module_loadW, (void *start_addr, void *end_addr, const wchar_t *path))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, void, module_load, (void *start_addr, void *end_addr, const char *path))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_module_loadA ITTNOTIFY_VOID(module_loadA)
+#define __itt_module_loadA_ptr ITTNOTIFY_NAME(module_loadA)
+#define __itt_module_loadW ITTNOTIFY_VOID(module_loadW)
+#define __itt_module_loadW_ptr ITTNOTIFY_NAME(module_loadW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_module_load ITTNOTIFY_VOID(module_load)
+#define __itt_module_load_ptr ITTNOTIFY_NAME(module_load)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_module_loadA(start_addr, end_addr, path)
+#define __itt_module_loadA_ptr 0
+#define __itt_module_loadW(start_addr, end_addr, path)
+#define __itt_module_loadW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_module_load(start_addr, end_addr, path)
+#define __itt_module_load_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_module_loadA_ptr 0
+#define __itt_module_loadW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_module_load_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
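+
+/**
+ * A minimal usage sketch (illustrative only) for __itt_module_load; the
+ * addresses and path below are hypothetical placeholders.
+ * @code
+ *     char* base = (char*)0x10000000;          // hypothetical load address
+ *     __itt_module_load(base, base + 0x200000, // hypothetical module end
+ *                       "/opt/app/libplugin.so");
+ * @endcode
+ */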
+
+
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* _ITTNOTIFY_H_ */
+
+#ifdef INTEL_ITTNOTIFY_API_PRIVATE
+
+#ifndef _ITTNOTIFY_PRIVATE_
+#define _ITTNOTIFY_PRIVATE_
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/**
+ * @ingroup clockdomain
+ * @brief Begin an overlapped task instance.
+ * @param[in] domain The domain for this task
+ * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] timestamp The user defined timestamp.
+ * @param[in] taskid The identifier for this task instance, *cannot* be __itt_null.
+ * @param[in] parentid The parent of this task, or __itt_null.
+ * @param[in] name The name of this task.
+ */
+void ITTAPI __itt_task_begin_overlapped_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid, __itt_id parentid, __itt_string_handle* name);
+
+/**
+ * @ingroup clockdomain
+ * @brief End an overlapped task instance.
+ * @param[in] domain The domain for this task
+ * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] timestamp The user defined timestamp.
+ * @param[in] taskid Explicit ID of finished task
+ */
+void ITTAPI __itt_task_end_overlapped_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, task_begin_overlapped_ex, (const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid, __itt_id parentid, __itt_string_handle* name))
+ITT_STUBV(ITTAPI, void, task_end_overlapped_ex, (const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid))
+#define __itt_task_begin_overlapped_ex(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(task_begin_overlapped_ex,d,x,y,z,a,b)
+#define __itt_task_begin_overlapped_ex_ptr ITTNOTIFY_NAME(task_begin_overlapped_ex)
+#define __itt_task_end_overlapped_ex(d,x,y,z) ITTNOTIFY_VOID_D3(task_end_overlapped_ex,d,x,y,z)
+#define __itt_task_end_overlapped_ex_ptr ITTNOTIFY_NAME(task_end_overlapped_ex)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_task_begin_overlapped_ex(domain,clock_domain,timestamp,taskid,parentid,name)
+#define __itt_task_begin_overlapped_ex_ptr 0
+#define __itt_task_end_overlapped_ex(domain,clock_domain,timestamp,taskid)
+#define __itt_task_end_overlapped_ex_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_task_begin_overlapped_ex_ptr 0
+#define __itt_task_end_overlapped_ptr 0
+#define __itt_task_end_overlapped_ex_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
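+
+/**
+ * A minimal usage sketch (illustrative only) for overlapped tasks on a user
+ * clock domain. The domain, clock domain, string handle and timestamps are
+ * assumed to have been created earlier with the corresponding public APIs.
+ * @code
+ *     static int request;                          // object used to derive an ID
+ *     __itt_id id = __itt_id_make(&request, 1);    // must not be __itt_null
+ *     __itt_task_begin_overlapped_ex(domain, clock_dom, ts_begin, id,
+ *                                    __itt_null, name_handle);
+ *     // ... other overlapping tasks may begin and end here ...
+ *     __itt_task_end_overlapped_ex(domain, clock_dom, ts_end, id);
+ * @endcode
+ */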
+
+/**
+ * @defgroup makrs_internal Marks
+ * @ingroup internal
+ * Marks group
+ * @warning Internal API:
+ * - It is not shipped to outside of Intel
+ * - It is delivered to internal Intel teams using e-mail or SVN access only
+ * @{
+ */
+/** @brief user mark type */
+typedef int __itt_mark_type;
+
+/**
+ * @brief Creates a user mark type with the specified name using char or Unicode string.
+ * @param[in] name - name of mark to create
+ * @return Returns a handle to the mark type
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+__itt_mark_type ITTAPI __itt_mark_createA(const char *name);
+__itt_mark_type ITTAPI __itt_mark_createW(const wchar_t *name);
+#if defined(UNICODE) || defined(_UNICODE)
+# define __itt_mark_create __itt_mark_createW
+# define __itt_mark_create_ptr __itt_mark_createW_ptr
+#else /* UNICODE */
+# define __itt_mark_create __itt_mark_createA
+# define __itt_mark_create_ptr __itt_mark_createA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+__itt_mark_type ITTAPI __itt_mark_create(const char *name);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_mark_type, mark_createA, (const char *name))
+ITT_STUB(ITTAPI, __itt_mark_type, mark_createW, (const wchar_t *name))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_mark_type, mark_create, (const char *name))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_mark_createA ITTNOTIFY_DATA(mark_createA)
+#define __itt_mark_createA_ptr ITTNOTIFY_NAME(mark_createA)
+#define __itt_mark_createW ITTNOTIFY_DATA(mark_createW)
+#define __itt_mark_createW_ptr ITTNOTIFY_NAME(mark_createW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_mark_create ITTNOTIFY_DATA(mark_create)
+#define __itt_mark_create_ptr ITTNOTIFY_NAME(mark_create)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_mark_createA(name) (__itt_mark_type)0
+#define __itt_mark_createA_ptr 0
+#define __itt_mark_createW(name) (__itt_mark_type)0
+#define __itt_mark_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_mark_create(name) (__itt_mark_type)0
+#define __itt_mark_create_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_mark_createA_ptr 0
+#define __itt_mark_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_mark_create_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Places a "discrete" user mark of the specified type with an optional string parameter (char or Unicode).
+ *
+ * - On success, the "discrete" mark is placed into the collection results. It appears in overtime view(s) as a special tick sign.
+ * - The call is "synchronous": the function returns only after the mark has actually been added to the results.
+ * - This function is useful, for example, to mark different phases of an application
+ *   (the beginning of the next mark automatically means the end of the current region).
+ * - Can be used together with "continuous" marks (see below) within the same collection session.
+ * @param[in] mt - mark type, created by the __itt_mark_create(const char* name) function
+ * @param[in] parameter - string parameter of the mark
+ * @return Returns zero in case of success, a non-zero value otherwise.
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+int ITTAPI __itt_markA(__itt_mark_type mt, const char *parameter);
+int ITTAPI __itt_markW(__itt_mark_type mt, const wchar_t *parameter);
+#if defined(UNICODE) || defined(_UNICODE)
+# define __itt_mark __itt_markW
+# define __itt_mark_ptr __itt_markW_ptr
+#else /* UNICODE */
+# define __itt_mark __itt_markA
+# define __itt_mark_ptr __itt_markA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+int ITTAPI __itt_mark(__itt_mark_type mt, const char *parameter);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, int, markA, (__itt_mark_type mt, const char *parameter))
+ITT_STUB(ITTAPI, int, markW, (__itt_mark_type mt, const wchar_t *parameter))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, int, mark, (__itt_mark_type mt, const char *parameter))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_markA ITTNOTIFY_DATA(markA)
+#define __itt_markA_ptr ITTNOTIFY_NAME(markA)
+#define __itt_markW ITTNOTIFY_DATA(markW)
+#define __itt_markW_ptr ITTNOTIFY_NAME(markW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_mark ITTNOTIFY_DATA(mark)
+#define __itt_mark_ptr ITTNOTIFY_NAME(mark)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_markA(mt, parameter) (int)0
+#define __itt_markA_ptr 0
+#define __itt_markW(mt, parameter) (int)0
+#define __itt_markW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_mark(mt, parameter) (int)0
+#define __itt_mark_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_markA_ptr 0
+#define __itt_markW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_mark_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
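+
+/**
+ * A minimal usage sketch (illustrative only) of a "discrete" mark, assuming a
+ * non-UNICODE build; the mark name and parameter are made up for the example.
+ * @code
+ *     __itt_mark_type phase = __itt_mark_create("phase");
+ *     __itt_mark(phase, "initialization done");    // synchronous, returns 0 on success
+ * @endcode
+ */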
+
+/**
+ * @brief Use this if necessary to create a "discrete" user event type (mark) for the process
+ * rather than for one thread
+ * @see int __itt_mark(__itt_mark_type mt, const char* parameter);
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+int ITTAPI __itt_mark_globalA(__itt_mark_type mt, const char *parameter);
+int ITTAPI __itt_mark_globalW(__itt_mark_type mt, const wchar_t *parameter);
+#if defined(UNICODE) || defined(_UNICODE)
+# define __itt_mark_global __itt_mark_globalW
+# define __itt_mark_global_ptr __itt_mark_globalW_ptr
+#else /* UNICODE */
+# define __itt_mark_global __itt_mark_globalA
+# define __itt_mark_global_ptr __itt_mark_globalA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+int ITTAPI __itt_mark_global(__itt_mark_type mt, const char *parameter);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, int, mark_globalA, (__itt_mark_type mt, const char *parameter))
+ITT_STUB(ITTAPI, int, mark_globalW, (__itt_mark_type mt, const wchar_t *parameter))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, int, mark_global, (__itt_mark_type mt, const char *parameter))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_mark_globalA ITTNOTIFY_DATA(mark_globalA)
+#define __itt_mark_globalA_ptr ITTNOTIFY_NAME(mark_globalA)
+#define __itt_mark_globalW ITTNOTIFY_DATA(mark_globalW)
+#define __itt_mark_globalW_ptr ITTNOTIFY_NAME(mark_globalW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_mark_global ITTNOTIFY_DATA(mark_global)
+#define __itt_mark_global_ptr ITTNOTIFY_NAME(mark_global)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_mark_globalA(mt, parameter) (int)0
+#define __itt_mark_globalA_ptr 0
+#define __itt_mark_globalW(mt, parameter) (int)0
+#define __itt_mark_globalW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_mark_global(mt, parameter) (int)0
+#define __itt_mark_global_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_mark_globalA_ptr 0
+#define __itt_mark_globalW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_mark_global_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Creates an "end" point for a "continuous" mark with the specified name.
+ *
+ * - Returns zero in case of success, a non-zero value otherwise.
+ *   It also returns a non-zero value when the preceding "begin" point for the
+ *   mark with the same name failed to be created or was never created.
+ * - On success, the "continuous" mark is placed into the collection results.
+ *   It appears in overtime view(s) as a special tick sign (different from the
+ *   "discrete" mark) together with a line from the corresponding "begin" mark
+ *   to the "end" mark.
+ * @note Continuous marks can overlap and be nested inside each other.
+ *       A discrete mark can be nested inside a marked region.
+ * @param[in] mt - mark type, created by the __itt_mark_create(const char* name) function
+ * @return Returns zero in case of success, a non-zero value otherwise.
+ */
+int ITTAPI __itt_mark_off(__itt_mark_type mt);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(ITTAPI, int, mark_off, (__itt_mark_type mt))
+#define __itt_mark_off ITTNOTIFY_DATA(mark_off)
+#define __itt_mark_off_ptr ITTNOTIFY_NAME(mark_off)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_mark_off(mt) (int)0
+#define __itt_mark_off_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_mark_off_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Use this if necessary to create an "end" point for a mark of the process
+ * @see int __itt_mark_off(__itt_mark_type mt);
+ */
+int ITTAPI __itt_mark_global_off(__itt_mark_type mt);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(ITTAPI, int, mark_global_off, (__itt_mark_type mt))
+#define __itt_mark_global_off ITTNOTIFY_DATA(mark_global_off)
+#define __itt_mark_global_off_ptr ITTNOTIFY_NAME(mark_global_off)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_mark_global_off(mt) (int)0
+#define __itt_mark_global_off_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_mark_global_off_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} marks group */
+
+/**
+ * @defgroup counters_internal Counters
+ * @ingroup internal
+ * Counters group
+ * @{
+ */
+
+
+/**
+ * @defgroup stitch Stack Stitching
+ * @ingroup internal
+ * Stack Stitching group
+ * @{
+ */
+/**
+ * @brief Opaque structure identifying a stitch point (caller)
+ */
+typedef struct ___itt_caller *__itt_caller;
+
+/**
+ * @brief Create a stitch point, i.e. a point in the call stack to which other stacks should be stitched.
+ * The function returns a unique identifier which is used to match cut points with the corresponding stitch point.
+ */
+__itt_caller ITTAPI __itt_stack_caller_create(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(ITTAPI, __itt_caller, stack_caller_create, (void))
+#define __itt_stack_caller_create ITTNOTIFY_DATA(stack_caller_create)
+#define __itt_stack_caller_create_ptr ITTNOTIFY_NAME(stack_caller_create)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_stack_caller_create() (__itt_caller)0
+#define __itt_stack_caller_create_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_stack_caller_create_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Destroy the information about the stitch point identified by the pointer previously returned by __itt_stack_caller_create()
+ */
+void ITTAPI __itt_stack_caller_destroy(__itt_caller id);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, stack_caller_destroy, (__itt_caller id))
+#define __itt_stack_caller_destroy ITTNOTIFY_VOID(stack_caller_destroy)
+#define __itt_stack_caller_destroy_ptr ITTNOTIFY_NAME(stack_caller_destroy)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_stack_caller_destroy(id)
+#define __itt_stack_caller_destroy_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_stack_caller_destroy_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Sets the cut point. The stack of each event that occurs after this call will be cut
+ * at the stack level at which this function was called, and stitched to the corresponding stitch point.
+ */
+void ITTAPI __itt_stack_callee_enter(__itt_caller id);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, stack_callee_enter, (__itt_caller id))
+#define __itt_stack_callee_enter ITTNOTIFY_VOID(stack_callee_enter)
+#define __itt_stack_callee_enter_ptr ITTNOTIFY_NAME(stack_callee_enter)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_stack_callee_enter(id)
+#define __itt_stack_callee_enter_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_stack_callee_enter_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief This function eliminates the cut point that was set by the latest __itt_stack_callee_enter() call.
+ */
+void ITTAPI __itt_stack_callee_leave(__itt_caller id);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, stack_callee_leave, (__itt_caller id))
+#define __itt_stack_callee_leave ITTNOTIFY_VOID(stack_callee_leave)
+#define __itt_stack_callee_leave_ptr ITTNOTIFY_NAME(stack_callee_leave)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_stack_callee_leave(id)
+#define __itt_stack_callee_leave_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_stack_callee_leave_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
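+
+/**
+ * A minimal usage sketch (illustrative only) of stack stitching around
+ * deferred work, e.g. a task submitted on one thread and executed later,
+ * possibly on another thread.
+ * @code
+ *     // at submission time:
+ *     __itt_caller site = __itt_stack_caller_create();
+ *
+ *     // at execution time:
+ *     __itt_stack_callee_enter(site);
+ *     // ... stacks of events here are cut and stitched to the creation site ...
+ *     __itt_stack_callee_leave(site);
+ *
+ *     // once the stitch point is no longer needed:
+ *     __itt_stack_caller_destroy(site);
+ * @endcode
+ */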
+
+/** @} stitch group */
+
+/* ***************************************************************************************************************************** */
+
+#include <stdarg.h>
+
+/** @cond exclude_from_documentation */
+typedef enum __itt_error_code
+{
+ __itt_error_success = 0, /*!< no error */
+ __itt_error_no_module = 1, /*!< module can't be loaded */
+ /* %1$s -- library name; win: %2$d -- system error code; unx: %2$s -- system error message. */
+ __itt_error_no_symbol = 2, /*!< symbol not found */
+ /* %1$s -- library name, %2$s -- symbol name. */
+ __itt_error_unknown_group = 3, /*!< unknown group specified */
+ /* %1$s -- env var name, %2$s -- group name. */
+ __itt_error_cant_read_env = 4, /*!< GetEnvironmentVariable() failed */
+ /* %1$s -- env var name, %2$d -- system error. */
+ __itt_error_env_too_long = 5, /*!< variable value too long */
+ /* %1$s -- env var name, %2$d -- actual length of the var, %3$d -- max allowed length. */
+ __itt_error_system = 6 /*!< pthread_mutexattr_init or pthread_mutex_init failed */
+ /* %1$s -- function name, %2$d -- errno. */
+} __itt_error_code;
+
+typedef void (__itt_error_handler_t)(__itt_error_code code, va_list);
+__itt_error_handler_t* __itt_set_error_handler(__itt_error_handler_t*);
+
+const char* ITTAPI __itt_api_version(void);
+/** @endcond */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#define __itt_error_handler ITT_JOIN(INTEL_ITTNOTIFY_PREFIX, error_handler)
+void __itt_error_handler(__itt_error_code code, va_list args);
+extern const int ITTNOTIFY_NAME(err);
+#define __itt_err ITTNOTIFY_NAME(err)
+ITT_STUB(ITTAPI, const char*, api_version, (void))
+#define __itt_api_version ITTNOTIFY_DATA(api_version)
+#define __itt_api_version_ptr ITTNOTIFY_NAME(api_version)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_api_version() (const char*)0
+#define __itt_api_version_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_api_version_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
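+
+/**
+ * A minimal usage sketch (illustrative only) of installing a custom error
+ * handler and querying the API version string; the handler body is a made-up
+ * example and assumes <stdio.h> is available.
+ * @code
+ *     static void my_itt_error_handler(__itt_error_code code, va_list args)
+ *     {
+ *         (void)args;
+ *         fprintf(stderr, "ittnotify error: %d\n", (int)code);
+ *     }
+ *
+ *     // later, e.g. during application startup:
+ *     __itt_error_handler_t* prev = __itt_set_error_handler(my_itt_error_handler);
+ *     const char* version = __itt_api_version();
+ * @endcode
+ */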
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* _ITTNOTIFY_PRIVATE_ */
+
+#endif /* INTEL_ITTNOTIFY_API_PRIVATE */
diff --git a/contrib/libs/tbb/src/tbb/tools_api/ittnotify_config.h b/contrib/libs/tbb/src/tbb/tools_api/ittnotify_config.h
new file mode 100644
index 0000000000..c25730d522
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/tools_api/ittnotify_config.h
@@ -0,0 +1,585 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef _ITTNOTIFY_CONFIG_H_
+#define _ITTNOTIFY_CONFIG_H_
+
+/** @cond exclude_from_documentation */
+#ifndef ITT_OS_WIN
+# define ITT_OS_WIN 1
+#endif /* ITT_OS_WIN */
+
+#ifndef ITT_OS_LINUX
+# define ITT_OS_LINUX 2
+#endif /* ITT_OS_LINUX */
+
+#ifndef ITT_OS_MAC
+# define ITT_OS_MAC 3
+#endif /* ITT_OS_MAC */
+
+#ifndef ITT_OS_FREEBSD
+# define ITT_OS_FREEBSD 4
+#endif /* ITT_OS_FREEBSD */
+
+#ifndef ITT_OS
+# if defined WIN32 || defined _WIN32
+# define ITT_OS ITT_OS_WIN
+# elif defined( __APPLE__ ) && defined( __MACH__ )
+# define ITT_OS ITT_OS_MAC
+# elif defined( __FreeBSD__ )
+# define ITT_OS ITT_OS_FREEBSD
+# else
+# define ITT_OS ITT_OS_LINUX
+# endif
+#endif /* ITT_OS */
+
+#ifndef ITT_PLATFORM_WIN
+# define ITT_PLATFORM_WIN 1
+#endif /* ITT_PLATFORM_WIN */
+
+#ifndef ITT_PLATFORM_POSIX
+# define ITT_PLATFORM_POSIX 2
+#endif /* ITT_PLATFORM_POSIX */
+
+#ifndef ITT_PLATFORM_MAC
+# define ITT_PLATFORM_MAC 3
+#endif /* ITT_PLATFORM_MAC */
+
+#ifndef ITT_PLATFORM_FREEBSD
+# define ITT_PLATFORM_FREEBSD 4
+#endif /* ITT_PLATFORM_FREEBSD */
+
+#ifndef ITT_PLATFORM
+# if ITT_OS==ITT_OS_WIN
+# define ITT_PLATFORM ITT_PLATFORM_WIN
+# elif ITT_OS==ITT_OS_MAC
+# define ITT_PLATFORM ITT_PLATFORM_MAC
+# elif ITT_OS==ITT_OS_FREEBSD
+# define ITT_PLATFORM ITT_PLATFORM_FREEBSD
+# else
+# define ITT_PLATFORM ITT_PLATFORM_POSIX
+# endif
+#endif /* ITT_PLATFORM */
+
+#if defined(_UNICODE) && !defined(UNICODE)
+#define UNICODE
+#endif
+
+#include <stddef.h>
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#include <tchar.h>
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#include <stdint.h>
+#if defined(UNICODE) || defined(_UNICODE)
+#include <wchar.h>
+#endif /* UNICODE || _UNICODE */
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+#ifndef ITTAPI_CDECL
+# if ITT_PLATFORM==ITT_PLATFORM_WIN
+# define ITTAPI_CDECL __cdecl
+# else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+# if defined _M_IX86 || defined __i386__
+# define ITTAPI_CDECL __attribute__ ((cdecl))
+# else /* _M_IX86 || __i386__ */
+# define ITTAPI_CDECL /* actual only on x86 platform */
+# endif /* _M_IX86 || __i386__ */
+# endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* ITTAPI_CDECL */
+
+#ifndef STDCALL
+# if ITT_PLATFORM==ITT_PLATFORM_WIN
+# define STDCALL __stdcall
+# else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+# if defined _M_IX86 || defined __i386__
+# define STDCALL __attribute__ ((stdcall))
+# else /* _M_IX86 || __i386__ */
+# define STDCALL /* supported only on x86 platform */
+# endif /* _M_IX86 || __i386__ */
+# endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* STDCALL */
+
+#define ITTAPI ITTAPI_CDECL
+#define LIBITTAPI ITTAPI_CDECL
+
+/* TODO: Temporary for compatibility! */
+#define ITTAPI_CALL ITTAPI_CDECL
+#define LIBITTAPI_CALL ITTAPI_CDECL
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+/* use __forceinline (VC++ specific) */
+#define ITT_INLINE __forceinline
+#define ITT_INLINE_ATTRIBUTE /* nothing */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+/*
+ * Generally, functions are not inlined unless optimization is specified.
+ * For functions declared inline, this attribute inlines the function even
+ * if no optimization level was specified.
+ */
+#ifdef __STRICT_ANSI__
+#define ITT_INLINE static
+#define ITT_INLINE_ATTRIBUTE __attribute__((unused))
+#else /* __STRICT_ANSI__ */
+#define ITT_INLINE static inline
+#define ITT_INLINE_ATTRIBUTE __attribute__((always_inline, unused))
+#endif /* __STRICT_ANSI__ */
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+/** @endcond */
+
+#ifndef ITT_ARCH_IA32
+# define ITT_ARCH_IA32 1
+#endif /* ITT_ARCH_IA32 */
+
+#ifndef ITT_ARCH_IA32E
+# define ITT_ARCH_IA32E 2
+#endif /* ITT_ARCH_IA32E */
+
+#ifndef ITT_ARCH_ARM
+# define ITT_ARCH_ARM 4
+#endif /* ITT_ARCH_ARM */
+
+#ifndef ITT_ARCH_PPC64
+# define ITT_ARCH_PPC64 5
+#endif /* ITT_ARCH_PPC64 */
+
+#ifndef ITT_ARCH
+# if defined _M_IX86 || defined __i386__
+# define ITT_ARCH ITT_ARCH_IA32
+# elif defined _M_X64 || defined _M_AMD64 || defined __x86_64__
+# define ITT_ARCH ITT_ARCH_IA32E
+# elif defined _M_IA64 || defined __ia64__
+# define ITT_ARCH ITT_ARCH_IA64
+# elif defined _M_ARM || defined __arm__
+# define ITT_ARCH ITT_ARCH_ARM
+# elif defined __powerpc64__
+# define ITT_ARCH ITT_ARCH_PPC64
+# endif
+#endif
+
+#ifdef __cplusplus
+# define ITT_EXTERN_C extern "C"
+# define ITT_EXTERN_C_BEGIN extern "C" {
+# define ITT_EXTERN_C_END }
+#else
+# define ITT_EXTERN_C /* nothing */
+# define ITT_EXTERN_C_BEGIN /* nothing */
+# define ITT_EXTERN_C_END /* nothing */
+#endif /* __cplusplus */
+
+#define ITT_TO_STR_AUX(x) #x
+#define ITT_TO_STR(x) ITT_TO_STR_AUX(x)
+
+#define __ITT_BUILD_ASSERT(expr, suffix) do { \
+ static char __itt_build_check_##suffix[(expr) ? 1 : -1]; \
+ __itt_build_check_##suffix[0] = 0; \
+} while(0)
+#define _ITT_BUILD_ASSERT(expr, suffix) __ITT_BUILD_ASSERT((expr), suffix)
+#define ITT_BUILD_ASSERT(expr) _ITT_BUILD_ASSERT((expr), __LINE__)
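+
+/* A minimal usage sketch (illustrative only): ITT_BUILD_ASSERT is intended for
+ * statement context (inside a function body) and fails compilation via a
+ * negatively sized array when the condition is false, e.g.
+ *
+ *     ITT_BUILD_ASSERT(sizeof(long) >= 4);
+ */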
+
+#define ITT_MAGIC { 0xED, 0xAB, 0xAB, 0xEC, 0x0D, 0xEE, 0xDA, 0x30 }
+
+/* Replace with snapshot date YYYYMMDD for promotion build. */
+#define API_VERSION_BUILD 20180723
+
+#ifndef API_VERSION_NUM
+#define API_VERSION_NUM 0.0.0
+#endif /* API_VERSION_NUM */
+
+#define API_VERSION "ITT-API-Version " ITT_TO_STR(API_VERSION_NUM) \
+ " (" ITT_TO_STR(API_VERSION_BUILD) ")"
+
+/* OS communication functions */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#include <windows.h>
+typedef HMODULE lib_t;
+typedef DWORD TIDT;
+typedef CRITICAL_SECTION mutex_t;
+#define MUTEX_INITIALIZER { 0 }
+#define strong_alias(name, aliasname) /* empty for Windows */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#include <dlfcn.h>
+#if defined(UNICODE) || defined(_UNICODE)
+#include <wchar.h>
+#endif /* UNICODE */
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE 1 /* need for PTHREAD_MUTEX_RECURSIVE */
+#endif /* _GNU_SOURCE */
+#ifndef __USE_UNIX98
+#define __USE_UNIX98 1 /* need for PTHREAD_MUTEX_RECURSIVE, on SLES11.1 with gcc 4.3.4 wherein pthread.h missing dependency on __USE_XOPEN2K8 */
+#endif /*__USE_UNIX98*/
+#include <pthread.h>
+typedef void* lib_t;
+typedef pthread_t TIDT;
+typedef pthread_mutex_t mutex_t;
+#define MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER
+#define _strong_alias(name, aliasname) \
+ extern __typeof (name) aliasname __attribute__ ((alias (#name)));
+#define strong_alias(name, aliasname) _strong_alias(name, aliasname)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_get_proc(lib, name) GetProcAddress(lib, name)
+#define __itt_mutex_init(mutex) InitializeCriticalSection(mutex)
+#define __itt_mutex_lock(mutex) EnterCriticalSection(mutex)
+#define __itt_mutex_unlock(mutex) LeaveCriticalSection(mutex)
+#define __itt_load_lib(name) LoadLibraryA(name)
+#define __itt_unload_lib(handle) FreeLibrary(handle)
+#define __itt_system_error() (int)GetLastError()
+#define __itt_fstrcmp(s1, s2) lstrcmpA(s1, s2)
+#define __itt_fstrnlen(s, l) strnlen_s(s, l)
+#define __itt_fstrcpyn(s1, b, s2, l) strncpy_s(s1, b, s2, l)
+#define __itt_fstrdup(s) _strdup(s)
+#define __itt_thread_id() GetCurrentThreadId()
+#define __itt_thread_yield() SwitchToThread()
+#ifndef ITT_SIMPLE_INIT
+ITT_INLINE long
+__itt_interlocked_increment(volatile long* ptr) ITT_INLINE_ATTRIBUTE;
+ITT_INLINE long __itt_interlocked_increment(volatile long* ptr)
+{
+ return InterlockedIncrement(ptr);
+}
+#endif /* ITT_SIMPLE_INIT */
+
+#define DL_SYMBOLS (1)
+#define PTHREAD_SYMBOLS (1)
+
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+#define __itt_get_proc(lib, name) dlsym(lib, name)
+#define __itt_mutex_init(mutex) {\
+ pthread_mutexattr_t mutex_attr; \
+ int error_code = pthread_mutexattr_init(&mutex_attr); \
+ if (error_code) \
+ __itt_report_error(__itt_error_system, "pthread_mutexattr_init", \
+ error_code); \
+ error_code = pthread_mutexattr_settype(&mutex_attr, \
+ PTHREAD_MUTEX_RECURSIVE); \
+ if (error_code) \
+ __itt_report_error(__itt_error_system, "pthread_mutexattr_settype", \
+ error_code); \
+ error_code = pthread_mutex_init(mutex, &mutex_attr); \
+ if (error_code) \
+ __itt_report_error(__itt_error_system, "pthread_mutex_init", \
+ error_code); \
+ error_code = pthread_mutexattr_destroy(&mutex_attr); \
+ if (error_code) \
+ __itt_report_error(__itt_error_system, "pthread_mutexattr_destroy", \
+ error_code); \
+}
+#define __itt_mutex_lock(mutex) pthread_mutex_lock(mutex)
+#define __itt_mutex_unlock(mutex) pthread_mutex_unlock(mutex)
+#define __itt_load_lib(name) dlopen(name, RTLD_LAZY)
+#define __itt_unload_lib(handle) dlclose(handle)
+#define __itt_system_error() errno
+#define __itt_fstrcmp(s1, s2) strcmp(s1, s2)
+
+/* customer code may define the safe string APIs SDL_STRNLEN_S and SDL_STRNCPY_S to be used instead of the defaults below */
+#ifdef SDL_STRNLEN_S
+#define __itt_fstrnlen(s, l) SDL_STRNLEN_S(s, l)
+#else
+#define __itt_fstrnlen(s, l) strlen(s)
+#endif /* SDL_STRNLEN_S */
+#ifdef SDL_STRNCPY_S
+#define __itt_fstrcpyn(s1, b, s2, l) SDL_STRNCPY_S(s1, b, s2, l)
+#else
+#define __itt_fstrcpyn(s1, b, s2, l) { \
+ if (b > 0) { \
+ /* 'volatile' is used to suppress the warning that a destination */ \
+ /* bound depends on the length of the source. */ \
+ volatile size_t num_to_copy = (size_t)(b - 1) < (size_t)(l) ? \
+ (size_t)(b - 1) : (size_t)(l); \
+ strncpy(s1, s2, num_to_copy); \
+ s1[num_to_copy] = 0; \
+ } \
+}
+#endif /* SDL_STRNCPY_S */
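+
+/* A minimal usage sketch (illustrative only) for the fallback __itt_fstrcpyn
+ * above: at most b-1 characters are copied and the destination is always
+ * 0-terminated, e.g.
+ *
+ *     char buf[16];
+ *     __itt_fstrcpyn(buf, sizeof(buf), "a fairly long source string", 27);
+ */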
+
+#define __itt_fstrdup(s) strdup(s)
+#define __itt_thread_id() pthread_self()
+#define __itt_thread_yield() sched_yield()
+#if ITT_ARCH==ITT_ARCH_IA64
+#ifdef __INTEL_COMPILER
+#define __TBB_machine_fetchadd4(addr, val) __fetchadd4_acq((void *)addr, val)
+#else /* __INTEL_COMPILER */
+/* TODO: Add support for non-Intel compilers on the IA-64 architecture */
+#endif /* __INTEL_COMPILER */
+#elif ITT_ARCH==ITT_ARCH_IA32 || ITT_ARCH==ITT_ARCH_IA32E /* ITT_ARCH!=ITT_ARCH_IA64 */
+ITT_INLINE long
+__TBB_machine_fetchadd4(volatile void* ptr, long addend) ITT_INLINE_ATTRIBUTE;
+ITT_INLINE long __TBB_machine_fetchadd4(volatile void* ptr, long addend)
+{
+ long result;
+ __asm__ __volatile__("lock\nxadd %0,%1"
+ : "=r"(result),"=m"(*(int*)ptr)
+ : "0"(addend), "m"(*(int*)ptr)
+ : "memory");
+ return result;
+}
+#elif ITT_ARCH==ITT_ARCH_ARM || ITT_ARCH==ITT_ARCH_PPC64
+#define __TBB_machine_fetchadd4(addr, val) __sync_fetch_and_add(addr, val)
+#endif /* ITT_ARCH==ITT_ARCH_IA64 */
+#ifndef ITT_SIMPLE_INIT
+ITT_INLINE long
+__itt_interlocked_increment(volatile long* ptr) ITT_INLINE_ATTRIBUTE;
+ITT_INLINE long __itt_interlocked_increment(volatile long* ptr)
+{
+ return __TBB_machine_fetchadd4(ptr, 1) + 1L;
+}
+#endif /* ITT_SIMPLE_INIT */
+
+void* dlopen(const char*, int) __attribute__((weak));
+void* dlsym(void*, const char*) __attribute__((weak));
+int dlclose(void*) __attribute__((weak));
+#define DL_SYMBOLS (dlopen && dlsym && dlclose)
+
+int pthread_mutex_init(pthread_mutex_t*, const pthread_mutexattr_t*) __attribute__((weak));
+int pthread_mutex_lock(pthread_mutex_t*) __attribute__((weak));
+int pthread_mutex_unlock(pthread_mutex_t*) __attribute__((weak));
+int pthread_mutex_destroy(pthread_mutex_t*) __attribute__((weak));
+int pthread_mutexattr_init(pthread_mutexattr_t*) __attribute__((weak));
+int pthread_mutexattr_settype(pthread_mutexattr_t*, int) __attribute__((weak));
+int pthread_mutexattr_destroy(pthread_mutexattr_t*) __attribute__((weak));
+pthread_t pthread_self(void) __attribute__((weak));
+#define PTHREAD_SYMBOLS (pthread_mutex_init && pthread_mutex_lock && pthread_mutex_unlock && pthread_mutex_destroy && pthread_mutexattr_init && pthread_mutexattr_settype && pthread_mutexattr_destroy && pthread_self)
+
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+typedef enum {
+ __itt_collection_normal = 0,
+ __itt_collection_paused = 1
+} __itt_collection_state;
+
+typedef enum {
+ __itt_thread_normal = 0,
+ __itt_thread_ignored = 1
+} __itt_thread_state;
+
+#pragma pack(push, 8)
+
+typedef struct ___itt_thread_info
+{
+ const char* nameA; /*!< Copy of original name in ASCII. */
+#if defined(UNICODE) || defined(_UNICODE)
+ const wchar_t* nameW; /*!< Copy of original name in UNICODE. */
+#else /* UNICODE || _UNICODE */
+ void* nameW;
+#endif /* UNICODE || _UNICODE */
+ TIDT tid;
+ __itt_thread_state state; /*!< Thread state (paused or normal) */
+ int extra1; /*!< Reserved to the runtime */
+ void* extra2; /*!< Reserved to the runtime */
+ struct ___itt_thread_info* next;
+} __itt_thread_info;
+
+#include "ittnotify_types.h" /* For __itt_group_id definition */
+
+typedef struct ___itt_api_info_20101001
+{
+ const char* name;
+ void** func_ptr;
+ void* init_func;
+ __itt_group_id group;
+} __itt_api_info_20101001;
+
+typedef struct ___itt_api_info
+{
+ const char* name;
+ void** func_ptr;
+ void* init_func;
+ void* null_func;
+ __itt_group_id group;
+} __itt_api_info;
+
+typedef struct __itt_counter_info
+{
+ const char* nameA; /*!< Copy of original name in ASCII. */
+#if defined(UNICODE) || defined(_UNICODE)
+ const wchar_t* nameW; /*!< Copy of original name in UNICODE. */
+#else /* UNICODE || _UNICODE */
+ void* nameW;
+#endif /* UNICODE || _UNICODE */
+ const char* domainA; /*!< Copy of original name in ASCII. */
+#if defined(UNICODE) || defined(_UNICODE)
+ const wchar_t* domainW; /*!< Copy of original name in UNICODE. */
+#else /* UNICODE || _UNICODE */
+ void* domainW;
+#endif /* UNICODE || _UNICODE */
+ int type;
+ long index;
+ int extra1; /*!< Reserved to the runtime */
+ void* extra2; /*!< Reserved to the runtime */
+ struct __itt_counter_info* next;
+} __itt_counter_info_t;
+
+struct ___itt_domain;
+struct ___itt_string_handle;
+
+typedef struct ___itt_global
+{
+ unsigned char magic[8];
+ unsigned long version_major;
+ unsigned long version_minor;
+ unsigned long version_build;
+ volatile long api_initialized;
+ volatile long mutex_initialized;
+ volatile long atomic_counter;
+ mutex_t mutex;
+ lib_t lib;
+ void* error_handler;
+ const char** dll_path_ptr;
+ __itt_api_info* api_list_ptr;
+ struct ___itt_global* next;
+ /* Joinable structures below */
+ __itt_thread_info* thread_list;
+ struct ___itt_domain* domain_list;
+ struct ___itt_string_handle* string_list;
+ __itt_collection_state state;
+ __itt_counter_info_t* counter_list;
+ unsigned int ipt_collect_events;
+} __itt_global;
+
+#pragma pack(pop)
+
+#define NEW_THREAD_INFO_W(gptr,h,h_tail,t,s,n) { \
+ h = (__itt_thread_info*)malloc(sizeof(__itt_thread_info)); \
+ if (h != NULL) { \
+ h->tid = t; \
+ h->nameA = NULL; \
+ h->nameW = n ? _wcsdup(n) : NULL; \
+ h->state = s; \
+ h->extra1 = 0; /* reserved */ \
+ h->extra2 = NULL; /* reserved */ \
+ h->next = NULL; \
+ if (h_tail == NULL) \
+ (gptr)->thread_list = h; \
+ else \
+ h_tail->next = h; \
+ } \
+}
+
+#define NEW_THREAD_INFO_A(gptr,h,h_tail,t,s,n) { \
+ h = (__itt_thread_info*)malloc(sizeof(__itt_thread_info)); \
+ if (h != NULL) { \
+ h->tid = t; \
+ h->nameA = n ? __itt_fstrdup(n) : NULL; \
+ h->nameW = NULL; \
+ h->state = s; \
+ h->extra1 = 0; /* reserved */ \
+ h->extra2 = NULL; /* reserved */ \
+ h->next = NULL; \
+ if (h_tail == NULL) \
+ (gptr)->thread_list = h; \
+ else \
+ h_tail->next = h; \
+ } \
+}
+
+#define NEW_DOMAIN_W(gptr,h,h_tail,name) { \
+ h = (__itt_domain*)malloc(sizeof(__itt_domain)); \
+ if (h != NULL) { \
+ h->flags = 1; /* domain is enabled by default */ \
+ h->nameA = NULL; \
+ h->nameW = name ? _wcsdup(name) : NULL; \
+ h->extra1 = 0; /* reserved */ \
+ h->extra2 = NULL; /* reserved */ \
+ h->next = NULL; \
+ if (h_tail == NULL) \
+ (gptr)->domain_list = h; \
+ else \
+ h_tail->next = h; \
+ } \
+}
+
+#define NEW_DOMAIN_A(gptr,h,h_tail,name) { \
+ h = (__itt_domain*)malloc(sizeof(__itt_domain)); \
+ if (h != NULL) { \
+ h->flags = 1; /* domain is enabled by default */ \
+ h->nameA = name ? __itt_fstrdup(name) : NULL; \
+ h->nameW = NULL; \
+ h->extra1 = 0; /* reserved */ \
+ h->extra2 = NULL; /* reserved */ \
+ h->next = NULL; \
+ if (h_tail == NULL) \
+ (gptr)->domain_list = h; \
+ else \
+ h_tail->next = h; \
+ } \
+}
+
+#define NEW_STRING_HANDLE_W(gptr,h,h_tail,name) { \
+ h = (__itt_string_handle*)malloc(sizeof(__itt_string_handle)); \
+ if (h != NULL) { \
+ h->strA = NULL; \
+ h->strW = name ? _wcsdup(name) : NULL; \
+ h->extra1 = 0; /* reserved */ \
+ h->extra2 = NULL; /* reserved */ \
+ h->next = NULL; \
+ if (h_tail == NULL) \
+ (gptr)->string_list = h; \
+ else \
+ h_tail->next = h; \
+ } \
+}
+
+#define NEW_STRING_HANDLE_A(gptr,h,h_tail,name) { \
+ h = (__itt_string_handle*)malloc(sizeof(__itt_string_handle)); \
+ if (h != NULL) { \
+ h->strA = name ? __itt_fstrdup(name) : NULL; \
+ h->strW = NULL; \
+ h->extra1 = 0; /* reserved */ \
+ h->extra2 = NULL; /* reserved */ \
+ h->next = NULL; \
+ if (h_tail == NULL) \
+ (gptr)->string_list = h; \
+ else \
+ h_tail->next = h; \
+ } \
+}
+
+#define NEW_COUNTER_W(gptr,h,h_tail,name,domain,type) { \
+ h = (__itt_counter_info_t*)malloc(sizeof(__itt_counter_info_t)); \
+ if (h != NULL) { \
+ h->nameA = NULL; \
+ h->nameW = name ? _wcsdup(name) : NULL; \
+ h->domainA = NULL; \
+ h->domainW = name ? _wcsdup(domain) : NULL; \
+ h->type = type; \
+ h->index = 0; \
+ h->next = NULL; \
+ if (h_tail == NULL) \
+ (gptr)->counter_list = h; \
+ else \
+ h_tail->next = h; \
+ } \
+}
+
+#define NEW_COUNTER_A(gptr,h,h_tail,name,domain,type) { \
+ h = (__itt_counter_info_t*)malloc(sizeof(__itt_counter_info_t)); \
+ if (h != NULL) { \
+ h->nameA = name ? __itt_fstrdup(name) : NULL; \
+ h->nameW = NULL; \
+ h->domainA = domain ? __itt_fstrdup(domain) : NULL; \
+ h->domainW = NULL; \
+ h->type = type; \
+ h->index = 0; \
+ h->next = NULL; \
+ if (h_tail == NULL) \
+ (gptr)->counter_list = h; \
+ else \
+ h_tail->next = h; \
+ } \
+}
+
+#endif /* _ITTNOTIFY_CONFIG_H_ */
diff --git a/contrib/libs/tbb/src/tbb/tools_api/ittnotify_static.c b/contrib/libs/tbb/src/tbb/tools_api/ittnotify_static.c
new file mode 100644
index 0000000000..dd8ca8e755
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/tools_api/ittnotify_static.c
@@ -0,0 +1,1244 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "ittnotify_config.h"
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define PATH_MAX 512
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+#include <limits.h>
+#include <dlfcn.h>
+#include <errno.h>
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+
+#define INTEL_NO_MACRO_BODY
+#define INTEL_ITTNOTIFY_API_PRIVATE
+#include "ittnotify.h"
+#include "legacy/ittnotify.h"
+
+#include "disable_warnings.h"
+
+static const char api_version[] = API_VERSION "\0\n@(#) $Revision$\n";
+
+#define _N_(n) ITT_JOIN(INTEL_ITTNOTIFY_PREFIX,n)
+
+#if ITT_OS==ITT_OS_WIN
+static const char* ittnotify_lib_name = "libittnotify.dll";
+#elif ITT_OS==ITT_OS_LINUX || ITT_OS==ITT_OS_FREEBSD
+static const char* ittnotify_lib_name = "libittnotify.so";
+#elif ITT_OS==ITT_OS_MAC
+static const char* ittnotify_lib_name = "libittnotify.dylib";
+#else
+#error Unsupported or unknown OS.
+#endif
+
+#ifdef __ANDROID__
+#include <android/log.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <linux/limits.h>
+
+#ifdef ITT_ANDROID_LOG
+ #define ITT_ANDROID_LOG_TAG "INTEL_VTUNE_USERAPI"
+ #define ITT_ANDROID_LOGI(...) ((void)__android_log_print(ANDROID_LOG_INFO, ITT_ANDROID_LOG_TAG, __VA_ARGS__))
+ #define ITT_ANDROID_LOGW(...) ((void)__android_log_print(ANDROID_LOG_WARN, ITT_ANDROID_LOG_TAG, __VA_ARGS__))
+ #define ITT_ANDROID_LOGE(...) ((void)__android_log_print(ANDROID_LOG_ERROR,ITT_ANDROID_LOG_TAG, __VA_ARGS__))
+ #define ITT_ANDROID_LOGD(...) ((void)__android_log_print(ANDROID_LOG_DEBUG,ITT_ANDROID_LOG_TAG, __VA_ARGS__))
+#else
+ #define ITT_ANDROID_LOGI(...)
+ #define ITT_ANDROID_LOGW(...)
+ #define ITT_ANDROID_LOGE(...)
+ #define ITT_ANDROID_LOGD(...)
+#endif
+
+/* default location of userapi collector on Android */
+#define ANDROID_ITTNOTIFY_DEFAULT_PATH_MASK(x) "/data/data/com.intel.vtune/perfrun/lib" \
+ #x "/runtime/libittnotify.so"
+
+#if ITT_ARCH==ITT_ARCH_IA32 || ITT_ARCH==ITT_ARCH_ARM
+#define ANDROID_ITTNOTIFY_DEFAULT_PATH ANDROID_ITTNOTIFY_DEFAULT_PATH_MASK(32)
+#else
+#define ANDROID_ITTNOTIFY_DEFAULT_PATH ANDROID_ITTNOTIFY_DEFAULT_PATH_MASK(64)
+#endif
+
+#endif
+
+
+#ifndef LIB_VAR_NAME
+#if ITT_ARCH==ITT_ARCH_IA32 || ITT_ARCH==ITT_ARCH_ARM
+#define LIB_VAR_NAME INTEL_LIBITTNOTIFY32
+#else
+#define LIB_VAR_NAME INTEL_LIBITTNOTIFY64
+#endif
+#endif /* LIB_VAR_NAME */
+
+#define ITT_MUTEX_INIT_AND_LOCK(p) { \
+ if (PTHREAD_SYMBOLS) \
+ { \
+ if (!p.mutex_initialized) \
+ { \
+ if (__itt_interlocked_increment(&p.atomic_counter) == 1) \
+ { \
+ __itt_mutex_init(&p.mutex); \
+ p.mutex_initialized = 1; \
+ } \
+ else \
+ while (!p.mutex_initialized) \
+ __itt_thread_yield(); \
+ } \
+ __itt_mutex_lock(&p.mutex); \
+ } \
+}
+
+const int _N_(err) = 0;
+
+typedef int (__itt_init_ittlib_t)(const char*, __itt_group_id);
+
+/* This define is used to control the initialization function name. */
+#ifndef __itt_init_ittlib_name
+ITT_EXTERN_C int _N_(init_ittlib)(const char*, __itt_group_id);
+static __itt_init_ittlib_t* __itt_init_ittlib_ptr = _N_(init_ittlib);
+#define __itt_init_ittlib_name __itt_init_ittlib_ptr
+#endif /* __itt_init_ittlib_name */
+
+typedef void (__itt_fini_ittlib_t)(void);
+
+/* This define is used to control the finalization function name. */
+#ifndef __itt_fini_ittlib_name
+ITT_EXTERN_C void _N_(fini_ittlib)(void);
+static __itt_fini_ittlib_t* __itt_fini_ittlib_ptr = _N_(fini_ittlib);
+#define __itt_fini_ittlib_name __itt_fini_ittlib_ptr
+#endif /* __itt_fini_ittlib_name */
+
+extern __itt_global _N_(_ittapi_global);
+
+/* building pointers to imported funcs */
+#undef ITT_STUBV
+#undef ITT_STUB
+#define ITT_STUB(api,type,name,args,params,ptr,group,format) \
+static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args;\
+typedef type api ITT_JOIN(_N_(name),_t) args; \
+ITT_EXTERN_C_BEGIN ITT_JOIN(_N_(name),_t)* ITTNOTIFY_NAME(name) = ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)); ITT_EXTERN_C_END \
+static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args \
+{ \
+ if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list == NULL) \
+ __itt_init_ittlib_name(NULL, __itt_group_all); \
+ if (ITTNOTIFY_NAME(name) && ITTNOTIFY_NAME(name) != ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init))) \
+ return ITTNOTIFY_NAME(name) params; \
+ else \
+ return (type)0; \
+}
+
+#define ITT_STUBV(api,type,name,args,params,ptr,group,format) \
+static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args;\
+typedef type api ITT_JOIN(_N_(name),_t) args; \
+ITT_EXTERN_C_BEGIN ITT_JOIN(_N_(name),_t)* ITTNOTIFY_NAME(name) = ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)); ITT_EXTERN_C_END \
+static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args \
+{ \
+ if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list == NULL) \
+ __itt_init_ittlib_name(NULL, __itt_group_all); \
+ if (ITTNOTIFY_NAME(name) && ITTNOTIFY_NAME(name) != ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init))) \
+ ITTNOTIFY_NAME(name) params; \
+ else \
+ return; \
+}
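+
+/* Illustrative sketch (not normative): for a hypothetical entry
+ *
+ *     ITT_STUB(ITTAPI, int, foo, (int x), (x), foo, __itt_group_none, "%d")
+ *
+ * the ITT_STUB macro above roughly generates a lazy-initialization thunk, a
+ * function-pointer typedef, and a dispatch pointer initialized to the thunk:
+ *
+ *     static int ITTAPI __itt_foo_init(int x);      // lazy-init thunk
+ *     typedef int ITTAPI __itt_foo_t(int x);
+ *     __itt_foo_t* __itt_foo_ptr = __itt_foo_init;  // dispatch pointer
+ *
+ * The thunk loads the collector on first use, forwards through the dispatch
+ * pointer if it was repointed during initialization, and otherwise returns
+ * (type)0. The exact generated names depend on INTEL_ITTNOTIFY_PREFIX,
+ * ITT_VERSIONIZE and ITTNOTIFY_NAME; the names above are simplified.
+ */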
+
+#undef __ITT_INTERNAL_INIT
+#include "ittnotify_static.h"
+
+#undef ITT_STUB
+#undef ITT_STUBV
+#define ITT_STUB(api,type,name,args,params,ptr,group,format) \
+static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args;\
+typedef type api ITT_JOIN(_N_(name),_t) args; \
+ITT_EXTERN_C_BEGIN ITT_JOIN(_N_(name),_t)* ITTNOTIFY_NAME(name) = ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)); ITT_EXTERN_C_END
+
+#define ITT_STUBV(api,type,name,args,params,ptr,group,format) \
+static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args;\
+typedef type api ITT_JOIN(_N_(name),_t) args; \
+ITT_EXTERN_C_BEGIN ITT_JOIN(_N_(name),_t)* ITTNOTIFY_NAME(name) = ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)); ITT_EXTERN_C_END
+
+#define __ITT_INTERNAL_INIT
+#include "ittnotify_static.h"
+#undef __ITT_INTERNAL_INIT
+
+ITT_GROUP_LIST(group_list);
+
+#pragma pack(push, 8)
+
+typedef struct ___itt_group_alias
+{
+ const char* env_var;
+ __itt_group_id groups;
+} __itt_group_alias;
+
+static __itt_group_alias group_alias[] = {
+ { "KMP_FOR_TPROFILE", (__itt_group_id)(__itt_group_control | __itt_group_thread | __itt_group_sync | __itt_group_mark) },
+ { "KMP_FOR_TCHECK", (__itt_group_id)(__itt_group_control | __itt_group_thread | __itt_group_sync | __itt_group_fsync | __itt_group_mark | __itt_group_suppress) },
+ { NULL, (__itt_group_none) },
+ { api_version, (__itt_group_none) } /* !!! Just to avoid unused code elimination !!! */
+};
+
+#pragma pack(pop)
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#pragma warning(push)
+#pragma warning(disable: 4054) /* warning C4054: 'type cast' : from function pointer 'XXX' to data pointer 'void *' */
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+static __itt_api_info api_list[] = {
+/* Define functions with static implementation */
+#undef ITT_STUB
+#undef ITT_STUBV
+#define ITT_STUB(api,type,name,args,params,nameindll,group,format) { ITT_TO_STR(ITT_JOIN(__itt_,nameindll)), (void**)(void*)&ITTNOTIFY_NAME(name), (void*)(size_t)&ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)), (void*)(size_t)&ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)), (__itt_group_id)(group)},
+#define ITT_STUBV ITT_STUB
+#define __ITT_INTERNAL_INIT
+#include "ittnotify_static.h"
+#undef __ITT_INTERNAL_INIT
+/* Define functions without static implementation */
+#undef ITT_STUB
+#undef ITT_STUBV
+#define ITT_STUB(api,type,name,args,params,nameindll,group,format) {ITT_TO_STR(ITT_JOIN(__itt_,nameindll)), (void**)(void*)&ITTNOTIFY_NAME(name), (void*)(size_t)&ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)), NULL, (__itt_group_id)(group)},
+#define ITT_STUBV ITT_STUB
+#include "ittnotify_static.h"
+ {NULL, NULL, NULL, NULL, __itt_group_none}
+};
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#pragma warning(pop)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+static const char dll_path[PATH_MAX] = { 0 };
+
+/* static part descriptor which handles all notification API attributes. */
+__itt_global _N_(_ittapi_global) = {
+ ITT_MAGIC, /* identification info */
+ ITT_MAJOR, ITT_MINOR, API_VERSION_BUILD, /* version info */
+ 0, /* api_initialized */
+ 0, /* mutex_initialized */
+ 0, /* atomic_counter */
+ MUTEX_INITIALIZER, /* mutex */
+ NULL, /* dynamic library handle */
+ NULL, /* error_handler */
+ (const char**)&dll_path, /* dll_path_ptr */
+ (__itt_api_info*)&api_list, /* api_list_ptr */
+ NULL, /* next __itt_global */
+ NULL, /* thread_list */
+ NULL, /* domain_list */
+ NULL, /* string_list */
+ __itt_collection_normal, /* collection state */
+ NULL, /* counter_list */
+ 0 /* ipt_collect_events */
+};
+
+typedef void (__itt_api_init_t)(__itt_global*, __itt_group_id);
+typedef void (__itt_api_fini_t)(__itt_global*);
+
+/* ========================================================================= */
+
+#ifdef ITT_NOTIFY_EXT_REPORT
+ITT_EXTERN_C void _N_(error_handler)(__itt_error_code, va_list args);
+#endif /* ITT_NOTIFY_EXT_REPORT */
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#pragma warning(push)
+#pragma warning(disable: 4055) /* warning C4055: 'type cast' : from data pointer 'void *' to function pointer 'XXX' */
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+static void __itt_report_error_impl(int code, ...) {
+ va_list args;
+ va_start(args, code);
+ if (_N_(_ittapi_global).error_handler != NULL)
+ {
+ __itt_error_handler_t* handler = (__itt_error_handler_t*)(size_t)_N_(_ittapi_global).error_handler;
+ handler((__itt_error_code)code, args);
+ }
+#ifdef ITT_NOTIFY_EXT_REPORT
+ _N_(error_handler)(code, args);
+#endif /* ITT_NOTIFY_EXT_REPORT */
+ va_end(args);
+}
+
+// va_start cannot take an enum (__itt_error_code) on clang, so it is necessary to convert it to int
+#define __itt_report_error(code, ...) \
+ __itt_report_error_impl((int)code,__VA_ARGS__)
+
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#pragma warning(pop)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+static __itt_domain* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(domain_createW),_init))(const wchar_t* name)
+{
+ __itt_domain *h_tail = NULL, *h = NULL;
+
+ if (name == NULL)
+ {
+ return NULL;
+ }
+
+ ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
+ if (_N_(_ittapi_global).api_initialized)
+ {
+ if (ITTNOTIFY_NAME(domain_createW) && ITTNOTIFY_NAME(domain_createW) != ITT_VERSIONIZE(ITT_JOIN(_N_(domain_createW),_init)))
+ {
+ __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ return ITTNOTIFY_NAME(domain_createW)(name);
+ }
+ }
+ for (h_tail = NULL, h = _N_(_ittapi_global).domain_list; h != NULL; h_tail = h, h = h->next)
+ {
+ if (h->nameW != NULL && !wcscmp(h->nameW, name)) break;
+ }
+ if (h == NULL)
+ {
+ NEW_DOMAIN_W(&_N_(_ittapi_global),h,h_tail,name);
+ }
+ if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ return h;
+}
+
+static __itt_domain* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(domain_createA),_init))(const char* name)
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+static __itt_domain* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(domain_create),_init))(const char* name)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+{
+ __itt_domain *h_tail = NULL, *h = NULL;
+
+ if (name == NULL)
+ {
+ return NULL;
+ }
+
+ ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
+ if (_N_(_ittapi_global).api_initialized)
+ {
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ if (ITTNOTIFY_NAME(domain_createA) && ITTNOTIFY_NAME(domain_createA) != ITT_VERSIONIZE(ITT_JOIN(_N_(domain_createA),_init)))
+ {
+ __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ return ITTNOTIFY_NAME(domain_createA)(name);
+ }
+#else
+ if (ITTNOTIFY_NAME(domain_create) && ITTNOTIFY_NAME(domain_create) != ITT_VERSIONIZE(ITT_JOIN(_N_(domain_create),_init)))
+ {
+ if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ return ITTNOTIFY_NAME(domain_create)(name);
+ }
+#endif
+ }
+ for (h_tail = NULL, h = _N_(_ittapi_global).domain_list; h != NULL; h_tail = h, h = h->next)
+ {
+ if (h->nameA != NULL && !__itt_fstrcmp(h->nameA, name)) break;
+ }
+ if (h == NULL)
+ {
+ NEW_DOMAIN_A(&_N_(_ittapi_global),h,h_tail,name);
+ }
+ if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ return h;
+}
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+static __itt_string_handle* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_createW),_init))(const wchar_t* name)
+{
+ __itt_string_handle *h_tail = NULL, *h = NULL;
+
+ if (name == NULL)
+ {
+ return NULL;
+ }
+
+ ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
+ if (_N_(_ittapi_global).api_initialized)
+ {
+ if (ITTNOTIFY_NAME(string_handle_createW) && ITTNOTIFY_NAME(string_handle_createW) != ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_createW),_init)))
+ {
+ __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ return ITTNOTIFY_NAME(string_handle_createW)(name);
+ }
+ }
+ for (h_tail = NULL, h = _N_(_ittapi_global).string_list; h != NULL; h_tail = h, h = h->next)
+ {
+ if (h->strW != NULL && !wcscmp(h->strW, name)) break;
+ }
+ if (h == NULL)
+ {
+ NEW_STRING_HANDLE_W(&_N_(_ittapi_global),h,h_tail,name);
+ }
+ __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ return h;
+}
+
+static __itt_string_handle* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_createA),_init))(const char* name)
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+static __itt_string_handle* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_create),_init))(const char* name)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+{
+ __itt_string_handle *h_tail = NULL, *h = NULL;
+
+ if (name == NULL)
+ {
+ return NULL;
+ }
+
+ ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
+ if (_N_(_ittapi_global).api_initialized)
+ {
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ if (ITTNOTIFY_NAME(string_handle_createA) && ITTNOTIFY_NAME(string_handle_createA) != ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_createA),_init)))
+ {
+ __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ return ITTNOTIFY_NAME(string_handle_createA)(name);
+ }
+#else
+ if (ITTNOTIFY_NAME(string_handle_create) && ITTNOTIFY_NAME(string_handle_create) != ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_create),_init)))
+ {
+ if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ return ITTNOTIFY_NAME(string_handle_create)(name);
+ }
+#endif
+ }
+ for (h_tail = NULL, h = _N_(_ittapi_global).string_list; h != NULL; h_tail = h, h = h->next)
+ {
+ if (h->strA != NULL && !__itt_fstrcmp(h->strA, name)) break;
+ }
+ if (h == NULL)
+ {
+ NEW_STRING_HANDLE_A(&_N_(_ittapi_global),h,h_tail,name);
+ }
+ if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ return h;
+}
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_createW),_init))(const wchar_t *name, const wchar_t *domain)
+{
+ __itt_counter_info_t *h_tail = NULL, *h = NULL;
+ __itt_metadata_type type = __itt_metadata_u64;
+
+ if (name == NULL)
+ {
+ return NULL;
+ }
+
+ ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
+ if (_N_(_ittapi_global).api_initialized)
+ {
+ if (ITTNOTIFY_NAME(counter_createW) && ITTNOTIFY_NAME(counter_createW) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_createW),_init)))
+ {
+ __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ return ITTNOTIFY_NAME(counter_createW)(name, domain);
+ }
+ }
+ for (h_tail = NULL, h = _N_(_ittapi_global).counter_list; h != NULL; h_tail = h, h = h->next)
+ {
+ if (h->nameW != NULL && h->type == type && !wcscmp(h->nameW, name) && ((h->domainW == NULL && domain == NULL) ||
+ (h->domainW != NULL && domain != NULL && !wcscmp(h->domainW, domain)))) break;
+
+ }
+ if (h == NULL)
+ {
+ NEW_COUNTER_W(&_N_(_ittapi_global),h,h_tail,name,domain,type);
+ }
+ __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ return (__itt_counter)h;
+}
+
+static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_createA),_init))(const char *name, const char *domain)
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create),_init))(const char *name, const char *domain)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+{
+ __itt_counter_info_t *h_tail = NULL, *h = NULL;
+ __itt_metadata_type type = __itt_metadata_u64;
+
+ if (name == NULL)
+ {
+ return NULL;
+ }
+
+ ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
+ if (_N_(_ittapi_global).api_initialized)
+ {
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ if (ITTNOTIFY_NAME(counter_createA) && ITTNOTIFY_NAME(counter_createA) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_createA),_init)))
+ {
+ __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ return ITTNOTIFY_NAME(counter_createA)(name, domain);
+ }
+#else
+ if (ITTNOTIFY_NAME(counter_create) && ITTNOTIFY_NAME(counter_create) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create),_init)))
+ {
+ if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ return ITTNOTIFY_NAME(counter_create)(name, domain);
+ }
+#endif
+ }
+ for (h_tail = NULL, h = _N_(_ittapi_global).counter_list; h != NULL; h_tail = h, h = h->next)
+ {
+ if (h->nameA != NULL && h->type == type && !__itt_fstrcmp(h->nameA, name) && ((h->domainA == NULL && domain == NULL) ||
+ (h->domainA != NULL && domain != NULL && !__itt_fstrcmp(h->domainA, domain)))) break;
+ }
+ if (h == NULL)
+ {
+ NEW_COUNTER_A(&_N_(_ittapi_global),h,h_tail,name,domain,type);
+ }
+ if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ return (__itt_counter)h;
+}
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_typedW),_init))(const wchar_t *name, const wchar_t *domain, __itt_metadata_type type)
+{
+ __itt_counter_info_t *h_tail = NULL, *h = NULL;
+
+ if (name == NULL)
+ {
+ return NULL;
+ }
+
+ ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
+ if (_N_(_ittapi_global).api_initialized)
+ {
+ if (ITTNOTIFY_NAME(counter_create_typedW) && ITTNOTIFY_NAME(counter_create_typedW) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_typedW),_init)))
+ {
+ __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ return ITTNOTIFY_NAME(counter_create_typedW)(name, domain, type);
+ }
+ }
+ for (h_tail = NULL, h = _N_(_ittapi_global).counter_list; h != NULL; h_tail = h, h = h->next)
+ {
+ if (h->nameW != NULL && h->type == type && !wcscmp(h->nameW, name) && ((h->domainW == NULL && domain == NULL) ||
+ (h->domainW != NULL && domain != NULL && !wcscmp(h->domainW, domain)))) break;
+
+ }
+ if (h == NULL)
+ {
+ NEW_COUNTER_W(&_N_(_ittapi_global),h,h_tail,name,domain,type);
+ }
+ __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ return (__itt_counter)h;
+}
+
+static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_typedA),_init))(const char *name, const char *domain, __itt_metadata_type type)
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_typed),_init))(const char *name, const char *domain, __itt_metadata_type type)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+{
+ __itt_counter_info_t *h_tail = NULL, *h = NULL;
+
+ if (name == NULL)
+ {
+ return NULL;
+ }
+
+ ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
+ if (_N_(_ittapi_global).api_initialized)
+ {
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ if (ITTNOTIFY_NAME(counter_create_typedA) && ITTNOTIFY_NAME(counter_create_typedA) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_typedA),_init)))
+ {
+ __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ return ITTNOTIFY_NAME(counter_create_typedA)(name, domain, type);
+ }
+#else
+ if (ITTNOTIFY_NAME(counter_create_typed) && ITTNOTIFY_NAME(counter_create_typed) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_typed),_init)))
+ {
+ if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ return ITTNOTIFY_NAME(counter_create_typed)(name, domain, type);
+ }
+#endif
+ }
+ for (h_tail = NULL, h = _N_(_ittapi_global).counter_list; h != NULL; h_tail = h, h = h->next)
+ {
+ if (h->nameA != NULL && h->type == type && !__itt_fstrcmp(h->nameA, name) && ((h->domainA == NULL && domain == NULL) ||
+ (h->domainA != NULL && domain != NULL && !__itt_fstrcmp(h->domainA, domain)))) break;
+ }
+ if (h == NULL)
+ {
+ NEW_COUNTER_A(&_N_(_ittapi_global),h,h_tail,name,domain,type);
+ }
+ if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ return (__itt_counter)h;
+}
+
+/* -------------------------------------------------------------------------- */
+
+static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(pause),_init))(void)
+{
+ if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list == NULL)
+ {
+ __itt_init_ittlib_name(NULL, __itt_group_all);
+ }
+ if (ITTNOTIFY_NAME(pause) && ITTNOTIFY_NAME(pause) != ITT_VERSIONIZE(ITT_JOIN(_N_(pause),_init)))
+ {
+ ITTNOTIFY_NAME(pause)();
+ }
+ else
+ {
+ _N_(_ittapi_global).state = __itt_collection_paused;
+ }
+}
+
+static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(resume),_init))(void)
+{
+ if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list == NULL)
+ {
+ __itt_init_ittlib_name(NULL, __itt_group_all);
+ }
+ if (ITTNOTIFY_NAME(resume) && ITTNOTIFY_NAME(resume) != ITT_VERSIONIZE(ITT_JOIN(_N_(resume),_init)))
+ {
+ ITTNOTIFY_NAME(resume)();
+ }
+ else
+ {
+ _N_(_ittapi_global).state = __itt_collection_normal;
+ }
+}
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameW),_init))(const wchar_t* name)
+{
+ if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list == NULL)
+ {
+ __itt_init_ittlib_name(NULL, __itt_group_all);
+ }
+ if (ITTNOTIFY_NAME(thread_set_nameW) && ITTNOTIFY_NAME(thread_set_nameW) != ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameW),_init)))
+ {
+ ITTNOTIFY_NAME(thread_set_nameW)(name);
+ }
+}
+
+static int ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thr_name_setW),_init))(const wchar_t* name, int namelen)
+{
+ (void)namelen;
+ ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameW),_init))(name);
+ return 0;
+}
+
+static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameA),_init))(const char* name)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_name),_init))(const char* name)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+{
+ if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list == NULL)
+ {
+ __itt_init_ittlib_name(NULL, __itt_group_all);
+ }
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ if (ITTNOTIFY_NAME(thread_set_nameA) && ITTNOTIFY_NAME(thread_set_nameA) != ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameA),_init)))
+ {
+ ITTNOTIFY_NAME(thread_set_nameA)(name);
+ }
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ if (ITTNOTIFY_NAME(thread_set_name) && ITTNOTIFY_NAME(thread_set_name) != ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_name),_init)))
+ {
+ ITTNOTIFY_NAME(thread_set_name)(name);
+ }
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+}
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+static int ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thr_name_setA),_init))(const char* name, int namelen)
+{
+ (void)namelen;
+ ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameA),_init))(name);
+ return 0;
+}
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+static int ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thr_name_set),_init))(const char* name, int namelen)
+{
+ (void)namelen;
+ ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_name),_init))(name);
+ return 0;
+}
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thread_ignore),_init))(void)
+{
+ if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list == NULL)
+ {
+ __itt_init_ittlib_name(NULL, __itt_group_all);
+ }
+ if (ITTNOTIFY_NAME(thread_ignore) && ITTNOTIFY_NAME(thread_ignore) != ITT_VERSIONIZE(ITT_JOIN(_N_(thread_ignore),_init)))
+ {
+ ITTNOTIFY_NAME(thread_ignore)();
+ }
+}
+
+static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thr_ignore),_init))(void)
+{
+ ITT_VERSIONIZE(ITT_JOIN(_N_(thread_ignore),_init))();
+}
+
+static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(enable_attach),_init))(void)
+{
+#ifdef __ANDROID__
+    /*
+     * If the LIB_VAR_NAME environment variable was already set, keep its
+     * previous value; otherwise set the default path.
+     */
+ setenv(ITT_TO_STR(LIB_VAR_NAME), ANDROID_ITTNOTIFY_DEFAULT_PATH, 0);
+#endif
+}
+
+/* -------------------------------------------------------------------------- */
+
+static const char* __itt_fsplit(const char* s, const char* sep, const char** out, int* len)
+{
+ int i;
+ int j;
+
+ if (!s || !sep || !out || !len)
+ return NULL;
+
+ for (i = 0; s[i]; i++)
+ {
+ int b = 0;
+ for (j = 0; sep[j]; j++)
+ if (s[i] == sep[j])
+ {
+ b = 1;
+ break;
+ }
+ if (!b)
+ break;
+ }
+
+ if (!s[i])
+ return NULL;
+
+ *len = 0;
+ *out = &s[i];
+
+ for (; s[i]; i++, (*len)++)
+ {
+ int b = 0;
+ for (j = 0; sep[j]; j++)
+ if (s[i] == sep[j])
+ {
+ b = 1;
+ break;
+ }
+ if (b)
+ break;
+ }
+
+ for (; s[i]; i++)
+ {
+ int b = 0;
+ for (j = 0; sep[j]; j++)
+ if (s[i] == sep[j])
+ {
+ b = 1;
+ break;
+ }
+ if (!b)
+ break;
+ }
+
+ return &s[i];
+}
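+
+/* Illustrative behavior (not part of the upstream sources): __itt_fsplit() is a small
+ * tokenizer -- it skips leading separator characters, reports the next token through
+ * *out/*len, and returns a pointer positioned at the start of the following token
+ * (or NULL when the input is exhausted).  For example, calling it repeatedly on
+ * "foo,bar baz" with sep ",; " would yield the chunks "foo", "bar" and "baz". */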
+
+/* This function returns the value of an environment variable, placed into a static buffer.
+ * !!! The same static buffer is reused for subsequent calls. !!!
+ * This avoids dynamic allocation; the function is actually needed only four times.
+ */
+static const char* __itt_get_env_var(const char* name)
+{
+#define MAX_ENV_VALUE_SIZE 4086
+ static char env_buff[MAX_ENV_VALUE_SIZE];
+ static char* env_value = (char*)env_buff;
+
+ if (name != NULL)
+ {
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ size_t max_len = MAX_ENV_VALUE_SIZE - (size_t)(env_value - env_buff);
+ DWORD rc = GetEnvironmentVariableA(name, env_value, (DWORD)max_len);
+ if (rc >= max_len)
+ __itt_report_error(__itt_error_env_too_long, name, (size_t)rc - 1, (size_t)(max_len - 1));
+ else if (rc > 0)
+ {
+ const char* ret = (const char*)env_value;
+ env_value += rc + 1;
+ return ret;
+ }
+ else
+ {
+            /* If the environment variable is empty, GetEnvironmentVariableA()
+             * returns zero (the number of characters, not including the terminating null),
+             * and GetLastError() returns ERROR_SUCCESS. */
+ DWORD err = GetLastError();
+ if (err == ERROR_SUCCESS)
+ return env_value;
+
+ if (err != ERROR_ENVVAR_NOT_FOUND)
+ __itt_report_error(__itt_error_cant_read_env, name, (int)err);
+ }
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ char* env = getenv(name);
+ if (env != NULL)
+ {
+ size_t len = __itt_fstrnlen(env, MAX_ENV_VALUE_SIZE);
+ size_t max_len = MAX_ENV_VALUE_SIZE - (size_t)(env_value - env_buff);
+ if (len < max_len)
+ {
+ const char* ret = (const char*)env_value;
+ __itt_fstrcpyn(env_value, max_len, env, len + 1);
+ env_value += len + 1;
+ return ret;
+ } else
+ __itt_report_error(__itt_error_env_too_long, name, (size_t)len, (size_t)(max_len - 1));
+ }
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ }
+ return NULL;
+}
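+
+/* Note (summary, not part of the upstream sources): every successful lookup advances
+ * env_value, so previously returned strings stay valid while the 4086-byte buffer is
+ * consumed monotonically and never reclaimed; this is only safe because the function
+ * is called just a handful of times during initialization. */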
+
+static const char* __itt_get_lib_name(void)
+{
+ const char* lib_name = __itt_get_env_var(ITT_TO_STR(LIB_VAR_NAME));
+
+#ifdef __ANDROID__
+ if (lib_name == NULL)
+ {
+
+#if ITT_ARCH==ITT_ARCH_IA32 || ITT_ARCH==ITT_ARCH_ARM
+ const char* const marker_filename = "com.intel.itt.collector_lib_32";
+#else
+ const char* const marker_filename = "com.intel.itt.collector_lib_64";
+#endif
+
+ char system_wide_marker_filename[PATH_MAX] = {0};
+ int itt_marker_file_fd = -1;
+ ssize_t res = 0;
+
+ res = snprintf(system_wide_marker_filename, PATH_MAX - 1, "%s%s", "/data/local/tmp/", marker_filename);
+ if (res < 0)
+ {
+ ITT_ANDROID_LOGE("Unable to concatenate marker file string.");
+ return lib_name;
+ }
+ itt_marker_file_fd = open(system_wide_marker_filename, O_RDONLY);
+
+ if (itt_marker_file_fd == -1)
+ {
+ const pid_t my_pid = getpid();
+ char cmdline_path[PATH_MAX] = {0};
+ char package_name[PATH_MAX] = {0};
+ char app_sandbox_file[PATH_MAX] = {0};
+ int cmdline_fd = 0;
+
+ ITT_ANDROID_LOGI("Unable to open system-wide marker file.");
+ res = snprintf(cmdline_path, PATH_MAX - 1, "/proc/%d/cmdline", my_pid);
+ if (res < 0)
+ {
+ ITT_ANDROID_LOGE("Unable to get cmdline path string.");
+ return lib_name;
+ }
+
+ ITT_ANDROID_LOGI("CMD file: %s\n", cmdline_path);
+ cmdline_fd = open(cmdline_path, O_RDONLY);
+ if (cmdline_fd == -1)
+ {
+ ITT_ANDROID_LOGE("Unable to open %s file!", cmdline_path);
+ return lib_name;
+ }
+ res = read(cmdline_fd, package_name, PATH_MAX - 1);
+ if (res == -1)
+ {
+ ITT_ANDROID_LOGE("Unable to read %s file!", cmdline_path);
+ res = close(cmdline_fd);
+ if (res == -1)
+ {
+ ITT_ANDROID_LOGE("Unable to close %s file!", cmdline_path);
+ }
+ return lib_name;
+ }
+ res = close(cmdline_fd);
+ if (res == -1)
+ {
+ ITT_ANDROID_LOGE("Unable to close %s file!", cmdline_path);
+ return lib_name;
+ }
+ ITT_ANDROID_LOGI("Package name: %s\n", package_name);
+ res = snprintf(app_sandbox_file, PATH_MAX - 1, "/data/data/%s/%s", package_name, marker_filename);
+ if (res < 0)
+ {
+ ITT_ANDROID_LOGE("Unable to concatenate marker file string.");
+ return lib_name;
+ }
+
+ ITT_ANDROID_LOGI("Lib marker file name: %s\n", app_sandbox_file);
+ itt_marker_file_fd = open(app_sandbox_file, O_RDONLY);
+ if (itt_marker_file_fd == -1)
+ {
+ ITT_ANDROID_LOGE("Unable to open app marker file!");
+ return lib_name;
+ }
+ }
+
+ {
+ char itt_lib_name[PATH_MAX] = {0};
+
+ res = read(itt_marker_file_fd, itt_lib_name, PATH_MAX - 1);
+ if (res == -1)
+ {
+ ITT_ANDROID_LOGE("Unable to read %s file!", itt_marker_file_fd);
+ res = close(itt_marker_file_fd);
+ if (res == -1)
+ {
+ ITT_ANDROID_LOGE("Unable to close %s file!", itt_marker_file_fd);
+ }
+ return lib_name;
+ }
+ ITT_ANDROID_LOGI("ITT Lib path: %s", itt_lib_name);
+ res = close(itt_marker_file_fd);
+ if (res == -1)
+ {
+ ITT_ANDROID_LOGE("Unable to close %s file!", itt_marker_file_fd);
+ return lib_name;
+ }
+ ITT_ANDROID_LOGI("Set env %s to %s", ITT_TO_STR(LIB_VAR_NAME), itt_lib_name);
+ res = setenv(ITT_TO_STR(LIB_VAR_NAME), itt_lib_name, 0);
+ if (res == -1)
+ {
+ ITT_ANDROID_LOGE("Unable to set env var!");
+ return lib_name;
+ }
+ lib_name = __itt_get_env_var(ITT_TO_STR(LIB_VAR_NAME));
+ ITT_ANDROID_LOGI("ITT Lib path from env: %s", lib_name);
+ }
+ }
+#endif
+
+ return lib_name;
+}
+
+/* Avoid clashes with std::min */
+#define __itt_min(a,b) ((a) < (b) ? (a) : (b))
+
+static __itt_group_id __itt_get_groups(void)
+{
+ int i;
+ __itt_group_id res = __itt_group_none;
+ const char* var_name = "INTEL_ITTNOTIFY_GROUPS";
+ const char* group_str = __itt_get_env_var(var_name);
+
+ if (group_str != NULL)
+ {
+ int len;
+ char gr[255];
+ const char* chunk;
+ while ((group_str = __itt_fsplit(group_str, ",; ", &chunk, &len)) != NULL)
+ {
+ int min_len = __itt_min(len, (int)(sizeof(gr) - 1));
+ __itt_fstrcpyn(gr, sizeof(gr) - 1, chunk, min_len);
+ gr[min_len] = 0;
+
+ for (i = 0; group_list[i].name != NULL; i++)
+ {
+ if (!__itt_fstrcmp(gr, group_list[i].name))
+ {
+ res = (__itt_group_id)(res | group_list[i].id);
+ break;
+ }
+ }
+ }
+        /* TODO: !!! Workaround for a bug that warns about an unknown group !!!
+         * Should be fixed in the new initialization scheme.
+         * For now the following groups must always be set. */
+ for (i = 0; group_list[i].id != __itt_group_none; i++)
+ if (group_list[i].id != __itt_group_all &&
+ group_list[i].id > __itt_group_splitter_min &&
+ group_list[i].id < __itt_group_splitter_max)
+ res = (__itt_group_id)(res | group_list[i].id);
+ return res;
+ }
+ else
+ {
+ for (i = 0; group_alias[i].env_var != NULL; i++)
+ if (__itt_get_env_var(group_alias[i].env_var) != NULL)
+ return group_alias[i].groups;
+ }
+
+ return res;
+}
+
+#undef __itt_min
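+
+/* Illustrative usage (not part of the upstream sources): instrumented groups can be
+ * restricted through the environment, e.g. something like
+ *     INTEL_ITTNOTIFY_GROUPS="sync,heap" ./app
+ * where the accepted names are the ones listed in group_list; unrecognized chunks are
+ * simply ignored by the matching loop above. */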
+
+static int __itt_lib_version(lib_t lib)
+{
+ if (lib == NULL)
+ return 0;
+ if (__itt_get_proc(lib, "__itt_api_init"))
+ return 2;
+ if (__itt_get_proc(lib, "__itt_api_version"))
+ return 1;
+ return 0;
+}
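+
+/* Note (summary, not part of the upstream sources): the returned version selects the
+ * initialization path in init_ittlib() below -- 2 means the collector exports
+ * __itt_api_init and fills the table itself, 1 means the pointers are resolved one by
+ * one from the library, and 0 is treated as a legacy collector (__itt_group_legacy). */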
+
+/* It is not used right now; commented out to avoid warnings.
+static void __itt_reinit_all_pointers(void)
+{
+ register int i;
+ // Fill all pointers with initial stubs
+ for (i = 0; _N_(_ittapi_global).api_list_ptr[i].name != NULL; i++)
+ *_N_(_ittapi_global).api_list_ptr[i].func_ptr = _N_(_ittapi_global).api_list_ptr[i].init_func;
+}
+*/
+
+static void __itt_nullify_all_pointers(void)
+{
+ int i;
+    /* Nullify all pointers except domain_create, string_handle_create and counter_create */
+ for (i = 0; _N_(_ittapi_global).api_list_ptr[i].name != NULL; i++)
+ *_N_(_ittapi_global).api_list_ptr[i].func_ptr = _N_(_ittapi_global).api_list_ptr[i].null_func;
+}
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#pragma warning(push)
+#pragma warning(disable: 4054) /* warning C4054: 'type cast' : from function pointer 'XXX' to data pointer 'void *' */
+#pragma warning(disable: 4055) /* warning C4055: 'type cast' : from data pointer 'void *' to function pointer 'XXX' */
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+ITT_EXTERN_C void _N_(fini_ittlib)(void)
+{
+ __itt_api_fini_t* __itt_api_fini_ptr = NULL;
+ static volatile TIDT current_thread = 0;
+
+ if (_N_(_ittapi_global).api_initialized)
+ {
+ ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
+ if (_N_(_ittapi_global).api_initialized)
+ {
+ if (current_thread == 0)
+ {
+ if (PTHREAD_SYMBOLS) current_thread = __itt_thread_id();
+ if (_N_(_ittapi_global).lib != NULL)
+ {
+ __itt_api_fini_ptr = (__itt_api_fini_t*)(size_t)__itt_get_proc(_N_(_ittapi_global).lib, "__itt_api_fini");
+ }
+ if (__itt_api_fini_ptr)
+ {
+ __itt_api_fini_ptr(&_N_(_ittapi_global));
+ }
+
+ __itt_nullify_all_pointers();
+
+                /* TODO: !!! not safe !!! unloading is not supported so far.
+ * if (_N_(_ittapi_global).lib != NULL)
+ * __itt_unload_lib(_N_(_ittapi_global).lib);
+ * _N_(_ittapi_global).lib = NULL;
+ */
+ _N_(_ittapi_global).api_initialized = 0;
+ current_thread = 0;
+ }
+ }
+ if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ }
+}
+
+ITT_EXTERN_C int _N_(init_ittlib)(const char* lib_name, __itt_group_id init_groups)
+{
+ int i;
+ __itt_group_id groups;
+#ifdef ITT_COMPLETE_GROUP
+ __itt_group_id zero_group = __itt_group_none;
+#endif /* ITT_COMPLETE_GROUP */
+ static volatile TIDT current_thread = 0;
+
+ if (!_N_(_ittapi_global).api_initialized)
+ {
+#ifndef ITT_SIMPLE_INIT
+ ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
+#endif /* ITT_SIMPLE_INIT */
+
+ if (!_N_(_ittapi_global).api_initialized)
+ {
+ if (current_thread == 0)
+ {
+ if (PTHREAD_SYMBOLS) current_thread = __itt_thread_id();
+ if (lib_name == NULL)
+ {
+ lib_name = __itt_get_lib_name();
+ }
+ groups = __itt_get_groups();
+ if (DL_SYMBOLS && (groups != __itt_group_none || lib_name != NULL))
+ {
+ _N_(_ittapi_global).lib = __itt_load_lib((lib_name == NULL) ? ittnotify_lib_name : lib_name);
+
+ if (_N_(_ittapi_global).lib != NULL)
+ {
+ __itt_api_init_t* __itt_api_init_ptr;
+ int lib_version = __itt_lib_version(_N_(_ittapi_global).lib);
+
+ switch (lib_version) {
+ case 0:
+ groups = __itt_group_legacy;
+ /* Falls through */
+ case 1:
+ /* Fill all pointers from dynamic library */
+ for (i = 0; _N_(_ittapi_global).api_list_ptr[i].name != NULL; i++)
+ {
+ if (_N_(_ittapi_global).api_list_ptr[i].group & groups & init_groups)
+ {
+ *_N_(_ittapi_global).api_list_ptr[i].func_ptr = (void*)__itt_get_proc(_N_(_ittapi_global).lib, _N_(_ittapi_global).api_list_ptr[i].name);
+ if (*_N_(_ittapi_global).api_list_ptr[i].func_ptr == NULL)
+ {
+                                    /* Restore pointers for functions with a static implementation */
+ *_N_(_ittapi_global).api_list_ptr[i].func_ptr = _N_(_ittapi_global).api_list_ptr[i].null_func;
+ __itt_report_error(__itt_error_no_symbol, lib_name, _N_(_ittapi_global).api_list_ptr[i].name);
+#ifdef ITT_COMPLETE_GROUP
+ zero_group = (__itt_group_id)(zero_group | _N_(_ittapi_global).api_list_ptr[i].group);
+#endif /* ITT_COMPLETE_GROUP */
+ }
+ }
+ else
+ *_N_(_ittapi_global).api_list_ptr[i].func_ptr = _N_(_ittapi_global).api_list_ptr[i].null_func;
+ }
+
+ if (groups == __itt_group_legacy)
+ {
+ /* Compatibility with legacy tools */
+ ITTNOTIFY_NAME(thread_ignore) = ITTNOTIFY_NAME(thr_ignore);
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ ITTNOTIFY_NAME(sync_createA) = ITTNOTIFY_NAME(sync_set_nameA);
+ ITTNOTIFY_NAME(sync_createW) = ITTNOTIFY_NAME(sync_set_nameW);
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ ITTNOTIFY_NAME(sync_create) = ITTNOTIFY_NAME(sync_set_name);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ ITTNOTIFY_NAME(sync_prepare) = ITTNOTIFY_NAME(notify_sync_prepare);
+ ITTNOTIFY_NAME(sync_cancel) = ITTNOTIFY_NAME(notify_sync_cancel);
+ ITTNOTIFY_NAME(sync_acquired) = ITTNOTIFY_NAME(notify_sync_acquired);
+ ITTNOTIFY_NAME(sync_releasing) = ITTNOTIFY_NAME(notify_sync_releasing);
+ }
+
+#ifdef ITT_COMPLETE_GROUP
+ for (i = 0; _N_(_ittapi_global).api_list_ptr[i].name != NULL; i++)
+ if (_N_(_ittapi_global).api_list_ptr[i].group & zero_group)
+ *_N_(_ittapi_global).api_list_ptr[i].func_ptr = _N_(_ittapi_global).api_list_ptr[i].null_func;
+#endif /* ITT_COMPLETE_GROUP */
+ break;
+ case 2:
+ __itt_api_init_ptr = (__itt_api_init_t*)(size_t)__itt_get_proc(_N_(_ittapi_global).lib, "__itt_api_init");
+ if (__itt_api_init_ptr)
+ __itt_api_init_ptr(&_N_(_ittapi_global), init_groups);
+ break;
+ }
+ }
+ else
+ {
+ __itt_nullify_all_pointers();
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ int error = __itt_system_error();
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ const char* error = dlerror();
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ __itt_report_error(__itt_error_no_module, lib_name, error);
+ }
+ }
+ else
+ {
+ __itt_nullify_all_pointers();
+ }
+ _N_(_ittapi_global).api_initialized = 1;
+ current_thread = 0;
+ /* !!! Just to avoid unused code elimination !!! */
+ if (__itt_fini_ittlib_ptr == _N_(fini_ittlib)) current_thread = 0;
+ }
+ }
+
+#ifndef ITT_SIMPLE_INIT
+ if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+#endif /* ITT_SIMPLE_INIT */
+ }
+
+    /* Check whether any function pointer is non-empty and belongs to init_groups */
+ for (i = 0; _N_(_ittapi_global).api_list_ptr[i].name != NULL; i++)
+ {
+ if (*_N_(_ittapi_global).api_list_ptr[i].func_ptr != _N_(_ittapi_global).api_list_ptr[i].null_func &&
+ _N_(_ittapi_global).api_list_ptr[i].group & init_groups)
+ {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+ITT_EXTERN_C __itt_error_handler_t* _N_(set_error_handler)(__itt_error_handler_t* handler)
+{
+ __itt_error_handler_t* prev = (__itt_error_handler_t*)(size_t)_N_(_ittapi_global).error_handler;
+ _N_(_ittapi_global).error_handler = (void*)(size_t)handler;
+ return prev;
+}
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#pragma warning(pop)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** The __itt_mark_pt_region functions mark a region of interest.
+ * The region parameter selects one of the regions: 0 <= region < 8. */
+
+#if defined(ITT_API_IPT_SUPPORT) && (ITT_PLATFORM==ITT_PLATFORM_WIN || ITT_PLATFORM==ITT_PLATFORM_POSIX) && !defined(__ANDROID__)
+void __itt_pt_mark(__itt_pt_region region);
+void __itt_pt_mark_event(__itt_pt_region region);
+#endif
+
+ITT_EXTERN_C void _N_(mark_pt_region_begin)(__itt_pt_region region)
+{
+#if defined(ITT_API_IPT_SUPPORT) && (ITT_PLATFORM==ITT_PLATFORM_WIN || ITT_PLATFORM==ITT_PLATFORM_POSIX) && !defined(__ANDROID__)
+ if (_N_(_ittapi_global).ipt_collect_events == 1)
+ {
+ __itt_pt_mark_event(2*region);
+ }
+ else
+ {
+ __itt_pt_mark(2*region);
+ }
+#else
+ (void)region;
+#endif
+}
+
+ITT_EXTERN_C void _N_(mark_pt_region_end)(__itt_pt_region region)
+{
+#if defined(ITT_API_IPT_SUPPORT) && (ITT_PLATFORM==ITT_PLATFORM_WIN || ITT_PLATFORM==ITT_PLATFORM_POSIX) && !defined(__ANDROID__)
+ if (_N_(_ittapi_global).ipt_collect_events == 1)
+ {
+ __itt_pt_mark_event(2*region + 1);
+ }
+ else
+ {
+ __itt_pt_mark(2*region + 1);
+ }
+#else
+ (void)region;
+#endif
+}
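+
+/* Note (summary, not part of the upstream sources): begin and end events share one mark
+ * space -- region r is encoded as PT mark 2*r on entry and 2*r + 1 on exit, which keeps
+ * the eight allowed regions (0 <= r < 8) within 16 distinct mark values. */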
+
diff --git a/contrib/libs/tbb/src/tbb/tools_api/ittnotify_static.h b/contrib/libs/tbb/src/tbb/tools_api/ittnotify_static.h
new file mode 100644
index 0000000000..67cf683880
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/tools_api/ittnotify_static.h
@@ -0,0 +1,354 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "ittnotify_config.h"
+
+#ifndef ITT_FORMAT_DEFINED
+# ifndef ITT_FORMAT
+# define ITT_FORMAT
+# endif /* ITT_FORMAT */
+# ifndef ITT_NO_PARAMS
+# define ITT_NO_PARAMS
+# endif /* ITT_NO_PARAMS */
+#endif /* ITT_FORMAT_DEFINED */
+
+/*
+ * Expected parameters for the macro:
+ * ITT_STUB(api, type, func_name, arguments, params, func_name_in_dll, group, printf_fmt)
+ */
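+
+/*
+ * Illustrative expansion (not part of the upstream sources): under the ITT_STUB
+ * definition used while building api_list in ittnotify_static.c, each line below
+ * becomes one __itt_api_info row -- roughly
+ *     { "__itt_<name>", &<global function pointer>, &<static _init stub>,
+ *       &<static _init stub or NULL>, <group> }
+ * -- while other inclusions of this header redefine ITT_STUB/ITT_STUBV to declare
+ * the prototypes or the function pointers themselves.
+ */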
+#ifdef __ITT_INTERNAL_INIT
+
+#ifndef __ITT_INTERNAL_BODY
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_domain*, domain_createA, (const char *name), (ITT_FORMAT name), domain_createA, __itt_group_structure, "\"%s\"")
+ITT_STUB(ITTAPI, __itt_domain*, domain_createW, (const wchar_t *name), (ITT_FORMAT name), domain_createW, __itt_group_structure, "\"%S\"")
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_domain*, domain_create, (const char *name), (ITT_FORMAT name), domain_create, __itt_group_structure, "\"%s\"")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_string_handle*, string_handle_createA, (const char *name), (ITT_FORMAT name), string_handle_createA, __itt_group_structure, "\"%s\"")
+ITT_STUB(ITTAPI, __itt_string_handle*, string_handle_createW, (const wchar_t *name), (ITT_FORMAT name), string_handle_createW, __itt_group_structure, "\"%S\"")
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_string_handle*, string_handle_create, (const char *name), (ITT_FORMAT name), string_handle_create, __itt_group_structure, "\"%s\"")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_counter, counter_createA, (const char *name, const char *domain), (ITT_FORMAT name, domain), counter_createA, __itt_group_counter, "\"%s\", \"%s\"")
+ITT_STUB(ITTAPI, __itt_counter, counter_createW, (const wchar_t *name, const wchar_t *domain), (ITT_FORMAT name, domain), counter_createW, __itt_group_counter, "\"%s\", \"%s\"")
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_counter, counter_create, (const char *name, const char *domain), (ITT_FORMAT name, domain), counter_create, __itt_group_counter, "\"%s\", \"%s\"")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_counter, counter_create_typedA, (const char *name, const char *domain, __itt_metadata_type type), (ITT_FORMAT name, domain, type), counter_create_typedA, __itt_group_counter, "\"%s\", \"%s\", %d")
+ITT_STUB(ITTAPI, __itt_counter, counter_create_typedW, (const wchar_t *name, const wchar_t *domain, __itt_metadata_type type), (ITT_FORMAT name, domain, type), counter_create_typedW, __itt_group_counter, "\"%s\", \"%s\", %d")
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_counter, counter_create_typed, (const char *name, const char *domain, __itt_metadata_type type), (ITT_FORMAT name, domain, type), counter_create_typed, __itt_group_counter, "\"%s\", \"%s\", %d")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+
+ITT_STUBV(ITTAPI, void, pause, (void), (ITT_NO_PARAMS), pause, __itt_group_control | __itt_group_legacy, "no args")
+ITT_STUBV(ITTAPI, void, resume, (void), (ITT_NO_PARAMS), resume, __itt_group_control | __itt_group_legacy, "no args")
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, thread_set_nameA, (const char *name), (ITT_FORMAT name), thread_set_nameA, __itt_group_thread, "\"%s\"")
+ITT_STUBV(ITTAPI, void, thread_set_nameW, (const wchar_t *name), (ITT_FORMAT name), thread_set_nameW, __itt_group_thread, "\"%S\"")
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, thread_set_name, (const char *name), (ITT_FORMAT name), thread_set_name, __itt_group_thread, "\"%s\"")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, thread_ignore, (void), (ITT_NO_PARAMS), thread_ignore, __itt_group_thread, "no args")
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(LIBITTAPI, int, thr_name_setA, (const char *name, int namelen), (ITT_FORMAT name, namelen), thr_name_setA, __itt_group_thread | __itt_group_legacy, "\"%s\", %d")
+ITT_STUB(LIBITTAPI, int, thr_name_setW, (const wchar_t *name, int namelen), (ITT_FORMAT name, namelen), thr_name_setW, __itt_group_thread | __itt_group_legacy, "\"%S\", %d")
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(LIBITTAPI, int, thr_name_set, (const char *name, int namelen), (ITT_FORMAT name, namelen), thr_name_set, __itt_group_thread | __itt_group_legacy, "\"%s\", %d")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(LIBITTAPI, void, thr_ignore, (void), (ITT_NO_PARAMS), thr_ignore, __itt_group_thread | __itt_group_legacy, "no args")
+#endif /* __ITT_INTERNAL_BODY */
+
+ITT_STUBV(ITTAPI, void, enable_attach, (void), (ITT_NO_PARAMS), enable_attach, __itt_group_all, "no args")
+
+#else /* __ITT_INTERNAL_INIT */
+
+ITT_STUBV(ITTAPI, void, detach, (void), (ITT_NO_PARAMS), detach, __itt_group_control | __itt_group_legacy, "no args")
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, sync_createA, (void *addr, const char *objtype, const char *objname, int attribute), (ITT_FORMAT addr, objtype, objname, attribute), sync_createA, __itt_group_sync | __itt_group_fsync, "%p, \"%s\", \"%s\", %x")
+ITT_STUBV(ITTAPI, void, sync_createW, (void *addr, const wchar_t *objtype, const wchar_t *objname, int attribute), (ITT_FORMAT addr, objtype, objname, attribute), sync_createW, __itt_group_sync | __itt_group_fsync, "%p, \"%S\", \"%S\", %x")
+ITT_STUBV(ITTAPI, void, sync_renameA, (void *addr, const char *name), (ITT_FORMAT addr, name), sync_renameA, __itt_group_sync | __itt_group_fsync, "%p, \"%s\"")
+ITT_STUBV(ITTAPI, void, sync_renameW, (void *addr, const wchar_t *name), (ITT_FORMAT addr, name), sync_renameW, __itt_group_sync | __itt_group_fsync, "%p, \"%S\"")
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, sync_create, (void *addr, const char *objtype, const char *objname, int attribute), (ITT_FORMAT addr, objtype, objname, attribute), sync_create, __itt_group_sync | __itt_group_fsync, "%p, \"%s\", \"%s\", %x")
+ITT_STUBV(ITTAPI, void, sync_rename, (void *addr, const char *name), (ITT_FORMAT addr, name), sync_rename, __itt_group_sync | __itt_group_fsync, "%p, \"%s\"")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, sync_destroy, (void *addr), (ITT_FORMAT addr), sync_destroy, __itt_group_sync | __itt_group_fsync, "%p")
+
+ITT_STUBV(ITTAPI, void, sync_prepare, (void* addr), (ITT_FORMAT addr), sync_prepare, __itt_group_sync, "%p")
+ITT_STUBV(ITTAPI, void, sync_cancel, (void *addr), (ITT_FORMAT addr), sync_cancel, __itt_group_sync, "%p")
+ITT_STUBV(ITTAPI, void, sync_acquired, (void *addr), (ITT_FORMAT addr), sync_acquired, __itt_group_sync, "%p")
+ITT_STUBV(ITTAPI, void, sync_releasing, (void* addr), (ITT_FORMAT addr), sync_releasing, __itt_group_sync, "%p")
+
+ITT_STUBV(ITTAPI, void, suppress_push, (unsigned int mask), (ITT_FORMAT mask), suppress_push, __itt_group_suppress, "%p")
+ITT_STUBV(ITTAPI, void, suppress_pop, (void), (ITT_NO_PARAMS), suppress_pop, __itt_group_suppress, "no args")
+ITT_STUBV(ITTAPI, void, suppress_mark_range, (__itt_suppress_mode_t mode, unsigned int mask, void * address, size_t size),(ITT_FORMAT mode, mask, address, size), suppress_mark_range, __itt_group_suppress, "%d, %p, %p, %d")
+ITT_STUBV(ITTAPI, void, suppress_clear_range,(__itt_suppress_mode_t mode, unsigned int mask, void * address, size_t size),(ITT_FORMAT mode, mask, address, size), suppress_clear_range,__itt_group_suppress, "%d, %p, %p, %d")
+
+ITT_STUBV(ITTAPI, void, fsync_prepare, (void* addr), (ITT_FORMAT addr), sync_prepare, __itt_group_fsync, "%p")
+ITT_STUBV(ITTAPI, void, fsync_cancel, (void *addr), (ITT_FORMAT addr), sync_cancel, __itt_group_fsync, "%p")
+ITT_STUBV(ITTAPI, void, fsync_acquired, (void *addr), (ITT_FORMAT addr), sync_acquired, __itt_group_fsync, "%p")
+ITT_STUBV(ITTAPI, void, fsync_releasing, (void* addr), (ITT_FORMAT addr), sync_releasing, __itt_group_fsync, "%p")
+
+ITT_STUBV(ITTAPI, void, model_site_begin, (__itt_model_site *site, __itt_model_site_instance *instance, const char *name), (ITT_FORMAT site, instance, name), model_site_begin, __itt_group_model, "%p, %p, \"%s\"")
+ITT_STUBV(ITTAPI, void, model_site_end, (__itt_model_site *site, __itt_model_site_instance *instance), (ITT_FORMAT site, instance), model_site_end, __itt_group_model, "%p, %p")
+ITT_STUBV(ITTAPI, void, model_task_begin, (__itt_model_task *task, __itt_model_task_instance *instance, const char *name), (ITT_FORMAT task, instance, name), model_task_begin, __itt_group_model, "%p, %p, \"%s\"")
+ITT_STUBV(ITTAPI, void, model_task_end, (__itt_model_task *task, __itt_model_task_instance *instance), (ITT_FORMAT task, instance), model_task_end, __itt_group_model, "%p, %p")
+ITT_STUBV(ITTAPI, void, model_lock_acquire, (void *lock), (ITT_FORMAT lock), model_lock_acquire, __itt_group_model, "%p")
+ITT_STUBV(ITTAPI, void, model_lock_release, (void *lock), (ITT_FORMAT lock), model_lock_release, __itt_group_model, "%p")
+ITT_STUBV(ITTAPI, void, model_record_allocation, (void *addr, size_t size), (ITT_FORMAT addr, size), model_record_allocation, __itt_group_model, "%p, %d")
+ITT_STUBV(ITTAPI, void, model_record_deallocation, (void *addr), (ITT_FORMAT addr), model_record_deallocation, __itt_group_model, "%p")
+ITT_STUBV(ITTAPI, void, model_induction_uses, (void* addr, size_t size), (ITT_FORMAT addr, size), model_induction_uses, __itt_group_model, "%p, %d")
+ITT_STUBV(ITTAPI, void, model_reduction_uses, (void* addr, size_t size), (ITT_FORMAT addr, size), model_reduction_uses, __itt_group_model, "%p, %d")
+ITT_STUBV(ITTAPI, void, model_observe_uses, (void* addr, size_t size), (ITT_FORMAT addr, size), model_observe_uses, __itt_group_model, "%p, %d")
+ITT_STUBV(ITTAPI, void, model_clear_uses, (void* addr), (ITT_FORMAT addr), model_clear_uses, __itt_group_model, "%p")
+
+#ifndef __ITT_INTERNAL_BODY
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, model_site_beginW, (const wchar_t *name), (ITT_FORMAT name), model_site_beginW, __itt_group_model, "\"%s\"")
+ITT_STUBV(ITTAPI, void, model_task_beginW, (const wchar_t *name), (ITT_FORMAT name), model_task_beginW, __itt_group_model, "\"%s\"")
+ITT_STUBV(ITTAPI, void, model_iteration_taskW, (const wchar_t *name), (ITT_FORMAT name), model_iteration_taskW, __itt_group_model, "\"%s\"")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, model_site_beginA, (const char *name), (ITT_FORMAT name), model_site_beginA, __itt_group_model, "\"%s\"")
+ITT_STUBV(ITTAPI, void, model_site_beginAL, (const char *name, size_t len), (ITT_FORMAT name, len), model_site_beginAL, __itt_group_model, "\"%s\", %d")
+ITT_STUBV(ITTAPI, void, model_task_beginA, (const char *name), (ITT_FORMAT name), model_task_beginA, __itt_group_model, "\"%s\"")
+ITT_STUBV(ITTAPI, void, model_task_beginAL, (const char *name, size_t len), (ITT_FORMAT name, len), model_task_beginAL, __itt_group_model, "\"%s\", %d")
+ITT_STUBV(ITTAPI, void, model_iteration_taskA, (const char *name), (ITT_FORMAT name), model_iteration_taskA, __itt_group_model, "\"%s\"")
+ITT_STUBV(ITTAPI, void, model_iteration_taskAL, (const char *name, size_t len), (ITT_FORMAT name, len), model_iteration_taskAL, __itt_group_model, "\"%s\", %d")
+ITT_STUBV(ITTAPI, void, model_site_end_2, (void), (ITT_NO_PARAMS), model_site_end_2, __itt_group_model, "no args")
+ITT_STUBV(ITTAPI, void, model_task_end_2, (void), (ITT_NO_PARAMS), model_task_end_2, __itt_group_model, "no args")
+ITT_STUBV(ITTAPI, void, model_lock_acquire_2, (void *lock), (ITT_FORMAT lock), model_lock_acquire_2, __itt_group_model, "%p")
+ITT_STUBV(ITTAPI, void, model_lock_release_2, (void *lock), (ITT_FORMAT lock), model_lock_release_2, __itt_group_model, "%p")
+ITT_STUBV(ITTAPI, void, model_aggregate_task, (size_t count), (ITT_FORMAT count), model_aggregate_task, __itt_group_model, "%d")
+ITT_STUBV(ITTAPI, void, model_disable_push, (__itt_model_disable x), (ITT_FORMAT x), model_disable_push, __itt_group_model, "%p")
+ITT_STUBV(ITTAPI, void, model_disable_pop, (void), (ITT_NO_PARAMS), model_disable_pop, __itt_group_model, "no args")
+#endif /* __ITT_INTERNAL_BODY */
+
+#ifndef __ITT_INTERNAL_BODY
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_heap_function, heap_function_createA, (const char *name, const char *domain), (ITT_FORMAT name, domain), heap_function_createA, __itt_group_heap, "\"%s\", \"%s\"")
+ITT_STUB(ITTAPI, __itt_heap_function, heap_function_createW, (const wchar_t *name, const wchar_t *domain), (ITT_FORMAT name, domain), heap_function_createW, __itt_group_heap, "\"%s\", \"%s\"")
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_heap_function, heap_function_create, (const char *name, const char *domain), (ITT_FORMAT name, domain), heap_function_create, __itt_group_heap, "\"%s\", \"%s\"")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* __ITT_INTERNAL_BODY */
+ITT_STUBV(ITTAPI, void, heap_allocate_begin, (__itt_heap_function h, size_t size, int initialized), (ITT_FORMAT h, size, initialized), heap_allocate_begin, __itt_group_heap, "%p, %lu, %d")
+ITT_STUBV(ITTAPI, void, heap_allocate_end, (__itt_heap_function h, void** addr, size_t size, int initialized), (ITT_FORMAT h, addr, size, initialized), heap_allocate_end, __itt_group_heap, "%p, %p, %lu, %d")
+ITT_STUBV(ITTAPI, void, heap_free_begin, (__itt_heap_function h, void* addr), (ITT_FORMAT h, addr), heap_free_begin, __itt_group_heap, "%p, %p")
+ITT_STUBV(ITTAPI, void, heap_free_end, (__itt_heap_function h, void* addr), (ITT_FORMAT h, addr), heap_free_end, __itt_group_heap, "%p, %p")
+ITT_STUBV(ITTAPI, void, heap_reallocate_begin, (__itt_heap_function h, void* addr, size_t new_size, int initialized), (ITT_FORMAT h, addr, new_size, initialized), heap_reallocate_begin, __itt_group_heap, "%p, %p, %lu, %d")
+ITT_STUBV(ITTAPI, void, heap_reallocate_end, (__itt_heap_function h, void* addr, void** new_addr, size_t new_size, int initialized), (ITT_FORMAT h, addr, new_addr, new_size, initialized), heap_reallocate_end, __itt_group_heap, "%p, %p, %p, %lu, %d")
+ITT_STUBV(ITTAPI, void, heap_internal_access_begin, (void), (ITT_NO_PARAMS), heap_internal_access_begin, __itt_group_heap, "no args")
+ITT_STUBV(ITTAPI, void, heap_internal_access_end, (void), (ITT_NO_PARAMS), heap_internal_access_end, __itt_group_heap, "no args")
+ITT_STUBV(ITTAPI, void, heap_record_memory_growth_begin, (void), (ITT_NO_PARAMS), heap_record_memory_growth_begin, __itt_group_heap, "no args")
+ITT_STUBV(ITTAPI, void, heap_record_memory_growth_end, (void), (ITT_NO_PARAMS), heap_record_memory_growth_end, __itt_group_heap, "no args")
+ITT_STUBV(ITTAPI, void, heap_reset_detection, (unsigned int reset_mask), (ITT_FORMAT reset_mask), heap_reset_detection, __itt_group_heap, "%u")
+ITT_STUBV(ITTAPI, void, heap_record, (unsigned int record_mask), (ITT_FORMAT record_mask), heap_record, __itt_group_heap, "%u")
+
+ITT_STUBV(ITTAPI, void, id_create, (const __itt_domain *domain, __itt_id id), (ITT_FORMAT domain, id), id_create, __itt_group_structure, "%p, %lu")
+ITT_STUBV(ITTAPI, void, id_destroy, (const __itt_domain *domain, __itt_id id), (ITT_FORMAT domain, id), id_destroy, __itt_group_structure, "%p, %lu")
+
+ITT_STUB(ITTAPI, __itt_timestamp, get_timestamp, (void), (ITT_NO_PARAMS), get_timestamp, __itt_group_structure, "no args")
+
+ITT_STUBV(ITTAPI, void, region_begin, (const __itt_domain *domain, __itt_id id, __itt_id parent, __itt_string_handle *name), (ITT_FORMAT domain, id, parent, name), region_begin, __itt_group_structure, "%p, %lu, %lu, %p")
+ITT_STUBV(ITTAPI, void, region_end, (const __itt_domain *domain, __itt_id id), (ITT_FORMAT domain, id), region_end, __itt_group_structure, "%p, %lu")
+
+#ifndef __ITT_INTERNAL_BODY
+ITT_STUBV(ITTAPI, void, frame_begin_v3, (const __itt_domain *domain, __itt_id *id), (ITT_FORMAT domain, id), frame_begin_v3, __itt_group_structure, "%p, %p")
+ITT_STUBV(ITTAPI, void, frame_end_v3, (const __itt_domain *domain, __itt_id *id), (ITT_FORMAT domain, id), frame_end_v3, __itt_group_structure, "%p, %p")
+ITT_STUBV(ITTAPI, void, frame_submit_v3, (const __itt_domain *domain, __itt_id *id, __itt_timestamp begin, __itt_timestamp end), (ITT_FORMAT domain, id, begin, end), frame_submit_v3, __itt_group_structure, "%p, %p, %lu, %lu")
+#endif /* __ITT_INTERNAL_BODY */
+
+ITT_STUBV(ITTAPI, void, task_group, (const __itt_domain *domain, __itt_id id, __itt_id parent, __itt_string_handle *name), (ITT_FORMAT domain, id, parent, name), task_group, __itt_group_structure, "%p, %lu, %lu, %p")
+
+ITT_STUBV(ITTAPI, void, task_begin, (const __itt_domain *domain, __itt_id id, __itt_id parent, __itt_string_handle *name), (ITT_FORMAT domain, id, parent, name), task_begin, __itt_group_structure, "%p, %lu, %lu, %p")
+ITT_STUBV(ITTAPI, void, task_begin_fn, (const __itt_domain *domain, __itt_id id, __itt_id parent, void* fn), (ITT_FORMAT domain, id, parent, fn), task_begin_fn, __itt_group_structure, "%p, %lu, %lu, %p")
+ITT_STUBV(ITTAPI, void, task_end, (const __itt_domain *domain), (ITT_FORMAT domain), task_end, __itt_group_structure, "%p")
+
+ITT_STUBV(ITTAPI, void, counter_inc_v3, (const __itt_domain *domain, __itt_string_handle *name), (ITT_FORMAT domain, name), counter_inc_v3, __itt_group_structure, "%p, %p")
+ITT_STUBV(ITTAPI, void, counter_inc_delta_v3, (const __itt_domain *domain, __itt_string_handle *name, unsigned long long value), (ITT_FORMAT domain, name, value), counter_inc_delta_v3, __itt_group_structure, "%p, %p, %lu")
+ITT_STUBV(ITTAPI, void, counter_dec_v3, (const __itt_domain *domain, __itt_string_handle *name), (ITT_FORMAT domain, name), counter_dec_v3, __itt_group_structure, "%p, %p")
+ITT_STUBV(ITTAPI, void, counter_dec_delta_v3, (const __itt_domain *domain, __itt_string_handle *name, unsigned long long value), (ITT_FORMAT domain, name, value), counter_dec_delta_v3, __itt_group_structure, "%p, %p, %lu")
+
+ITT_STUBV(ITTAPI, void, marker, (const __itt_domain *domain, __itt_id id, __itt_string_handle *name, __itt_scope scope), (ITT_FORMAT domain, id, name, scope), marker, __itt_group_structure, "%p, %lu, %p, %d")
+
+ITT_STUBV(ITTAPI, void, metadata_add, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, __itt_metadata_type type, size_t count, void *data), (ITT_FORMAT domain, id, key, type, count, data), metadata_add, __itt_group_structure, "%p, %lu, %p, %d, %lu, %p")
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, metadata_str_addA, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const char* data, size_t length), (ITT_FORMAT domain, id, key, data, length), metadata_str_addA, __itt_group_structure, "%p, %lu, %p, %p, %lu")
+ITT_STUBV(ITTAPI, void, metadata_str_addW, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const wchar_t* data, size_t length), (ITT_FORMAT domain, id, key, data, length), metadata_str_addW, __itt_group_structure, "%p, %lu, %p, %p, %lu")
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, metadata_str_add, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const char* data, size_t length), (ITT_FORMAT domain, id, key, data, length), metadata_str_add, __itt_group_structure, "%p, %lu, %p, %p, %lu")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+ITT_STUBV(ITTAPI, void, relation_add_to_current, (const __itt_domain *domain, __itt_relation relation, __itt_id tail), (ITT_FORMAT domain, relation, tail), relation_add_to_current, __itt_group_structure, "%p, %lu, %p")
+ITT_STUBV(ITTAPI, void, relation_add, (const __itt_domain *domain, __itt_id head, __itt_relation relation, __itt_id tail), (ITT_FORMAT domain, head, relation, tail), relation_add, __itt_group_structure, "%p, %p, %lu, %p")
+
+#ifndef __ITT_INTERNAL_BODY
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(LIBITTAPI, __itt_event, event_createA, (const char *name, int namelen), (ITT_FORMAT name, namelen), event_createA, __itt_group_mark | __itt_group_legacy, "\"%s\", %d")
+ITT_STUB(LIBITTAPI, __itt_event, event_createW, (const wchar_t *name, int namelen), (ITT_FORMAT name, namelen), event_createW, __itt_group_mark | __itt_group_legacy, "\"%S\", %d")
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(LIBITTAPI, __itt_event, event_create, (const char *name, int namelen), (ITT_FORMAT name, namelen), event_create, __itt_group_mark | __itt_group_legacy, "\"%s\", %d")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(LIBITTAPI, int, event_start, (__itt_event event), (ITT_FORMAT event), event_start, __itt_group_mark | __itt_group_legacy, "%d")
+ITT_STUB(LIBITTAPI, int, event_end, (__itt_event event), (ITT_FORMAT event), event_end, __itt_group_mark | __itt_group_legacy, "%d")
+#endif /* __ITT_INTERNAL_BODY */
+
+#ifndef __ITT_INTERNAL_BODY
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, sync_set_nameA, (void *addr, const char *objtype, const char *objname, int attribute), (ITT_FORMAT addr, objtype, objname, attribute), sync_set_nameA, __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p, \"%s\", \"%s\", %x")
+ITT_STUBV(ITTAPI, void, sync_set_nameW, (void *addr, const wchar_t *objtype, const wchar_t *objname, int attribute), (ITT_FORMAT addr, objtype, objname, attribute), sync_set_nameW, __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p, \"%S\", \"%S\", %x")
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, sync_set_name, (void *addr, const char *objtype, const char *objname, int attribute), (ITT_FORMAT addr, objtype, objname, attribute), sync_set_name, __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "p, \"%s\", \"%s\", %x")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(LIBITTAPI, int, notify_sync_nameA, (void *p, const char *objtype, int typelen, const char *objname, int namelen, int attribute), (ITT_FORMAT p, objtype, typelen, objname, namelen, attribute), notify_sync_nameA, __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p, \"%s\", %d, \"%s\", %d, %x")
+ITT_STUB(LIBITTAPI, int, notify_sync_nameW, (void *p, const wchar_t *objtype, int typelen, const wchar_t *objname, int namelen, int attribute), (ITT_FORMAT p, objtype, typelen, objname, namelen, attribute), notify_sync_nameW, __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p, \"%S\", %d, \"%S\", %d, %x")
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(LIBITTAPI, int, notify_sync_name, (void *p, const char *objtype, int typelen, const char *objname, int namelen, int attribute), (ITT_FORMAT p, objtype, typelen, objname, namelen, attribute), notify_sync_name, __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p, \"%s\", %d, \"%s\", %d, %x")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+ITT_STUBV(LIBITTAPI, void, notify_sync_prepare, (void *p), (ITT_FORMAT p), notify_sync_prepare, __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p")
+ITT_STUBV(LIBITTAPI, void, notify_sync_cancel, (void *p), (ITT_FORMAT p), notify_sync_cancel, __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p")
+ITT_STUBV(LIBITTAPI, void, notify_sync_acquired, (void *p), (ITT_FORMAT p), notify_sync_acquired, __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p")
+ITT_STUBV(LIBITTAPI, void, notify_sync_releasing, (void *p), (ITT_FORMAT p), notify_sync_releasing, __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p")
+#endif /* __ITT_INTERNAL_BODY */
+
+ITT_STUBV(LIBITTAPI, void, memory_read, (void *addr, size_t size), (ITT_FORMAT addr, size), memory_read, __itt_group_legacy, "%p, %lu")
+ITT_STUBV(LIBITTAPI, void, memory_write, (void *addr, size_t size), (ITT_FORMAT addr, size), memory_write, __itt_group_legacy, "%p, %lu")
+ITT_STUBV(LIBITTAPI, void, memory_update, (void *addr, size_t size), (ITT_FORMAT addr, size), memory_update, __itt_group_legacy, "%p, %lu")
+
+ITT_STUB(LIBITTAPI, __itt_state_t, state_get, (void), (ITT_NO_PARAMS), state_get, __itt_group_legacy, "no args")
+ITT_STUB(LIBITTAPI, __itt_state_t, state_set, (__itt_state_t s), (ITT_FORMAT s), state_set, __itt_group_legacy, "%d")
+ITT_STUB(LIBITTAPI, __itt_obj_state_t, obj_mode_set, (__itt_obj_prop_t p, __itt_obj_state_t s), (ITT_FORMAT p, s), obj_mode_set, __itt_group_legacy, "%d, %d")
+ITT_STUB(LIBITTAPI, __itt_thr_state_t, thr_mode_set, (__itt_thr_prop_t p, __itt_thr_state_t s), (ITT_FORMAT p, s), thr_mode_set, __itt_group_legacy, "%d, %d")
+
+#ifndef __ITT_INTERNAL_BODY
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_frame, frame_createA, (const char *domain), (ITT_FORMAT domain), frame_createA, __itt_group_frame, "\"%s\"")
+ITT_STUB(ITTAPI, __itt_frame, frame_createW, (const wchar_t *domain), (ITT_FORMAT domain), frame_createW, __itt_group_frame, "\"%s\"")
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_frame, frame_create, (const char *domain), (ITT_FORMAT domain), frame_create, __itt_group_frame, "\"%s\"")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_pt_region, pt_region_createA, (const char *name), (ITT_FORMAT name), pt_region_createA, __itt_group_structure, "\"%s\"")
+ITT_STUB(ITTAPI, __itt_pt_region, pt_region_createW, (const wchar_t *name), (ITT_FORMAT name), pt_region_createW, __itt_group_structure, "\"%S\"")
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_pt_region, pt_region_create, (const char *name), (ITT_FORMAT name), pt_region_create, __itt_group_structure, "\"%s\"")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* __ITT_INTERNAL_BODY */
+ITT_STUBV(ITTAPI, void, frame_begin, (__itt_frame frame), (ITT_FORMAT frame), frame_begin, __itt_group_frame, "%p")
+ITT_STUBV(ITTAPI, void, frame_end, (__itt_frame frame), (ITT_FORMAT frame), frame_end, __itt_group_frame, "%p")
+
+ITT_STUBV(ITTAPI, void, counter_destroy, (__itt_counter id), (ITT_FORMAT id), counter_destroy, __itt_group_counter, "%p")
+ITT_STUBV(ITTAPI, void, counter_inc, (__itt_counter id), (ITT_FORMAT id), counter_inc, __itt_group_counter, "%p")
+ITT_STUBV(ITTAPI, void, counter_inc_delta, (__itt_counter id, unsigned long long value), (ITT_FORMAT id, value), counter_inc_delta, __itt_group_counter, "%p, %lu")
+ITT_STUBV(ITTAPI, void, counter_dec, (__itt_counter id), (ITT_FORMAT id), counter_dec, __itt_group_counter, "%p")
+ITT_STUBV(ITTAPI, void, counter_dec_delta, (__itt_counter id, unsigned long long value), (ITT_FORMAT id, value), counter_dec_delta, __itt_group_counter, "%p, %lu")
+ITT_STUBV(ITTAPI, void, counter_set_value, (__itt_counter id, void *value_ptr), (ITT_FORMAT id, value_ptr), counter_set_value, __itt_group_counter, "%p, %p")
+ITT_STUBV(ITTAPI, void, counter_set_value_ex, (__itt_counter id, __itt_clock_domain *clock_domain, unsigned long long timestamp, void *value_ptr), (ITT_FORMAT id, clock_domain, timestamp, value_ptr), counter_set_value_ex, __itt_group_counter, "%p, %p, %llu, %p")
+
+#ifndef __ITT_INTERNAL_BODY
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_mark_type, mark_createA, (const char *name), (ITT_FORMAT name), mark_createA, __itt_group_mark, "\"%s\"")
+ITT_STUB(ITTAPI, __itt_mark_type, mark_createW, (const wchar_t *name), (ITT_FORMAT name), mark_createW, __itt_group_mark, "\"%S\"")
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_mark_type, mark_create, (const char *name), (ITT_FORMAT name), mark_create, __itt_group_mark, "\"%s\"")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* __ITT_INTERNAL_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, int, markA, (__itt_mark_type mt, const char *parameter), (ITT_FORMAT mt, parameter), markA, __itt_group_mark, "%d, \"%s\"")
+ITT_STUB(ITTAPI, int, markW, (__itt_mark_type mt, const wchar_t *parameter), (ITT_FORMAT mt, parameter), markW, __itt_group_mark, "%d, \"%S\"")
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, int, mark, (__itt_mark_type mt, const char *parameter), (ITT_FORMAT mt, parameter), mark, __itt_group_mark, "%d, \"%s\"")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, int, mark_off, (__itt_mark_type mt), (ITT_FORMAT mt), mark_off, __itt_group_mark, "%d")
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, int, mark_globalA, (__itt_mark_type mt, const char *parameter), (ITT_FORMAT mt, parameter), mark_globalA, __itt_group_mark, "%d, \"%s\"")
+ITT_STUB(ITTAPI, int, mark_globalW, (__itt_mark_type mt, const wchar_t *parameter), (ITT_FORMAT mt, parameter), mark_globalW, __itt_group_mark, "%d, \"%S\"")
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, int, mark_global, (__itt_mark_type mt, const char *parameter), (ITT_FORMAT mt, parameter), mark_global, __itt_group_mark, "%d, \"%S\"")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, int, mark_global_off, (__itt_mark_type mt), (ITT_FORMAT mt), mark_global_off, __itt_group_mark, "%d")
+
+#ifndef __ITT_INTERNAL_BODY
+ITT_STUB(ITTAPI, __itt_caller, stack_caller_create, (void), (ITT_NO_PARAMS), stack_caller_create, __itt_group_stitch, "no args")
+#endif /* __ITT_INTERNAL_BODY */
+ITT_STUBV(ITTAPI, void, stack_caller_destroy, (__itt_caller id), (ITT_FORMAT id), stack_caller_destroy, __itt_group_stitch, "%p")
+ITT_STUBV(ITTAPI, void, stack_callee_enter, (__itt_caller id), (ITT_FORMAT id), stack_callee_enter, __itt_group_stitch, "%p")
+ITT_STUBV(ITTAPI, void, stack_callee_leave, (__itt_caller id), (ITT_FORMAT id), stack_callee_leave, __itt_group_stitch, "%p")
+
+ITT_STUB(ITTAPI, __itt_clock_domain*, clock_domain_create, (__itt_get_clock_info_fn fn, void* fn_data), (ITT_FORMAT fn, fn_data), clock_domain_create, __itt_group_structure, "%p, %p")
+ITT_STUBV(ITTAPI, void, clock_domain_reset, (void), (ITT_NO_PARAMS), clock_domain_reset, __itt_group_structure, "no args")
+ITT_STUBV(ITTAPI, void, id_create_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id), (ITT_FORMAT domain, clock_domain, timestamp, id), id_create_ex, __itt_group_structure, "%p, %p, %lu, %lu")
+ITT_STUBV(ITTAPI, void, id_destroy_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id), (ITT_FORMAT domain, clock_domain, timestamp, id), id_destroy_ex, __itt_group_structure, "%p, %p, %lu, %lu")
+ITT_STUBV(ITTAPI, void, task_begin_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_id parentid, __itt_string_handle *name), (ITT_FORMAT domain, clock_domain, timestamp, id, parentid, name), task_begin_ex, __itt_group_structure, "%p, %p, %lu, %lu, %lu, %p")
+ITT_STUBV(ITTAPI, void, task_begin_fn_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_id parentid, void* fn), (ITT_FORMAT domain, clock_domain, timestamp, id, parentid, fn), task_begin_fn_ex, __itt_group_structure, "%p, %p, %lu, %lu, %lu, %p")
+ITT_STUBV(ITTAPI, void, task_end_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp), (ITT_FORMAT domain, clock_domain, timestamp), task_end_ex, __itt_group_structure, "%p, %p, %lu")
+ITT_STUBV(ITTAPI, void, task_begin_overlapped, (const __itt_domain *domain, __itt_id id, __itt_id parent, __itt_string_handle *name), (ITT_FORMAT domain, id, parent, name), task_begin_overlapped, __itt_group_structure, "%p, %lu, %lu, %p")
+ITT_STUBV(ITTAPI, void, task_begin_overlapped_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_id parentid, __itt_string_handle *name), (ITT_FORMAT domain, clock_domain, timestamp, id, parentid, name), task_begin_overlapped_ex, __itt_group_structure, "%p, %p, %lu, %lu, %lu, %p")
+ITT_STUBV(ITTAPI, void, task_end_overlapped, (const __itt_domain *domain, __itt_id id), (ITT_FORMAT domain, id), task_end_overlapped, __itt_group_structure, "%p, %lu")
+ITT_STUBV(ITTAPI, void, task_end_overlapped_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id), (ITT_FORMAT domain, clock_domain, timestamp, id), task_end_overlapped_ex, __itt_group_structure, "%p, %p, %lu, %lu")
+ITT_STUBV(ITTAPI, void, marker_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_string_handle *name, __itt_scope scope), (ITT_FORMAT domain, clock_domain, timestamp, id, name, scope), marker_ex, __itt_group_structure, "%p, %p, %lu, %lu, %p, %d")
+ITT_STUBV(ITTAPI, void, metadata_add_with_scope, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, __itt_metadata_type type, size_t count, void *data), (ITT_FORMAT domain, scope, key, type, count, data), metadata_add_with_scope, __itt_group_structure, "%p, %d, %p, %d, %lu, %p")
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, metadata_str_add_with_scopeA, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const char *data, size_t length), (ITT_FORMAT domain, scope, key, data, length), metadata_str_add_with_scopeA, __itt_group_structure, "%p, %d, %p, %p, %lu")
+ITT_STUBV(ITTAPI, void, metadata_str_add_with_scopeW, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const wchar_t *data, size_t length), (ITT_FORMAT domain, scope, key, data, length), metadata_str_add_with_scopeW, __itt_group_structure, "%p, %d, %p, %p, %lu")
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, metadata_str_add_with_scope, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const char *data, size_t length), (ITT_FORMAT domain, scope, key, data, length), metadata_str_add_with_scope, __itt_group_structure, "%p, %d, %p, %p, %lu")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, relation_add_to_current_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_relation relation, __itt_id tail), (ITT_FORMAT domain, clock_domain, timestamp, relation, tail), relation_add_to_current_ex, __itt_group_structure, "%p, %p, %lu, %d, %lu")
+ITT_STUBV(ITTAPI, void, relation_add_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id head, __itt_relation relation, __itt_id tail), (ITT_FORMAT domain, clock_domain, timestamp, head, relation, tail), relation_add_ex, __itt_group_structure, "%p, %p, %lu, %lu, %d, %lu")
+ITT_STUB(ITTAPI, __itt_track_group*, track_group_create, (__itt_string_handle* name, __itt_track_group_type track_group_type), (ITT_FORMAT name, track_group_type), track_group_create, __itt_group_structure, "%p, %d")
+ITT_STUB(ITTAPI, __itt_track*, track_create, (__itt_track_group* track_group,__itt_string_handle* name, __itt_track_type track_type), (ITT_FORMAT track_group, name, track_type), track_create, __itt_group_structure, "%p, %p, %d")
+ITT_STUBV(ITTAPI, void, set_track, (__itt_track *track), (ITT_FORMAT track), set_track, __itt_group_structure, "%p")
+
+#ifndef __ITT_INTERNAL_BODY
+ITT_STUB(ITTAPI, const char*, api_version, (void), (ITT_NO_PARAMS), api_version, __itt_group_all & ~__itt_group_legacy, "no args")
+#endif /* __ITT_INTERNAL_BODY */
+
+#ifndef __ITT_INTERNAL_BODY
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, int, av_saveA, (void *data, int rank, const int *dimensions, int type, const char *filePath, int columnOrder), (ITT_FORMAT data, rank, dimensions, type, filePath, columnOrder), av_saveA, __itt_group_arrays, "%p, %d, %p, %d, \"%s\", %d")
+ITT_STUB(ITTAPI, int, av_saveW, (void *data, int rank, const int *dimensions, int type, const wchar_t *filePath, int columnOrder), (ITT_FORMAT data, rank, dimensions, type, filePath, columnOrder), av_saveW, __itt_group_arrays, "%p, %d, %p, %d, \"%S\", %d")
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, int, av_save, (void *data, int rank, const int *dimensions, int type, const char *filePath, int columnOrder), (ITT_FORMAT data, rank, dimensions, type, filePath, columnOrder), av_save, __itt_group_arrays, "%p, %d, %p, %d, \"%s\", %d")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* __ITT_INTERNAL_BODY */
+
+#ifndef __ITT_INTERNAL_BODY
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, module_loadA, (void *start_addr, void* end_addr, const char *path), (ITT_FORMAT start_addr, end_addr, path), module_loadA, __itt_group_none, "%p, %p, %p")
+ITT_STUBV(ITTAPI, void, module_loadW, (void *start_addr, void* end_addr, const wchar_t *path), (ITT_FORMAT start_addr, end_addr, path), module_loadW, __itt_group_none, "%p, %p, %p")
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, module_load, (void *start_addr, void *end_addr, const char *path), (ITT_FORMAT start_addr, end_addr, path), module_load, __itt_group_none, "%p, %p, %p")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* __ITT_INTERNAL_BODY */
+
+
+#endif /* __ITT_INTERNAL_INIT */
diff --git a/contrib/libs/tbb/src/tbb/tools_api/ittnotify_types.h b/contrib/libs/tbb/src/tbb/tools_api/ittnotify_types.h
new file mode 100644
index 0000000000..3849452c27
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/tools_api/ittnotify_types.h
@@ -0,0 +1,73 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef _ITTNOTIFY_TYPES_H_
+#define _ITTNOTIFY_TYPES_H_
+
+typedef enum ___itt_group_id
+{
+ __itt_group_none = 0,
+ __itt_group_legacy = 1<<0,
+ __itt_group_control = 1<<1,
+ __itt_group_thread = 1<<2,
+ __itt_group_mark = 1<<3,
+ __itt_group_sync = 1<<4,
+ __itt_group_fsync = 1<<5,
+ __itt_group_jit = 1<<6,
+ __itt_group_model = 1<<7,
+ __itt_group_splitter_min = 1<<7,
+ __itt_group_counter = 1<<8,
+ __itt_group_frame = 1<<9,
+ __itt_group_stitch = 1<<10,
+ __itt_group_heap = 1<<11,
+ __itt_group_splitter_max = 1<<12,
+ __itt_group_structure = 1<<12,
+ __itt_group_suppress = 1<<13,
+ __itt_group_arrays = 1<<14,
+ __itt_group_all = -1
+} __itt_group_id;
+
+#pragma pack(push, 8)
+
+typedef struct ___itt_group_list
+{
+ __itt_group_id id;
+ const char* name;
+} __itt_group_list;
+
+#pragma pack(pop)
+
+#define ITT_GROUP_LIST(varname) \
+ static __itt_group_list varname[] = { \
+ { __itt_group_all, "all" }, \
+ { __itt_group_control, "control" }, \
+ { __itt_group_thread, "thread" }, \
+ { __itt_group_mark, "mark" }, \
+ { __itt_group_sync, "sync" }, \
+ { __itt_group_fsync, "fsync" }, \
+ { __itt_group_jit, "jit" }, \
+ { __itt_group_model, "model" }, \
+ { __itt_group_counter, "counter" }, \
+ { __itt_group_frame, "frame" }, \
+ { __itt_group_stitch, "stitch" }, \
+ { __itt_group_heap, "heap" }, \
+ { __itt_group_structure, "structure" }, \
+ { __itt_group_suppress, "suppress" }, \
+ { __itt_group_arrays, "arrays" }, \
+ { __itt_group_none, NULL } \
+ }
+
+#endif /* _ITTNOTIFY_TYPES_H_ */
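Editor's note: the group bitmask and the ITT_GROUP_LIST table above are what the static ittnotify layer uses to decide which API groups to initialize (for example, from an environment variable). A minimal sketch of that kind of resolution, assuming the tools_api directory is on the include path; parse_groups() is an illustrative helper, not part of the API:

#include <cstring>
#include "ittnotify_types.h"   // assumed include path

// Resolve a comma-separated group string such as "sync,frame" into a bitmask.
static __itt_group_id parse_groups(const char* csv) {
    ITT_GROUP_LIST(group_list);                 // expands to a static, NULL-terminated table
    int mask = __itt_group_none;
    for (const char* p = csv; p != nullptr && *p != '\0'; ) {
        const std::size_t len = std::strcspn(p, ",");
        for (int i = 0; group_list[i].name != nullptr; ++i) {
            if (std::strlen(group_list[i].name) == len &&
                std::strncmp(p, group_list[i].name, len) == 0) {
                mask |= group_list[i].id;       // "all" maps to __itt_group_all (-1): every bit set
            }
        }
        p += len;
        if (*p == ',') ++p;
    }
    return static_cast<__itt_group_id>(mask);
}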
diff --git a/contrib/libs/tbb/src/tbb/tools_api/legacy/ittnotify.h b/contrib/libs/tbb/src/tbb/tools_api/legacy/ittnotify.h
new file mode 100644
index 0000000000..b05a199d1f
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/tools_api/legacy/ittnotify.h
@@ -0,0 +1,998 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef _LEGACY_ITTNOTIFY_H_
+#define _LEGACY_ITTNOTIFY_H_
+
+/**
+ * @file
+ * @brief Legacy User API functions and types
+ */
+
+/** @cond exclude_from_documentation */
+#ifndef ITT_OS_WIN
+# define ITT_OS_WIN 1
+#endif /* ITT_OS_WIN */
+
+#ifndef ITT_OS_LINUX
+# define ITT_OS_LINUX 2
+#endif /* ITT_OS_LINUX */
+
+#ifndef ITT_OS_MAC
+# define ITT_OS_MAC 3
+#endif /* ITT_OS_MAC */
+
+#ifndef ITT_OS_FREEBSD
+# define ITT_OS_FREEBSD 4
+#endif /* ITT_OS_FREEBSD */
+
+#ifndef ITT_OS
+# if defined WIN32 || defined _WIN32
+# define ITT_OS ITT_OS_WIN
+# elif defined( __APPLE__ ) && defined( __MACH__ )
+# define ITT_OS ITT_OS_MAC
+# elif defined( __FreeBSD__ )
+# define ITT_OS ITT_OS_FREEBSD
+# else
+# define ITT_OS ITT_OS_LINUX
+# endif
+#endif /* ITT_OS */
+
+#ifndef ITT_PLATFORM_WIN
+# define ITT_PLATFORM_WIN 1
+#endif /* ITT_PLATFORM_WIN */
+
+#ifndef ITT_PLATFORM_POSIX
+# define ITT_PLATFORM_POSIX 2
+#endif /* ITT_PLATFORM_POSIX */
+
+#ifndef ITT_PLATFORM_MAC
+# define ITT_PLATFORM_MAC 3
+#endif /* ITT_PLATFORM_MAC */
+
+#ifndef ITT_PLATFORM_FREEBSD
+# define ITT_PLATFORM_FREEBSD 4
+#endif /* ITT_PLATFORM_FREEBSD */
+
+#ifndef ITT_PLATFORM
+# if ITT_OS==ITT_OS_WIN
+# define ITT_PLATFORM ITT_PLATFORM_WIN
+# elif ITT_OS==ITT_OS_MAC
+# define ITT_PLATFORM ITT_PLATFORM_MAC
+# elif ITT_OS==ITT_OS_FREEBSD
+# define ITT_PLATFORM ITT_PLATFORM_FREEBSD
+# else
+# define ITT_PLATFORM ITT_PLATFORM_POSIX
+# endif
+#endif /* ITT_PLATFORM */
+
+#if defined(_UNICODE) && !defined(UNICODE)
+#define UNICODE
+#endif
+
+#include <stddef.h>
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#include <tchar.h>
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#include <stdint.h>
+#if defined(UNICODE) || defined(_UNICODE)
+#include <wchar.h>
+#endif /* UNICODE || _UNICODE */
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+#ifndef ITTAPI_CDECL
+# if ITT_PLATFORM==ITT_PLATFORM_WIN
+# define ITTAPI_CDECL __cdecl
+# else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+# if defined _M_IX86 || defined __i386__
+# define ITTAPI_CDECL __attribute__ ((cdecl))
+# else /* _M_IX86 || __i386__ */
+#   define ITTAPI_CDECL /* supported only on x86 platform */
+# endif /* _M_IX86 || __i386__ */
+# endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* ITTAPI_CDECL */
+
+#ifndef STDCALL
+# if ITT_PLATFORM==ITT_PLATFORM_WIN
+# define STDCALL __stdcall
+# else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+# if defined _M_IX86 || defined __i386__
+# define STDCALL __attribute__ ((stdcall))
+# else /* _M_IX86 || __i386__ */
+# define STDCALL /* supported only on x86 platform */
+# endif /* _M_IX86 || __i386__ */
+# endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* STDCALL */
+
+#define ITTAPI ITTAPI_CDECL
+#define LIBITTAPI ITTAPI_CDECL
+
+/* TODO: Temporary for compatibility! */
+#define ITTAPI_CALL ITTAPI_CDECL
+#define LIBITTAPI_CALL ITTAPI_CDECL
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+/* use __forceinline (VC++ specific) */
+#define ITT_INLINE __forceinline
+#define ITT_INLINE_ATTRIBUTE /* nothing */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+/*
+ * Generally, functions are not inlined unless optimization is specified.
+ * For functions declared inline, this attribute inlines the function even
+ * if no optimization level was specified.
+ */
+#ifdef __STRICT_ANSI__
+#define ITT_INLINE static
+#define ITT_INLINE_ATTRIBUTE __attribute__((unused))
+#else /* __STRICT_ANSI__ */
+#define ITT_INLINE static inline
+#define ITT_INLINE_ATTRIBUTE __attribute__((always_inline, unused))
+#endif /* __STRICT_ANSI__ */
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+/** @endcond */
+
+/** @cond exclude_from_documentation */
+/* Helper macro for joining tokens */
+#define ITT_JOIN_AUX(p,n) p##n
+#define ITT_JOIN(p,n) ITT_JOIN_AUX(p,n)
+
+#ifdef ITT_MAJOR
+#undef ITT_MAJOR
+#endif
+#ifdef ITT_MINOR
+#undef ITT_MINOR
+#endif
+#define ITT_MAJOR 3
+#define ITT_MINOR 0
+
+/* Standard versioning of a token with major and minor version numbers */
+#define ITT_VERSIONIZE(x) \
+ ITT_JOIN(x, \
+ ITT_JOIN(_, \
+ ITT_JOIN(ITT_MAJOR, \
+ ITT_JOIN(_, ITT_MINOR))))
+
+#ifndef INTEL_ITTNOTIFY_PREFIX
+# define INTEL_ITTNOTIFY_PREFIX __itt_
+#endif /* INTEL_ITTNOTIFY_PREFIX */
+#ifndef INTEL_ITTNOTIFY_POSTFIX
+# define INTEL_ITTNOTIFY_POSTFIX _ptr_
+#endif /* INTEL_ITTNOTIFY_POSTFIX */
+
+#define ITTNOTIFY_NAME_AUX(n) ITT_JOIN(INTEL_ITTNOTIFY_PREFIX,n)
+#define ITTNOTIFY_NAME(n) ITT_VERSIONIZE(ITTNOTIFY_NAME_AUX(ITT_JOIN(n,INTEL_ITTNOTIFY_POSTFIX)))
+
+#define ITTNOTIFY_VOID(n) (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)
+#define ITTNOTIFY_DATA(n) (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)
+
+#define ITTNOTIFY_VOID_D0(n,d) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d)
+#define ITTNOTIFY_VOID_D1(n,d,x) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x)
+#define ITTNOTIFY_VOID_D2(n,d,x,y) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y)
+#define ITTNOTIFY_VOID_D3(n,d,x,y,z) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z)
+#define ITTNOTIFY_VOID_D4(n,d,x,y,z,a) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a)
+#define ITTNOTIFY_VOID_D5(n,d,x,y,z,a,b) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b)
+#define ITTNOTIFY_VOID_D6(n,d,x,y,z,a,b,c) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b,c)
+#define ITTNOTIFY_DATA_D0(n,d) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d)
+#define ITTNOTIFY_DATA_D1(n,d,x) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x)
+#define ITTNOTIFY_DATA_D2(n,d,x,y) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y)
+#define ITTNOTIFY_DATA_D3(n,d,x,y,z) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y,z)
+#define ITTNOTIFY_DATA_D4(n,d,x,y,z,a) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y,z,a)
+#define ITTNOTIFY_DATA_D5(n,d,x,y,z,a,b) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b)
+#define ITTNOTIFY_DATA_D6(n,d,x,y,z,a,b,c) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b,c)
+
+#ifdef ITT_STUB
+#undef ITT_STUB
+#endif
+#ifdef ITT_STUBV
+#undef ITT_STUBV
+#endif
+#define ITT_STUBV(api,type,name,args) \
+ typedef type (api* ITT_JOIN(ITTNOTIFY_NAME(name),_t)) args; \
+ extern ITT_JOIN(ITTNOTIFY_NAME(name),_t) ITTNOTIFY_NAME(name);
+#define ITT_STUB ITT_STUBV
+/** @endcond */
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/**
+ * @defgroup legacy Legacy API
+ * @{
+ * @}
+ */
+
+/**
+ * @defgroup legacy_control Collection Control
+ * @ingroup legacy
+ * General behavior: application continues to run, but no profiling information is being collected
+ *
+ * Pausing occurs not only for the current thread but for the whole process, as well as any spawned processes
+ * - Intel(R) Parallel Inspector and Intel(R) Inspector XE:
+ * - Does not analyze or report errors that involve memory access.
+ * - Other errors are reported as usual. Pausing data collection in
+ * Intel(R) Parallel Inspector and Intel(R) Inspector XE
+ * only pauses tracing and analyzing memory access.
+ * It does not pause tracing or analyzing threading APIs.
+ * .
+ * - Intel(R) Parallel Amplifier and Intel(R) VTune(TM) Amplifier XE:
+ * - Does continue to record when new threads are started.
+ * .
+ * - Other effects:
+ * - Possible reduction of runtime overhead.
+ * .
+ * @{
+ */
+#ifndef _ITTNOTIFY_H_
+/** @brief Pause collection */
+void ITTAPI __itt_pause(void);
+/** @brief Resume collection */
+void ITTAPI __itt_resume(void);
+/** @brief Detach collection */
+void ITTAPI __itt_detach(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, pause, (void))
+ITT_STUBV(ITTAPI, void, resume, (void))
+ITT_STUBV(ITTAPI, void, detach, (void))
+#define __itt_pause ITTNOTIFY_VOID(pause)
+#define __itt_pause_ptr ITTNOTIFY_NAME(pause)
+#define __itt_resume ITTNOTIFY_VOID(resume)
+#define __itt_resume_ptr ITTNOTIFY_NAME(resume)
+#define __itt_detach ITTNOTIFY_VOID(detach)
+#define __itt_detach_ptr ITTNOTIFY_NAME(detach)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_pause()
+#define __itt_pause_ptr 0
+#define __itt_resume()
+#define __itt_resume_ptr 0
+#define __itt_detach()
+#define __itt_detach_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_pause_ptr 0
+#define __itt_resume_ptr 0
+#define __itt_detach_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+#endif /* _ITTNOTIFY_H_ */
+/** @} legacy_control group */
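Editor's note: a minimal sketch of the collection-control calls declared above, assuming the tools_api directory is on the include path; when no tool is attached the macros collapse to cheap null-pointer checks. warm_up() and measured_phase() are placeholders, not part of the API:

#include "legacy/ittnotify.h"   // assumed include path

static void warm_up()        { /* placeholder: uninteresting start-up work */ }
static void measured_phase() { /* placeholder: the region worth profiling   */ }

int main() {
    __itt_pause();      // suspend data collection for the whole process
    warm_up();
    __itt_resume();     // collection continues from here on
    measured_phase();
    return 0;
}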
+
+/**
+ * @defgroup legacy_threads Threads
+ * @ingroup legacy
+ * Threads group
+ * @warning Legacy API
+ * @{
+ */
+/**
+ * @deprecated Legacy API
+ * @brief Set name to be associated with thread in analysis GUI.
+ * @return __itt_err upon failure (name or namelen is null, or name and namelen are mismatched)
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+int LIBITTAPI __itt_thr_name_setA(const char *name, int namelen);
+int LIBITTAPI __itt_thr_name_setW(const wchar_t *name, int namelen);
+#if defined(UNICODE) || defined(_UNICODE)
+# define __itt_thr_name_set __itt_thr_name_setW
+# define __itt_thr_name_set_ptr __itt_thr_name_setW_ptr
+#else
+# define __itt_thr_name_set __itt_thr_name_setA
+# define __itt_thr_name_set_ptr __itt_thr_name_setA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+int LIBITTAPI __itt_thr_name_set(const char *name, int namelen);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(LIBITTAPI, int, thr_name_setA, (const char *name, int namelen))
+ITT_STUB(LIBITTAPI, int, thr_name_setW, (const wchar_t *name, int namelen))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(LIBITTAPI, int, thr_name_set, (const char *name, int namelen))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_thr_name_setA ITTNOTIFY_DATA(thr_name_setA)
+#define __itt_thr_name_setA_ptr ITTNOTIFY_NAME(thr_name_setA)
+#define __itt_thr_name_setW ITTNOTIFY_DATA(thr_name_setW)
+#define __itt_thr_name_setW_ptr ITTNOTIFY_NAME(thr_name_setW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_thr_name_set ITTNOTIFY_DATA(thr_name_set)
+#define __itt_thr_name_set_ptr ITTNOTIFY_NAME(thr_name_set)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_thr_name_setA(name, namelen)
+#define __itt_thr_name_setA_ptr 0
+#define __itt_thr_name_setW(name, namelen)
+#define __itt_thr_name_setW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_thr_name_set(name, namelen)
+#define __itt_thr_name_set_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_thr_name_setA_ptr 0
+#define __itt_thr_name_setW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_thr_name_set_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @deprecated Legacy API
+ * @brief Mark current thread as ignored from this point on, for the duration of its existence.
+ */
+void LIBITTAPI __itt_thr_ignore(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(LIBITTAPI, void, thr_ignore, (void))
+#define __itt_thr_ignore ITTNOTIFY_VOID(thr_ignore)
+#define __itt_thr_ignore_ptr ITTNOTIFY_NAME(thr_ignore)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_thr_ignore()
+#define __itt_thr_ignore_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_thr_ignore_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} legacy_threads group */
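Editor's note: a short sketch of the legacy thread-naming call above. In a UNICODE Windows build __itt_thr_name_set resolves to the wide-character variant, so the char overload below assumes a non-UNICODE build:

#include <cstring>
#include "legacy/ittnotify.h"   // assumed include path

static void name_current_thread(const char* name) {
    // The legacy call takes an explicit length and returns __itt_err on failure.
    int rc = __itt_thr_name_set(name, static_cast<int>(std::strlen(name)));
    (void)rc;
}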
+
+/**
+ * @defgroup legacy_sync Synchronization
+ * @ingroup legacy
+ * Synchronization group
+ * @warning Legacy API
+ * @{
+ */
+/**
+ * @hideinitializer
+ * @brief possible value of attribute argument for sync object type
+ */
+#define __itt_attr_barrier 1
+
+/**
+ * @hideinitializer
+ * @brief possible value of attribute argument for sync object type
+ */
+#define __itt_attr_mutex 2
+
+/**
+ * @deprecated Legacy API
+ * @brief Assign a name to a sync object using char or Unicode string
+ * @param[in] addr - pointer to the sync object. You should use a real pointer to your object
+ * to make sure that the values don't clash with other object addresses
+ * @param[in] objtype - null-terminated object type string. If NULL is passed, the object will
+ * be assumed to be of generic "User Synchronization" type
+ * @param[in] objname - null-terminated object name string. If NULL, no name will be assigned
+ * to the object -- you can use the __itt_sync_rename call later to assign
+ * the name
+ * @param[in] attribute - one of [#__itt_attr_barrier, #__itt_attr_mutex] values which defines the
+ * exact semantics of how prepare/acquired/releasing calls work.
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+void ITTAPI __itt_sync_set_nameA(void *addr, const char *objtype, const char *objname, int attribute);
+void ITTAPI __itt_sync_set_nameW(void *addr, const wchar_t *objtype, const wchar_t *objname, int attribute);
+#if defined(UNICODE) || defined(_UNICODE)
+# define __itt_sync_set_name __itt_sync_set_nameW
+# define __itt_sync_set_name_ptr __itt_sync_set_nameW_ptr
+#else /* UNICODE */
+# define __itt_sync_set_name __itt_sync_set_nameA
+# define __itt_sync_set_name_ptr __itt_sync_set_nameA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+void ITTAPI __itt_sync_set_name(void *addr, const char* objtype, const char* objname, int attribute);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, sync_set_nameA, (void *addr, const char *objtype, const char *objname, int attribute))
+ITT_STUBV(ITTAPI, void, sync_set_nameW, (void *addr, const wchar_t *objtype, const wchar_t *objname, int attribute))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, sync_set_name, (void *addr, const char *objtype, const char *objname, int attribute))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_sync_set_nameA ITTNOTIFY_VOID(sync_set_nameA)
+#define __itt_sync_set_nameA_ptr ITTNOTIFY_NAME(sync_set_nameA)
+#define __itt_sync_set_nameW ITTNOTIFY_VOID(sync_set_nameW)
+#define __itt_sync_set_nameW_ptr ITTNOTIFY_NAME(sync_set_nameW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_sync_set_name ITTNOTIFY_VOID(sync_set_name)
+#define __itt_sync_set_name_ptr ITTNOTIFY_NAME(sync_set_name)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_sync_set_nameA(addr, objtype, objname, attribute)
+#define __itt_sync_set_nameA_ptr 0
+#define __itt_sync_set_nameW(addr, objtype, objname, attribute)
+#define __itt_sync_set_nameW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_sync_set_name(addr, objtype, objname, attribute)
+#define __itt_sync_set_name_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_sync_set_nameA_ptr 0
+#define __itt_sync_set_nameW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_sync_set_name_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @deprecated Legacy API
+ * @brief Assign a name and type to a sync object using char or Unicode string
+ * @param[in] addr - pointer to the sync object. You should use a real pointer to your object
+ * to make sure that the values don't clash with other object addresses
+ * @param[in] objtype - null-terminated object type string. If NULL is passed, the object will
+ * be assumed to be of generic "User Synchronization" type
+ * @param[in] objname - null-terminated object name string. If NULL, no name will be assigned
+ * to the object -- you can use the __itt_sync_rename call later to assign
+ * the name
+ * @param[in] typelen, namelen - the lengths of the objtype and objname strings, respectively
+ * @param[in] attribute - one of [#__itt_attr_barrier, #__itt_attr_mutex] values which defines the
+ * exact semantics of how prepare/acquired/releasing calls work.
+ * @return __itt_err upon failure (name or namelen is null, or name and namelen are mismatched)
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+int LIBITTAPI __itt_notify_sync_nameA(void *addr, const char *objtype, int typelen, const char *objname, int namelen, int attribute);
+int LIBITTAPI __itt_notify_sync_nameW(void *addr, const wchar_t *objtype, int typelen, const wchar_t *objname, int namelen, int attribute);
+#if defined(UNICODE) || defined(_UNICODE)
+# define __itt_notify_sync_name __itt_notify_sync_nameW
+#else
+# define __itt_notify_sync_name __itt_notify_sync_nameA
+#endif
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+int LIBITTAPI __itt_notify_sync_name(void *addr, const char *objtype, int typelen, const char *objname, int namelen, int attribute);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(LIBITTAPI, int, notify_sync_nameA, (void *addr, const char *objtype, int typelen, const char *objname, int namelen, int attribute))
+ITT_STUB(LIBITTAPI, int, notify_sync_nameW, (void *addr, const wchar_t *objtype, int typelen, const wchar_t *objname, int namelen, int attribute))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(LIBITTAPI, int, notify_sync_name, (void *addr, const char *objtype, int typelen, const char *objname, int namelen, int attribute))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_notify_sync_nameA ITTNOTIFY_DATA(notify_sync_nameA)
+#define __itt_notify_sync_nameA_ptr ITTNOTIFY_NAME(notify_sync_nameA)
+#define __itt_notify_sync_nameW ITTNOTIFY_DATA(notify_sync_nameW)
+#define __itt_notify_sync_nameW_ptr ITTNOTIFY_NAME(notify_sync_nameW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_notify_sync_name ITTNOTIFY_DATA(notify_sync_name)
+#define __itt_notify_sync_name_ptr ITTNOTIFY_NAME(notify_sync_name)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_notify_sync_nameA(addr, objtype, typelen, objname, namelen, attribute)
+#define __itt_notify_sync_nameA_ptr 0
+#define __itt_notify_sync_nameW(addr, objtype, typelen, objname, namelen, attribute)
+#define __itt_notify_sync_nameW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_notify_sync_name(addr, objtype, typelen, objname, namelen, attribute)
+#define __itt_notify_sync_name_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_notify_sync_nameA_ptr 0
+#define __itt_notify_sync_nameW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_notify_sync_name_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @deprecated Legacy API
+ * @brief Enter spin loop on user-defined sync object
+ */
+void LIBITTAPI __itt_notify_sync_prepare(void* addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(LIBITTAPI, void, notify_sync_prepare, (void *addr))
+#define __itt_notify_sync_prepare ITTNOTIFY_VOID(notify_sync_prepare)
+#define __itt_notify_sync_prepare_ptr ITTNOTIFY_NAME(notify_sync_prepare)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_notify_sync_prepare(addr)
+#define __itt_notify_sync_prepare_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_notify_sync_prepare_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @deprecated Legacy API
+ * @brief Quit spin loop without acquiring spin object
+ */
+void LIBITTAPI __itt_notify_sync_cancel(void *addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(LIBITTAPI, void, notify_sync_cancel, (void *addr))
+#define __itt_notify_sync_cancel ITTNOTIFY_VOID(notify_sync_cancel)
+#define __itt_notify_sync_cancel_ptr ITTNOTIFY_NAME(notify_sync_cancel)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_notify_sync_cancel(addr)
+#define __itt_notify_sync_cancel_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_notify_sync_cancel_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @deprecated Legacy API
+ * @brief Successful spin loop completion (sync object acquired)
+ */
+void LIBITTAPI __itt_notify_sync_acquired(void *addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(LIBITTAPI, void, notify_sync_acquired, (void *addr))
+#define __itt_notify_sync_acquired ITTNOTIFY_VOID(notify_sync_acquired)
+#define __itt_notify_sync_acquired_ptr ITTNOTIFY_NAME(notify_sync_acquired)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_notify_sync_acquired(addr)
+#define __itt_notify_sync_acquired_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_notify_sync_acquired_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @deprecated Legacy API
+ * @brief Start sync object releasing code. Called before the lock release call.
+ */
+void LIBITTAPI __itt_notify_sync_releasing(void* addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(LIBITTAPI, void, notify_sync_releasing, (void *addr))
+#define __itt_notify_sync_releasing ITTNOTIFY_VOID(notify_sync_releasing)
+#define __itt_notify_sync_releasing_ptr ITTNOTIFY_NAME(notify_sync_releasing)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_notify_sync_releasing(addr)
+#define __itt_notify_sync_releasing_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_notify_sync_releasing_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} legacy_sync group */
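Editor's note: the prepare/acquired/releasing protocol above is what tools expect to see around a user-level lock. A hedged sketch on a trivial spin lock (non-UNICODE build assumed so the char variant of __itt_sync_set_name is selected; the type and name strings are illustrative):

#include <atomic>
#include "legacy/ittnotify.h"   // assumed include path

class annotated_spin_lock {
    std::atomic_flag my_flag = ATOMIC_FLAG_INIT;
public:
    annotated_spin_lock() {
        // Register the object address once, with mutex semantics for prepare/acquired/releasing.
        __itt_sync_set_name(this, "annotated_spin_lock", "example lock", __itt_attr_mutex);
    }
    void lock() {
        __itt_notify_sync_prepare(this);     // about to enter the spin loop
        while (my_flag.test_and_set(std::memory_order_acquire)) { /* spin */ }
        __itt_notify_sync_acquired(this);    // spin loop completed successfully
    }
    void unlock() {
        __itt_notify_sync_releasing(this);   // called before the actual release
        my_flag.clear(std::memory_order_release);
    }
};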
+
+#ifndef _ITTNOTIFY_H_
+/**
+ * @defgroup legacy_events Events
+ * @ingroup legacy
+ * Events group
+ * @{
+ */
+
+/** @brief user event type */
+typedef int __itt_event;
+
+/**
+ * @brief Create an event notification
+ * @note Fails (returning __itt_err) if name or namelen is null, name and namelen do not match, or the user event feature is not enabled
+ * @return non-zero event identifier upon success and __itt_err otherwise
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+__itt_event LIBITTAPI __itt_event_createA(const char *name, int namelen);
+__itt_event LIBITTAPI __itt_event_createW(const wchar_t *name, int namelen);
+#if defined(UNICODE) || defined(_UNICODE)
+# define __itt_event_create __itt_event_createW
+# define __itt_event_create_ptr __itt_event_createW_ptr
+#else
+# define __itt_event_create __itt_event_createA
+# define __itt_event_create_ptr __itt_event_createA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+__itt_event LIBITTAPI __itt_event_create(const char *name, int namelen);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(LIBITTAPI, __itt_event, event_createA, (const char *name, int namelen))
+ITT_STUB(LIBITTAPI, __itt_event, event_createW, (const wchar_t *name, int namelen))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(LIBITTAPI, __itt_event, event_create, (const char *name, int namelen))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_event_createA ITTNOTIFY_DATA(event_createA)
+#define __itt_event_createA_ptr ITTNOTIFY_NAME(event_createA)
+#define __itt_event_createW ITTNOTIFY_DATA(event_createW)
+#define __itt_event_createW_ptr ITTNOTIFY_NAME(event_createW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_event_create ITTNOTIFY_DATA(event_create)
+#define __itt_event_create_ptr ITTNOTIFY_NAME(event_create)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_event_createA(name, namelen) (__itt_event)0
+#define __itt_event_createA_ptr 0
+#define __itt_event_createW(name, namelen) (__itt_event)0
+#define __itt_event_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_event_create(name, namelen) (__itt_event)0
+#define __itt_event_create_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_event_createA_ptr 0
+#define __itt_event_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_event_create_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Record an event occurrence.
+ * @return __itt_err upon failure (invalid event id/user event feature not enabled)
+ */
+int LIBITTAPI __itt_event_start(__itt_event event);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(LIBITTAPI, int, event_start, (__itt_event event))
+#define __itt_event_start ITTNOTIFY_DATA(event_start)
+#define __itt_event_start_ptr ITTNOTIFY_NAME(event_start)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_event_start(event) (int)0
+#define __itt_event_start_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_event_start_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Record an event end occurrence.
+ * @note It is optional if events do not have durations.
+ * @return __itt_err upon failure (invalid event id/user event feature not enabled)
+ */
+int LIBITTAPI __itt_event_end(__itt_event event);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(LIBITTAPI, int, event_end, (__itt_event event))
+#define __itt_event_end ITTNOTIFY_DATA(event_end)
+#define __itt_event_end_ptr ITTNOTIFY_NAME(event_end)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_event_end(event) (int)0
+#define __itt_event_end_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_event_end_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} legacy_events group */
+#endif /* _ITTNOTIFY_H_ */
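Editor's note: a sketch of the legacy event API declared above (again assuming a non-UNICODE build so __itt_event_create maps to the char variant); render_one_frame() and the event name are placeholders:

#include <cstring>
#include "legacy/ittnotify.h"   // assumed include path

static const __itt_event my_event =
    __itt_event_create("iteration", static_cast<int>(std::strlen("iteration")));

static void render_one_frame() {
    __itt_event_start(my_event);
    /* ... placeholder work ... */
    __itt_event_end(my_event);   // optional for events without a duration
}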
+
+/**
+ * @defgroup legacy_memory Memory Accesses
+ * @ingroup legacy
+ */
+
+/**
+ * @deprecated Legacy API
+ * @brief Inform the tool of memory accesses on reading
+ */
+void LIBITTAPI __itt_memory_read(void *addr, size_t size);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(LIBITTAPI, void, memory_read, (void *addr, size_t size))
+#define __itt_memory_read ITTNOTIFY_VOID(memory_read)
+#define __itt_memory_read_ptr ITTNOTIFY_NAME(memory_read)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_memory_read(addr, size)
+#define __itt_memory_read_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_memory_read_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @deprecated Legacy API
+ * @brief Inform the tool of memory accesses on writing
+ */
+void LIBITTAPI __itt_memory_write(void *addr, size_t size);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(LIBITTAPI, void, memory_write, (void *addr, size_t size))
+#define __itt_memory_write ITTNOTIFY_VOID(memory_write)
+#define __itt_memory_write_ptr ITTNOTIFY_NAME(memory_write)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_memory_write(addr, size)
+#define __itt_memory_write_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_memory_write_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @deprecated Legacy API
+ * @brief Inform the tool of memory accesses on updating
+ */
+void LIBITTAPI __itt_memory_update(void *address, size_t size);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(LIBITTAPI, void, memory_update, (void *addr, size_t size))
+#define __itt_memory_update ITTNOTIFY_VOID(memory_update)
+#define __itt_memory_update_ptr ITTNOTIFY_NAME(memory_update)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_memory_update(addr, size)
+#define __itt_memory_update_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_memory_update_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} legacy_memory group */
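Editor's note: a sketch of the memory-access hints above wrapped around a plain copy; the legacy signatures take non-const pointers, hence the const_cast:

#include <cstring>
#include "legacy/ittnotify.h"   // assumed include path

static void annotated_copy(void* dst, const void* src, std::size_t n) {
    __itt_memory_read(const_cast<void*>(src), n);   // the tool is told n bytes are read from src
    __itt_memory_write(dst, n);                     // ... and n bytes are written to dst
    std::memcpy(dst, src, n);
}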
+
+/**
+ * @defgroup legacy_state Thread and Object States
+ * @ingroup legacy
+ */
+
+/** @brief state type */
+typedef int __itt_state_t;
+
+/** @cond exclude_from_documentation */
+typedef enum __itt_obj_state {
+ __itt_obj_state_err = 0,
+ __itt_obj_state_clr = 1,
+ __itt_obj_state_set = 2,
+ __itt_obj_state_use = 3
+} __itt_obj_state_t;
+
+typedef enum __itt_thr_state {
+ __itt_thr_state_err = 0,
+ __itt_thr_state_clr = 1,
+ __itt_thr_state_set = 2
+} __itt_thr_state_t;
+
+typedef enum __itt_obj_prop {
+ __itt_obj_prop_watch = 1,
+ __itt_obj_prop_ignore = 2,
+ __itt_obj_prop_sharable = 3
+} __itt_obj_prop_t;
+
+typedef enum __itt_thr_prop {
+ __itt_thr_prop_quiet = 1
+} __itt_thr_prop_t;
+/** @endcond */
+
+/**
+ * @deprecated Legacy API
+ * @brief managing thread and object states
+ */
+__itt_state_t LIBITTAPI __itt_state_get(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(ITTAPI, __itt_state_t, state_get, (void))
+#define __itt_state_get ITTNOTIFY_DATA(state_get)
+#define __itt_state_get_ptr ITTNOTIFY_NAME(state_get)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_state_get(void) (__itt_state_t)0
+#define __itt_state_get_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_state_get_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @deprecated Legacy API
+ * @brief managing thread and object states
+ */
+__itt_state_t LIBITTAPI __itt_state_set(__itt_state_t s);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(ITTAPI, __itt_state_t, state_set, (__itt_state_t s))
+#define __itt_state_set ITTNOTIFY_DATA(state_set)
+#define __itt_state_set_ptr ITTNOTIFY_NAME(state_set)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_state_set(s) (__itt_state_t)0
+#define __itt_state_set_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_state_set_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @deprecated Legacy API
+ * @brief managing thread and object modes
+ */
+__itt_thr_state_t LIBITTAPI __itt_thr_mode_set(__itt_thr_prop_t p, __itt_thr_state_t s);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(ITTAPI, __itt_thr_state_t, thr_mode_set, (__itt_thr_prop_t p, __itt_thr_state_t s))
+#define __itt_thr_mode_set ITTNOTIFY_DATA(thr_mode_set)
+#define __itt_thr_mode_set_ptr ITTNOTIFY_NAME(thr_mode_set)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_thr_mode_set(p, s) (__itt_thr_state_t)0
+#define __itt_thr_mode_set_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_thr_mode_set_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @deprecated Legacy API
+ * @brief managing thread and object modes
+ */
+__itt_obj_state_t LIBITTAPI __itt_obj_mode_set(__itt_obj_prop_t p, __itt_obj_state_t s);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(ITTAPI, __itt_obj_state_t, obj_mode_set, (__itt_obj_prop_t p, __itt_obj_state_t s))
+#define __itt_obj_mode_set ITTNOTIFY_DATA(obj_mode_set)
+#define __itt_obj_mode_set_ptr ITTNOTIFY_NAME(obj_mode_set)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_obj_mode_set(p, s) (__itt_obj_state_t)0
+#define __itt_obj_mode_set_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_obj_mode_set_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} legacy_state group */
+
+/**
+ * @defgroup frames Frames
+ * @ingroup legacy
+ * Frames group
+ * @{
+ */
+/**
+ * @brief opaque structure for frame identification
+ */
+typedef struct __itt_frame_t *__itt_frame;
+
+/**
+ * @brief Create a global frame with the given domain
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+__itt_frame ITTAPI __itt_frame_createA(const char *domain);
+__itt_frame ITTAPI __itt_frame_createW(const wchar_t *domain);
+#if defined(UNICODE) || defined(_UNICODE)
+# define __itt_frame_create __itt_frame_createW
+# define __itt_frame_create_ptr __itt_frame_createW_ptr
+#else /* UNICODE */
+# define __itt_frame_create __itt_frame_createA
+# define __itt_frame_create_ptr __itt_frame_createA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+__itt_frame ITTAPI __itt_frame_create(const char *domain);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_frame, frame_createA, (const char *domain))
+ITT_STUB(ITTAPI, __itt_frame, frame_createW, (const wchar_t *domain))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_frame, frame_create, (const char *domain))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_frame_createA ITTNOTIFY_DATA(frame_createA)
+#define __itt_frame_createA_ptr ITTNOTIFY_NAME(frame_createA)
+#define __itt_frame_createW ITTNOTIFY_DATA(frame_createW)
+#define __itt_frame_createW_ptr ITTNOTIFY_NAME(frame_createW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_frame_create ITTNOTIFY_DATA(frame_create)
+#define __itt_frame_create_ptr ITTNOTIFY_NAME(frame_create)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_frame_createA(domain)
+#define __itt_frame_createA_ptr 0
+#define __itt_frame_createW(domain)
+#define __itt_frame_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_frame_create(domain)
+#define __itt_frame_create_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_frame_createA_ptr 0
+#define __itt_frame_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_frame_create_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/** @brief Record a frame begin occurrence. */
+void ITTAPI __itt_frame_begin(__itt_frame frame);
+/** @brief Record a frame end occurrence. */
+void ITTAPI __itt_frame_end (__itt_frame frame);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, frame_begin, (__itt_frame frame))
+ITT_STUBV(ITTAPI, void, frame_end, (__itt_frame frame))
+#define __itt_frame_begin ITTNOTIFY_VOID(frame_begin)
+#define __itt_frame_begin_ptr ITTNOTIFY_NAME(frame_begin)
+#define __itt_frame_end ITTNOTIFY_VOID(frame_end)
+#define __itt_frame_end_ptr ITTNOTIFY_NAME(frame_end)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_frame_begin(frame)
+#define __itt_frame_begin_ptr 0
+#define __itt_frame_end(frame)
+#define __itt_frame_end_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_frame_begin_ptr 0
+#define __itt_frame_end_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} frames group */
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* _LEGACY_ITTNOTIFY_H_ */
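Editor's note: a sketch of the legacy frame API from the group above — one frame object per domain, begin/end once per loop iteration (non-UNICODE build assumed; the domain string is illustrative):

#include "legacy/ittnotify.h"   // assumed include path

static void run_main_loop(int iterations) {
    __itt_frame frame = __itt_frame_create("com.example.mainloop");
    for (int i = 0; i < iterations; ++i) {
        __itt_frame_begin(frame);
        /* ... one iteration of work ... */
        __itt_frame_end(frame);
    }
}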
diff --git a/contrib/libs/tbb/src/tbb/version.cpp b/contrib/libs/tbb/src/tbb/version.cpp
new file mode 100644
index 0000000000..ca113372f1
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/version.cpp
@@ -0,0 +1,26 @@
+/*
+ Copyright (c) 2020-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#include "oneapi/tbb/version.h"
+
+extern "C" int TBB_runtime_interface_version() {
+ return TBB_INTERFACE_VERSION;
+}
+
+extern "C" const char* TBB_runtime_version() {
+ static const char version_str[] = TBB_VERSION_STRING;
+ return version_str;
+}
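Editor's note: these two extern "C" entry points let an application report which TBB binary it actually loaded at run time, independently of the headers it was compiled against. A minimal check:

#include <cstdio>

extern "C" int TBB_runtime_interface_version();
extern "C" const char* TBB_runtime_version();

int main() {
    std::printf("TBB runtime %s, interface version %d\n",
                TBB_runtime_version(), TBB_runtime_interface_version());
    return 0;
}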
diff --git a/contrib/libs/tbb/src/tbb/waiters.h b/contrib/libs/tbb/src/tbb/waiters.h
new file mode 100644
index 0000000000..07ee5ab4f0
--- /dev/null
+++ b/contrib/libs/tbb/src/tbb/waiters.h
@@ -0,0 +1,204 @@
+/*
+ Copyright (c) 2005-2021 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+#ifndef _TBB_waiters_H
+#define _TBB_waiters_H
+
+#include "oneapi/tbb/detail/_task.h"
+#include "scheduler_common.h"
+#include "arena.h"
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+inline d1::task* get_self_recall_task(arena_slot& slot);
+
+class waiter_base {
+public:
+ waiter_base(arena& a) : my_arena(a), my_backoff(int(a.my_num_slots)) {}
+
+ bool pause() {
+ if (my_backoff.pause()) {
+ my_arena.is_out_of_work();
+ return true;
+ }
+
+ return false;
+ }
+
+ void reset_wait() {
+ my_backoff.reset_wait();
+ }
+
+protected:
+ arena& my_arena;
+ stealing_loop_backoff my_backoff;
+};
+
+class outermost_worker_waiter : public waiter_base {
+public:
+ using waiter_base::waiter_base;
+
+ bool continue_execution(arena_slot& slot, d1::task*& t) const {
+ __TBB_ASSERT(t == nullptr, nullptr);
+
+ if (is_worker_should_leave(slot)) {
+ // Leave dispatch loop
+ return false;
+ }
+
+ t = get_self_recall_task(slot);
+ return true;
+ }
+
+ void pause(arena_slot&) {
+ waiter_base::pause();
+ }
+
+
+ d1::wait_context* wait_ctx() {
+ return nullptr;
+ }
+
+ static bool postpone_execution(d1::task&) {
+ return false;
+ }
+
+private:
+ using base_type = waiter_base;
+
+ bool is_worker_should_leave(arena_slot& slot) const {
+ bool is_top_priority_arena = my_arena.my_is_top_priority.load(std::memory_order_relaxed);
+ bool is_task_pool_empty = slot.task_pool.load(std::memory_order_relaxed) == EmptyTaskPool;
+
+ if (is_top_priority_arena) {
+ // Workers in the top-priority arena do not leave the arena until all work in the task_pool is done
+ if (is_task_pool_empty && my_arena.is_recall_requested()) {
+ return true;
+ }
+ } else {
+ if (my_arena.is_recall_requested()) {
+ // If the worker still has work in its task pool, we must notify other threads,
+ // because otherwise they may miss a wakeup
+ if (!is_task_pool_empty) {
+ my_arena.advertise_new_work<arena::wakeup>();
+ }
+ return true;
+ }
+ }
+
+ return false;
+ }
+};
+
+class sleep_waiter : public waiter_base {
+protected:
+ using waiter_base::waiter_base;
+
+ bool is_arena_empty() {
+ return my_arena.my_pool_state.load(std::memory_order_relaxed) == arena::SNAPSHOT_EMPTY;
+ }
+
+ template <typename Pred>
+ void sleep(std::uintptr_t uniq_tag, Pred wakeup_condition) {
+ my_arena.my_market->get_wait_list().wait<extended_concurrent_monitor::thread_context>(wakeup_condition,
+ extended_context{uniq_tag, &my_arena});
+ }
+};
+
+class external_waiter : public sleep_waiter {
+public:
+ external_waiter(arena& a, d1::wait_context& wo)
+ : sleep_waiter(a), my_wait_ctx(wo)
+ {}
+
+ bool continue_execution(arena_slot& slot, d1::task*& t) const {
+ __TBB_ASSERT(t == nullptr, nullptr);
+ if (!my_wait_ctx.continue_execution())
+ return false;
+ t = get_self_recall_task(slot);
+ return true;
+ }
+
+ void pause(arena_slot&) {
+ if (!sleep_waiter::pause()) {
+ return;
+ }
+
+ auto wakeup_condition = [&] { return !is_arena_empty() || !my_wait_ctx.continue_execution(); };
+
+ sleep(std::uintptr_t(&my_wait_ctx), wakeup_condition);
+ my_backoff.reset_wait();
+ }
+
+ d1::wait_context* wait_ctx() {
+ return &my_wait_ctx;
+ }
+
+ static bool postpone_execution(d1::task&) {
+ return false;
+ }
+
+private:
+ d1::wait_context& my_wait_ctx;
+};
+
+#if __TBB_RESUMABLE_TASKS
+
+class coroutine_waiter : public sleep_waiter {
+public:
+ using sleep_waiter::sleep_waiter;
+
+ bool continue_execution(arena_slot& slot, d1::task*& t) const {
+ __TBB_ASSERT(t == nullptr, nullptr);
+ t = get_self_recall_task(slot);
+ return true;
+ }
+
+ void pause(arena_slot& slot) {
+ if (!sleep_waiter::pause()) {
+ return;
+ }
+
+ suspend_point_type* sp = slot.default_task_dispatcher().m_suspend_point;
+
+ auto wakeup_condition = [&] { return !is_arena_empty() || sp->m_is_owner_recalled.load(std::memory_order_relaxed); };
+
+ sleep(std::uintptr_t(sp), wakeup_condition);
+ my_backoff.reset_wait();
+ }
+
+ void reset_wait() {
+ my_backoff.reset_wait();
+ }
+
+ d1::wait_context* wait_ctx() {
+ return nullptr;
+ }
+
+ static bool postpone_execution(d1::task& t) {
+ return task_accessor::is_resume_task(t);
+ }
+};
+
+#endif // __TBB_RESUMABLE_TASKS
+
+} // namespace r1
+} // namespace detail
+} // namespace tbb
+
+#endif // _TBB_waiters_H
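Editor's note: the three waiter classes above share an implicit interface (continue_execution, pause, reset_wait, wait_ctx, postpone_execution) that the task dispatcher is templated over. A rough sketch of the calling pattern only, with the real dispatch and stealing machinery abstracted behind callables; this is not the actual task_dispatcher code:

// Sketch of the loop shape a Waiter is designed for; Slot, Task and the
// callables are template parameters so the sketch stays self-contained.
template <typename Waiter, typename Slot, typename Task, typename Execute, typename GetTask>
void wait_loop_sketch(Slot& slot, Waiter& waiter, Execute execute, GetTask get_task) {
    Task* t = nullptr;
    while (waiter.continue_execution(slot, t)) {   // false: leave the loop (recall requested or wait finished)
        if (!t)
            t = get_task(slot);                    // e.g. take a local task or steal one
        if (t) {
            execute(*t);
            waiter.reset_wait();                   // progress was made: restart the backoff
        } else {
            waiter.pause(slot);                    // back off; sleep_waiter variants may block on the market wait list
        }
        t = nullptr;                               // continue_execution() asserts a null task on entry
    }
}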