| Field | Value | Date |
|---|---|---|
| author | Devtools Arcadia <[email protected]> | 2022-02-07 18:08:42 +0300 |
| committer | Devtools Arcadia <[email protected]> | 2022-02-07 18:08:42 +0300 |
| commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
| tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /contrib/libs/llvm12/include/llvm/Support/Parallel.h | |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'contrib/libs/llvm12/include/llvm/Support/Parallel.h')
| Mode | File | Lines |
|---|---|---|
| -rw-r--r-- | contrib/libs/llvm12/include/llvm/Support/Parallel.h | 318 |
1 file changed, 318 insertions, 0 deletions
```diff
diff --git a/contrib/libs/llvm12/include/llvm/Support/Parallel.h b/contrib/libs/llvm12/include/llvm/Support/Parallel.h
new file mode 100644
index 00000000000..ac053bb4cce
--- /dev/null
+++ b/contrib/libs/llvm12/include/llvm/Support/Parallel.h
@@ -0,0 +1,318 @@
+#pragma once
+
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#endif
+
+//===- llvm/Support/Parallel.h - Parallel algorithms ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_PARALLEL_H
+#define LLVM_SUPPORT_PARALLEL_H
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Threading.h"
+
+#include <algorithm>
+#include <condition_variable>
+#include <functional>
+#include <mutex>
+
+namespace llvm {
+
+namespace parallel {
+
+// Strategy for the default executor used by the parallel routines provided by
+// this file. It defaults to using all hardware threads and should be
+// initialized before the first use of parallel routines.
+extern ThreadPoolStrategy strategy;
+
+namespace detail {
+
+#if LLVM_ENABLE_THREADS
+
+class Latch {
+  uint32_t Count;
+  mutable std::mutex Mutex;
+  mutable std::condition_variable Cond;
+
+public:
+  explicit Latch(uint32_t Count = 0) : Count(Count) {}
+  ~Latch() { sync(); }
+
+  void inc() {
+    std::lock_guard<std::mutex> lock(Mutex);
+    ++Count;
+  }
+
+  void dec() {
+    std::lock_guard<std::mutex> lock(Mutex);
+    if (--Count == 0)
+      Cond.notify_all();
+  }
+
+  void sync() const {
+    std::unique_lock<std::mutex> lock(Mutex);
+    Cond.wait(lock, [&] { return Count == 0; });
+  }
+};
+
+class TaskGroup {
+  Latch L;
+  bool Parallel;
+
+public:
+  TaskGroup();
+  ~TaskGroup();
+
+  void spawn(std::function<void()> f);
+
+  void sync() const { L.sync(); }
+};
+
+const ptrdiff_t MinParallelSize = 1024;
+
+/// Inclusive median.
+template <class RandomAccessIterator, class Comparator>
+RandomAccessIterator medianOf3(RandomAccessIterator Start,
+                               RandomAccessIterator End,
+                               const Comparator &Comp) {
+  RandomAccessIterator Mid = Start + (std::distance(Start, End) / 2);
+  return Comp(*Start, *(End - 1))
+             ? (Comp(*Mid, *(End - 1)) ? (Comp(*Start, *Mid) ? Mid : Start)
+                                       : End - 1)
+             : (Comp(*Mid, *Start) ? (Comp(*(End - 1), *Mid) ? Mid : End - 1)
+                                   : Start);
+}
+
+template <class RandomAccessIterator, class Comparator>
+void parallel_quick_sort(RandomAccessIterator Start, RandomAccessIterator End,
+                         const Comparator &Comp, TaskGroup &TG, size_t Depth) {
+  // Do a sequential sort for small inputs.
+  if (std::distance(Start, End) < detail::MinParallelSize || Depth == 0) {
+    llvm::sort(Start, End, Comp);
+    return;
+  }
+
+  // Partition.
+  auto Pivot = medianOf3(Start, End, Comp);
+  // Move Pivot to End.
+  std::swap(*(End - 1), *Pivot);
+  Pivot = std::partition(Start, End - 1, [&Comp, End](decltype(*Start) V) {
+    return Comp(V, *(End - 1));
+  });
+  // Move Pivot to middle of partition.
+  std::swap(*Pivot, *(End - 1));
+
+  // Recurse.
+  TG.spawn([=, &Comp, &TG] {
+    parallel_quick_sort(Start, Pivot, Comp, TG, Depth - 1);
+  });
+  parallel_quick_sort(Pivot + 1, End, Comp, TG, Depth - 1);
+}
+
+template <class RandomAccessIterator, class Comparator>
+void parallel_sort(RandomAccessIterator Start, RandomAccessIterator End,
+                   const Comparator &Comp) {
+  TaskGroup TG;
+  parallel_quick_sort(Start, End, Comp, TG,
+                      llvm::Log2_64(std::distance(Start, End)) + 1);
+}
+
+// TaskGroup has a relatively high overhead, so we want to reduce
+// the number of spawn() calls. We'll create up to 1024 tasks here.
+// (Note that 1024 is an arbitrary number. This code probably needs
+// improving to take the number of available cores into account.)
+enum { MaxTasksPerGroup = 1024 };
+
+template <class IterTy, class FuncTy>
+void parallel_for_each(IterTy Begin, IterTy End, FuncTy Fn) {
+  // Limit the number of tasks to MaxTasksPerGroup to limit job scheduling
+  // overhead on large inputs.
+  ptrdiff_t TaskSize = std::distance(Begin, End) / MaxTasksPerGroup;
+  if (TaskSize == 0)
+    TaskSize = 1;
+
+  TaskGroup TG;
+  while (TaskSize < std::distance(Begin, End)) {
+    TG.spawn([=, &Fn] { std::for_each(Begin, Begin + TaskSize, Fn); });
+    Begin += TaskSize;
+  }
+  std::for_each(Begin, End, Fn);
+}
+
+template <class IndexTy, class FuncTy>
+void parallel_for_each_n(IndexTy Begin, IndexTy End, FuncTy Fn) {
+  // Limit the number of tasks to MaxTasksPerGroup to limit job scheduling
+  // overhead on large inputs.
+  ptrdiff_t TaskSize = (End - Begin) / MaxTasksPerGroup;
+  if (TaskSize == 0)
+    TaskSize = 1;
+
+  TaskGroup TG;
+  IndexTy I = Begin;
+  for (; I + TaskSize < End; I += TaskSize) {
+    TG.spawn([=, &Fn] {
+      for (IndexTy J = I, E = I + TaskSize; J != E; ++J)
+        Fn(J);
+    });
+  }
+  for (IndexTy J = I; J < End; ++J)
+    Fn(J);
+}
+
+template <class IterTy, class ResultTy, class ReduceFuncTy,
+          class TransformFuncTy>
+ResultTy parallel_transform_reduce(IterTy Begin, IterTy End, ResultTy Init,
+                                   ReduceFuncTy Reduce,
+                                   TransformFuncTy Transform) {
+  // Limit the number of tasks to MaxTasksPerGroup to limit job scheduling
+  // overhead on large inputs.
+  size_t NumInputs = std::distance(Begin, End);
+  if (NumInputs == 0)
+    return std::move(Init);
+  size_t NumTasks = std::min(static_cast<size_t>(MaxTasksPerGroup), NumInputs);
+  std::vector<ResultTy> Results(NumTasks, Init);
+  {
+    // Each task processes either TaskSize or TaskSize+1 inputs. Any inputs
+    // remaining after dividing them equally amongst tasks are distributed as
+    // one extra input over the first tasks.
+    TaskGroup TG;
+    size_t TaskSize = NumInputs / NumTasks;
+    size_t RemainingInputs = NumInputs % NumTasks;
+    IterTy TBegin = Begin;
+    for (size_t TaskId = 0; TaskId < NumTasks; ++TaskId) {
+      IterTy TEnd = TBegin + TaskSize + (TaskId < RemainingInputs ? 1 : 0);
+      TG.spawn([=, &Transform, &Reduce, &Results] {
+        // Reduce the result of transformation eagerly within each task.
+        ResultTy R = Init;
+        for (IterTy It = TBegin; It != TEnd; ++It)
+          R = Reduce(R, Transform(*It));
+        Results[TaskId] = R;
+      });
+      TBegin = TEnd;
+    }
+    assert(TBegin == End);
+  }
+
+  // Do a final reduction. There are at most 1024 tasks, so this only adds
+  // constant single-threaded overhead for large inputs. Hopefully most
+  // reductions are cheaper than the transformation.
+  ResultTy FinalResult = std::move(Results.front());
+  for (ResultTy &PartialResult :
+       makeMutableArrayRef(Results.data() + 1, Results.size() - 1))
+    FinalResult = Reduce(FinalResult, std::move(PartialResult));
+  return std::move(FinalResult);
+}
+
+#endif
+
+} // namespace detail
+} // namespace parallel
+
+template <class RandomAccessIterator,
+          class Comparator = std::less<
+              typename std::iterator_traits<RandomAccessIterator>::value_type>>
+void parallelSort(RandomAccessIterator Start, RandomAccessIterator End,
+                  const Comparator &Comp = Comparator()) {
+#if LLVM_ENABLE_THREADS
+  if (parallel::strategy.ThreadsRequested != 1) {
+    parallel::detail::parallel_sort(Start, End, Comp);
+    return;
+  }
+#endif
+  llvm::sort(Start, End, Comp);
+}
+
+template <class IterTy, class FuncTy>
+void parallelForEach(IterTy Begin, IterTy End, FuncTy Fn) {
+#if LLVM_ENABLE_THREADS
+  if (parallel::strategy.ThreadsRequested != 1) {
+    parallel::detail::parallel_for_each(Begin, End, Fn);
+    return;
+  }
+#endif
+  std::for_each(Begin, End, Fn);
+}
+
+template <class FuncTy>
+void parallelForEachN(size_t Begin, size_t End, FuncTy Fn) {
+#if LLVM_ENABLE_THREADS
+  if (parallel::strategy.ThreadsRequested != 1) {
+    parallel::detail::parallel_for_each_n(Begin, End, Fn);
+    return;
+  }
+#endif
+  for (size_t I = Begin; I != End; ++I)
+    Fn(I);
+}
+
+template <class IterTy, class ResultTy, class ReduceFuncTy,
+          class TransformFuncTy>
+ResultTy parallelTransformReduce(IterTy Begin, IterTy End, ResultTy Init,
+                                 ReduceFuncTy Reduce,
+                                 TransformFuncTy Transform) {
+#if LLVM_ENABLE_THREADS
+  if (parallel::strategy.ThreadsRequested != 1) {
+    return parallel::detail::parallel_transform_reduce(Begin, End, Init, Reduce,
+                                                       Transform);
+  }
+#endif
+  for (IterTy I = Begin; I != End; ++I)
+    Init = Reduce(std::move(Init), Transform(*I));
+  return std::move(Init);
+}
+
+// Range wrappers.
+template <class RangeTy,
+          class Comparator = std::less<decltype(*std::begin(RangeTy()))>>
+void parallelSort(RangeTy &&R, const Comparator &Comp = Comparator()) {
+  parallelSort(std::begin(R), std::end(R), Comp);
+}
+
+template <class RangeTy, class FuncTy>
+void parallelForEach(RangeTy &&R, FuncTy Fn) {
+  parallelForEach(std::begin(R), std::end(R), Fn);
+}
+
+template <class RangeTy, class ResultTy, class ReduceFuncTy,
+          class TransformFuncTy>
+ResultTy parallelTransformReduce(RangeTy &&R, ResultTy Init,
+                                 ReduceFuncTy Reduce,
+                                 TransformFuncTy Transform) {
+  return parallelTransformReduce(std::begin(R), std::end(R), Init, Reduce,
+                                 Transform);
+}
+
+// Parallel for-each, but with error handling.
+template <class RangeTy, class FuncTy>
+Error parallelForEachError(RangeTy &&R, FuncTy Fn) {
+  // The transform_reduce algorithm requires that the initial value be copyable.
+  // Error objects are uncopyable. We only need to copy initial success values,
+  // so work around this mismatch via the C API. The C API represents success
+  // values with a null pointer. The joinErrors discards null values and joins
+  // multiple errors into an ErrorList.
+  return unwrap(parallelTransformReduce(
+      std::begin(R), std::end(R), wrap(Error::success()),
+      [](LLVMErrorRef Lhs, LLVMErrorRef Rhs) {
+        return wrap(joinErrors(unwrap(Lhs), unwrap(Rhs)));
+      },
+      [&Fn](auto &&V) { return wrap(Fn(V)); }));
+}
+
+} // namespace llvm
+
+#endif // LLVM_SUPPORT_PARALLEL_H
+
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
```
