diff options
author | vvvv <vvvv@ydb.tech> | 2023-02-09 19:59:45 +0300 |
---|---|---|
committer | vvvv <vvvv@ydb.tech> | 2023-02-09 19:59:45 +0300 |
commit | 698b400d09b3c1c7aff6ebf986eb3dc3ced74a08 (patch) | |
tree | 75fc45b302da8f3b029a0b8b5730cc004eb5b5aa /contrib/libs/miniselect | |
parent | abc4875e357da7c917051f7fce5138900938bbfa (diff) | |
download | ydb-698b400d09b3c1c7aff6ebf986eb3dc3ced74a08.tar.gz |
use miniselect
Для запроса
%%
pragma UseBlocks;
SELECT
RemoteIP as r
FROM
`yq-clickbench-local`.`hits_*.parquet`
WITH
(
format=parquet,
SCHEMA
(
RemoteIP INTEGER NOT NULL,
)
)
order by r limit 5
;
%%
время уменьшилось с 0.47c до 0,34с
Diffstat (limited to 'contrib/libs/miniselect')
-rw-r--r-- | contrib/libs/miniselect/AUTHORS | 2 | ||||
-rw-r--r-- | contrib/libs/miniselect/CMakeLists.darwin.txt | 14 | ||||
-rw-r--r-- | contrib/libs/miniselect/CMakeLists.linux-aarch64.txt | 15 | ||||
-rw-r--r-- | contrib/libs/miniselect/CMakeLists.linux.txt | 15 | ||||
-rw-r--r-- | contrib/libs/miniselect/CMakeLists.txt | 15 | ||||
-rw-r--r-- | contrib/libs/miniselect/LICENSE_1_0.txt | 23 | ||||
-rw-r--r-- | contrib/libs/miniselect/README.md | 274 | ||||
-rw-r--r-- | contrib/libs/miniselect/include/miniselect/floyd_rivest_select.h | 129 |
8 files changed, 487 insertions, 0 deletions
diff --git a/contrib/libs/miniselect/AUTHORS b/contrib/libs/miniselect/AUTHORS new file mode 100644 index 0000000000..896a8046a7 --- /dev/null +++ b/contrib/libs/miniselect/AUTHORS @@ -0,0 +1,2 @@ +# List of authors for copyright purposes, in no particular order +Danila Kutenin diff --git a/contrib/libs/miniselect/CMakeLists.darwin.txt b/contrib/libs/miniselect/CMakeLists.darwin.txt new file mode 100644 index 0000000000..f51760c82e --- /dev/null +++ b/contrib/libs/miniselect/CMakeLists.darwin.txt @@ -0,0 +1,14 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(contrib-libs-miniselect INTERFACE) +target_link_libraries(contrib-libs-miniselect INTERFACE + contrib-libs-cxxsupp + yutil +) diff --git a/contrib/libs/miniselect/CMakeLists.linux-aarch64.txt b/contrib/libs/miniselect/CMakeLists.linux-aarch64.txt new file mode 100644 index 0000000000..ff3e8d2dd3 --- /dev/null +++ b/contrib/libs/miniselect/CMakeLists.linux-aarch64.txt @@ -0,0 +1,15 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(contrib-libs-miniselect INTERFACE) +target_link_libraries(contrib-libs-miniselect INTERFACE + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil +) diff --git a/contrib/libs/miniselect/CMakeLists.linux.txt b/contrib/libs/miniselect/CMakeLists.linux.txt new file mode 100644 index 0000000000..ff3e8d2dd3 --- /dev/null +++ b/contrib/libs/miniselect/CMakeLists.linux.txt @@ -0,0 +1,15 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(contrib-libs-miniselect INTERFACE) +target_link_libraries(contrib-libs-miniselect INTERFACE + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil +) diff --git a/contrib/libs/miniselect/CMakeLists.txt b/contrib/libs/miniselect/CMakeLists.txt new file mode 100644 index 0000000000..5bb4faffb4 --- /dev/null +++ b/contrib/libs/miniselect/CMakeLists.txt @@ -0,0 +1,15 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND UNIX AND NOT APPLE AND NOT ANDROID) + include(CMakeLists.linux-aarch64.txt) +elseif (APPLE AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + include(CMakeLists.darwin.txt) +elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND UNIX AND NOT APPLE AND NOT ANDROID) + include(CMakeLists.linux.txt) +endif() diff --git a/contrib/libs/miniselect/LICENSE_1_0.txt b/contrib/libs/miniselect/LICENSE_1_0.txt new file mode 100644 index 0000000000..36b7cd93cd --- /dev/null +++ b/contrib/libs/miniselect/LICENSE_1_0.txt @@ -0,0 +1,23 @@ +Boost Software License - Version 1.0 - August 17th, 2003 + +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/contrib/libs/miniselect/README.md b/contrib/libs/miniselect/README.md new file mode 100644 index 0000000000..9b2a4a2233 --- /dev/null +++ b/contrib/libs/miniselect/README.md @@ -0,0 +1,274 @@ +[![Build Status](https://travis-ci.com/danlark1/miniselect.svg?branch=main)](https://travis-ci.com/danlark1/miniselect) +[![License](https://img.shields.io/badge/License-Boost%201.0-lightblue.svg)](https://www.boost.org/LICENSE_1_0.txt) + +miniselect: Generic selection and partial ordering algorithms +============================================================== + +`miniselect` is a C++ header-only library that contains various generic selection +and partial sorting algorithms with the ease of use, testing, advice on usage and +benchmarking. + +Sorting is everywhere and there are many outstanding sorting algorithms that +compete in speed, comparison count and cache friendliness. However, selection +algorithms are always a bit outside of the competition scope, they are +pretty important, for example, in databases ORDER BY LIMIT N is used extremely +often which can benefit from more optimal selection and partial sorting +algorithms. This library tries to solve this problem with Modern C++. + +* **Easy:** First-class, easy to use dependency and carefully documented APIs and algorithm properties. +* **Fast:** We do care about speed of the algorithms and provide reasonable implementations. +* **Standard compliant:** We provide C++11 compatible APIs that are compliant to the standard [`std::nth_element`](https://en.cppreference.com/w/cpp/algorithm/nth_element) and [`std::partial_sort`](https://en.cppreference.com/w/cpp/algorithm/partial_sort) functions including custom comparators and order guarantees. Just replace the names of the functions in your project and it should work! +* **Well tested:** We test all algorithms with a unified framework, under sanitizers and fuzzing. +* **Benchmarked:** We gather benchmarks for all implementations to better understand good and bad spots. + +Table of Contents +----------------- + +* [Quick Start](#quick-start) +* [Testing](#testing) +* [Documentation](#documentation) +* [Performance results](#performance-results) +* [Real-world usage](#real-world-usage) +* [Contributing](#contributing) +* [Motivation](#motivation) +* [License](#license) + +Quick Start +----------- + +You can either include this project as a cmake dependency and then use the +headers that are provided in the [include](./include) folder or just pass the +[include](./include) folder to your compiler. + +```cpp +#include <iostream> +#include <vector> + +#include "miniselect/median_of_ninthers.h" + +int main() { + std::vector<int> v = {1, 8, 4, 3, 2, 9, 0, 7, 6, 5}; + miniselect::median_of_ninthers_select(v.begin(), v.begin() + 5, v.end()); + for (const int i : v) { + std::cout << i << ' '; + } + return 0; +} +// Compile it `clang++/g++ -I$DIRECTORY/miniselect/include/ example.cpp -std=c++11 -O3 -o example +// Possible output: 0 1 4 3 2 5 8 7 6 9 +// ^ on the right place +``` + +Examples can be found in [examples](./examples). + +We support all compilers starting from GCC 7 and Clang 6. We are also planning +to support Windows, for now it is best effort but no issues are known so far. + +More on which algorithms are available, see [documentation](#documentation). +For overview of this work you can read the [article](https://danlark.org/2020/11/11/miniselect-practical-and-generic-selection-algorithms/) +in the author's blog. + +Testing +------- + +To test and benchmark, we use [Google benchmark](https://github.com/google/benchmark) library. +Simply do in the root directory: + +```console +# Check out the libraries. +$ git clone https://github.com/google/benchmark.git +$ git clone https://github.com/google/googletest.git +$ mkdir build && cd build +$ cmake -DMINISELECT_TESTING=on -DBENCHMARK_ENABLE_GTEST_TESTS=off -DBENCHMARK_ENABLE_TESTING=off .. +$ make -j +$ ctest -j4 --output-on-failure +``` + +It will create two tests and two benchmarks `test_sort`, `test_select`, +`benchmark_sort`, `benchmark_select`. Use them to validate or contribute. You +can also use `ctest`. + +Documentation +------------- + +There are several selection algorithms available, further ![\large n](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+n) is the number +of elements in the array, ![\large k](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+k) is the selection element that is needed to be found (all algorithms are deterministic and not stable unless otherwise is specified): + + +| Name | Average | Best Case | Worst Case | Comparisons | Memory | +|------------------------- |--------------------------------------------------------------------------------------------------------- |--------------------------------------------------------------------------------------------------------- |----------------------------------------------------------------------------------------------------------------------- |---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |--------------------------------------------------------------------------------------------------------------------------------- | +| [pdqselect](./include/miniselect/pdqselect.h) | ![\large O(n)](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+O%28n%29) | ![\large O(n)](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+O%28n%29) | ![\large O(n\log n)](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+O%28n%5Clog+n%29) | At least ![\large 2n](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+2n). Random data ![\large 2.5n](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+2.5n) | ![\large O(1)](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+O%281%29) | +| [Floyd-Rivest](./include/miniselect/floyd_rivest_select.h) | ![\large O(n)](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+O%28n%29) | ![\large O(n)](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+O%28n%29) | ![\large O(n^2 )](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+O%28n%5E2+%29) | Avg: ![\large n + \min(k, n - k) + O(\sqrt{n \log n})](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+n+%2B+%5Cmin%28k%2C+n+-+k%29+%2B+O%28%5Csqrt%7Bn+%5Clog+n%7D%29) | ![\large O(\log \log n)](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+O%28%5Clog+%5Clog+n%29) | +| [Median Of Medians](./include/miniselect/median_of_medians.h) | ![\large O(n)](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+O%28n%29) | ![\large O(n)](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+O%28n%29) | ![\large O(n)](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+O%28n%29) | Between ![\large 2n](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+2n) and ![\large 22n](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+22n). Random data ![\large 2.5n](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+2.5n) | ![\large O(\log n)](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+O%28%5Clog+n%29) | +| [Median Of Ninthers](./include/miniselect/median_of_ninthers.h) | ![\large O(n)](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+O%28n%29) | ![\large O(n)](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+O%28n%29) | ![\large O(n)](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+O%28n%29) | Between ![\large 2n](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+2n) and ![\large 21n](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+21n). Random data ![\large 2n](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+2n) | ![\large O(\log n)](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+O%28%5Clog+n%29) | +| [Median Of 3 Random](./include/miniselect/median_of_3_random.h) | ![\large O(n)](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+O%28n%29) | ![\large O(n)](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+O%28n%29) | ![\large O(n^2 )](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+O%28n%5E2+%29) | At least ![\large 2n](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+2n). Random data ![\large 3n](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+3n) | ![\large O(\log n)](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+O%28%5Clog+n%29) | +| [HeapSelect](./include/miniselect/heap_select.h) | ![\large O(n\log k)](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+O%28n%5Clog+k%29) | ![\large O(n)](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+O%28n%29) | ![\large O(n\log k)](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+O%28n%5Clog+k%29) | ![\large n\log k](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+n%5Clog+k) on average, for some data patterns might be better | ![\large O(1)](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+O%281%29) | +| [libstdc++ (introselect)](https://github.com/gcc-mirror/gcc/blob/e0af865ab9d9d5b6b3ac7fdde26cf9bbf635b6b4/libstdc%2B%2B-v3/include/bits/stl_algo.h#L4748) | ![\large O(n)](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+O%28n%29) | ![\large O(n)](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+O%28n%29) | ![\large O(n\log n)](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+O%28n%5Clog+n%29) | At least ![\large 2n](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+2n). Random data ![\large 3n](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+3n) | ![\large O(1)](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+O%281%29) | +| [libc++ (median of 3)](https://github.com/llvm/llvm-project/blob/3ed89b51da38f081fedb57727076262abb81d149/libcxx/include/algorithm#L5159) | ![\large O(n)](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+O%28n%29) | ![\large O(n)](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+O%28n%29) | ![\large O(n^2 )](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+O%28n%5E2+%29) | At least ![\large 2n](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+2n). Random data ![\large 3n](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+3n) | ![\large O(1)](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+O%281%29) | + +For sorting the situation is similar except every line adds ![\large O(k\log k)](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+O%28k%5Clog+k%29) comparisons and pdqselect is using ![\large O(\log n)](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+O%28%5Clog+n%29) memory. + +## API + +All functions end either in `select`, either in `partial_sort` and +their behavior is exactly the same as for +[`std::nth_element`](https://en.cppreference.com/w/cpp/algorithm/nth_element) +and [`std::partial_sort`](https://en.cppreference.com/w/cpp/algorithm/partial_sort) +respectively, i.e. they accept 3 arguments as `first`, `middle`, `end` iterators +and an optional comparator. Several notes: + +* You should not throw exceptions from `Compare` function. Standard library +also does not specify the behavior in that matter. +* We don't support ParallelSTL for now. +* C++20 constexpr specifiers might be added but currently we don't have them +because of some floating point math in several algorithms. +* All functions are in the `miniselect` namespace. See the example for that. + +- pdqselect + - This algorithm is based on [`pdqsort`](https://github.com/orlp/pdqsort) which is acknowledged as one of the fastest generic sort algorithms. + - **Location:** [`miniselect/pdqselect.h`](./include/miniselect/pdqselect.h). + - **Functions:** `pdqselect`, `pdqselect_branchless`, `pdqpartial_sort`, `pdqpartial_sort_branchless`. Branchless version uses branchless partition algorithm provided by [`pdqsort`](https://github.com/orlp/pdqsort). Use it if your comparison function is branchless, it might give performance for very big ranges. + - **Performance advice:** Use it when you need to sort a big chunk so that ![\large k](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+k) is close to ![\large n](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+n). + +<p align="center"><img src="https://media.giphy.com/media/TXIm9rTmbmox5ceSyP/giphy.gif" /></p> + +- Floyd-Rivest + - This algorithm is based on [Floyd-Rivest algorithm](https://en.wikipedia.org/wiki/Floyd%E2%80%93Rivest_algorithm). + - **Location:** [`miniselect/floyd_rivest_select.h`](./include/miniselect/floyd_rivest_select.h). + - **Functions:** `floyd_rivest_select`, `floyd_rivest_partial_sort`. + - **Performance advice:** Given that this algorithm performs as one of the best on average case in terms of comparisons and speed, we highly advise to + at least try this in your project. Especially it is good for small ![\large k](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+k) or types that are expensive to compare (for example, strings). But even for median the benchmarks show it outperforms others. It is not easy for this algorithm to build a reasonable worst case but one of examples when this algorithm does not perform well is when there are lots of similar values of linear size (random01 dataset showed some moderate penalties). + +We present here two gifs, for median and for ![\large k = n / 10](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+k+%3D+n+%2F+10) order statistic. + +<p float="left"> + <img src="https://media.giphy.com/media/a5ORb22iMCE0a6D2cf/giphy.gif" width="48%" /> + <img src="https://media.giphy.com/media/Gpk4c9pHMJLbjugDmZ/giphy.gif" width="48%" /> +</p> + +- Median Of Medians + - This algorithm is based on [Median of Medians](https://en.wikipedia.org/wiki/Median_of_medians) algorithm, one of the first deterministic linear time worst case median algorithm. + - **Location:** [`miniselect/median_of_medians.h`](./include/miniselect/median_of_medians.h). + - **Functions:** `median_of_medians_select`, `median_of_medians_partial_sort`. + - **Performance advice:** This algorithm does not show advantages over others, implemented for historical reasons and for bechmarking. + +<p align="center"><img src="https://media.giphy.com/media/C0txh78ngyEGqmrX7c/giphy.gif" /></p> + +- Median Of Ninthers + - This algorithm is based on [Fast Deterministic Selection](https://erdani.com/research/sea2017.pdf) paper by Andrei Alexandrescu, one of the latest and fastest deterministic linear time worst case median algorithms. + - **Location:** [`miniselect/median_of_ninthers.h`](./include/miniselect/median_of_ninthers.h). + - **Functions:** `median_of_ninthers_select`, `median_of_ninthers_partial_sort`. + - **Performance advice:** Use this algorithm if you absolutely need linear time worst case scenario for selection algorithm. This algorithm shows some strengths over other deterministic [`PICK`](https://en.wikipedia.org/wiki/Median_of_medians) algorithms and has lower constanst than MedianOfMedians. + +<p align="center"><img src="https://media.giphy.com/media/usKlqJoh1WVLWLU9Dt/giphy.gif" /></p> + +- Median Of 3 Random + - This algorithm is based on QuickSelect with the random median of 3 pivot choice algorithm (it chooses random 3 elements in the range and takes the middle value). It is a randomized algorithm. + - **Location:** [`miniselect/median_of_3_random.h`](./include/miniselect/median_of_3_random.h). + - **Functions:** `median_of_3_random_select`, `median_of_3_random_partial_sort`. + - **Performance advice:** This is a randomized algorithm and also it did not show any strengths against Median Of Ninthers. + +<p align="center"><img src="https://media.giphy.com/media/GrbIu6PvrMuvoowp3U/giphy.gif" /></p> + +- Introselect + - This algorithm is based on [Introselect](https://en.wikipedia.org/wiki/Introselect) algorithm, it is used in libstdc++ in `std::nth_element`, however instead of falling back to MedianOfMedians it is using HeapSelect which adds logarithm to its worst complexity. + - **Location:** `<algorithm>`. + - **Functions:** `std::nth_element`. + - **Performance advice:** This algorithm is used in standard library and is not recommended to use if you are looking for performance. + +<p align="center"><img src="https://media.giphy.com/media/VOBM4MVBpiTgkbA6CH/giphy.gif" /></p> + +- Median Of 3 + - This algorithm is based on QuickSelect with median of 3 pivot choice algorithm (the middle value between begin, mid and end values), it is used in libc++ in `std::nth_element`. + - **Location:** `<algorithm>`. + - **Functions:** `std::nth_element`. + - **Performance advice:** This algorithm is used in standard library and is not recommended to use if you are looking for performance. + +<p align="center"><img src="https://media.giphy.com/media/03eJ0S7H79Jdtrv49F/giphy.gif" /></p> + +- `std::partial_sort` or `HeapSelect` + - This algorithm has [heap-based solutions](https://en.wikipedia.org/wiki/Partial_sorting) both in libc++ and libstdc++, from the first ![\large k](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+k) elements the max heap is built, then one by one the elements are trying to be pushed to that heap with HeapSort in the end. + - **Location:** `<algorithm>`, [`miniselect/heap_select.h`](./include/miniselect/heap_select.h). + - **Functions:** `std::partial_sort`, `heap_select`, `heap_partial_sort`. + - **Performance advice:** This algorithm is very good for random data and small ![\large k](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+k) and might outperform all selection+sort algorithms. However, for descending data it starts to significantly degrade and is not recommended for use if you have such patterns in real data. + +<p align="center"><img src="https://media.giphy.com/media/MAw3Tk2TDxrnv6vLlu/giphy.gif" /></p> + +## Other algorithms to come + +* Kiwiel modification of FloydRivest algorithm which is described in [On Floyd and Rivest’s SELECT algorithm](https://core.ac.uk/download/pdf/82672439.pdf) with ternary and quintary pivots. +* Combination of FloydRivest and pdqsort pivot strategies, currently all experiments did not show any boost. + +Performance results +------------------- + +We use 10 datasets and 8 algorithms with 10000000 elements to find median and +other ![\large k](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+k) on `Intel(R) Core(TM) i5-4200H CPU @ 2.80GHz` for `std::vector<int>`, +for median the benchmarks are the following: + +![median](benches/plots/result_10000000_5000000.png) + +![median](benches/plots/result_comparisons_10000000_5000000.png) + +![median](benches/plots/result_accesses_10000000_5000000.png) + +For smaller ![\large k](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+k), +for example, 1000, the results are the following + +![k equals 1000](benches/plots/result_10000000_1000.png) + +![k equals 1000](benches/plots/result_comparisons_10000000_1000.png) + +![k equals 1000](benches/plots/result_accesses_10000000_1000.png) + +Other benchmarks can be found [here](https://drive.google.com/drive/folders/1DHEaeXgZuX6AJ9eByeZ8iQVQv0ueP8XM). + + +Real-world usage +---------------- + +- [Yandex ClickHouse](https://github.com/yandex/ClickHouse) + +If you are planning to use miniselect in your product, please work from one of +our releases and if you wish, you can write the acknowledgment in this section +for visibility. + +Contributing +------------ + +Patches are welcome with new algorithms! You should add the selection algorithm +together with the partial sorting algorithm in [include](./include), add +tests in [testing](./testing) and ideally run benchmarks to see how it performs. +If you also have some data cases to test against, we would be more than happy +to merge them. + +Motivation +---------- + +Firstly the author was interested if any research had been done for small ![\large k](https://render.githubusercontent.com/render/math?math=%5Cdisplaystyle+%5Clarge+k) +in selection algorithms and was struggling to find working implementations to +compare different approaches from standard library and quickselect algorithms. +After that it turned out that the problem is much more interesting than it looks +like and after reading The Art of Computer Programming from Donald Knuth about +minimum comparison sorting and selection algorithms the author decided to look +through all non-popular algorithms and try them out. + +The author have not found any decent library for selection algorithms and little +research is published in open source, so that they decided to merge all that +implementations and compare them with possible merging of different ideas +into a decent one algorithm for most needs. For a big story of adventures see +the author's [blog post](https://danlark.org/2020/11/11/miniselect-practical-and-generic-selection-algorithms/). + +License +------- + +The code is made available under the [Boost License 1.0](https://boost.org/LICENSE_1_0.txt). + +Third-Party Libraries Used and Adjusted +--------------------------------------- + +| Library | License | +|---------------------|--------------------------------------------------------------------------------------------------| +| pdqsort | [MIT](https://github.com/orlp/pdqsort/blob/47a46767d76fc852284eaa083e4b7034ee6e2559/license.txt) | +| MedianOfNinthers | [Boost License 1.0](https://github.com/andralex/MedianOfNinthers/blob/master/LICENSE_1_0.txt) | + diff --git a/contrib/libs/miniselect/include/miniselect/floyd_rivest_select.h b/contrib/libs/miniselect/include/miniselect/floyd_rivest_select.h new file mode 100644 index 0000000000..0dcdcd500c --- /dev/null +++ b/contrib/libs/miniselect/include/miniselect/floyd_rivest_select.h @@ -0,0 +1,129 @@ +/* Copyright Danila Kutenin, 2020-. + * Distributed under the Boost Software License, Version 1.0. + * (See accompanying file LICENSE_1_0.txt or copy at + * https://boost.org/LICENSE_1_0.txt) + */ +#pragma once + +#include <algorithm> +#include <cmath> +#include <cstddef> +#include <functional> +#include <iterator> +#include <type_traits> +#include <utility> + +namespace miniselect { +namespace floyd_rivest_detail { + +enum floyd_rivest_constants { + kQCap = 600, +}; + +template <class Compare> +struct CompareRefType { + // Pass the comparator by lvalue reference. Or in debug mode, using a + // debugging wrapper that stores a reference. + using type = typename std::add_lvalue_reference<Compare>::type; +}; + +template <class Iter, class Compare, + class DiffType = typename std::iterator_traits<Iter>::difference_type> +inline void floyd_rivest_select_loop(Iter begin, DiffType left, DiffType right, + DiffType k, Compare comp) { + while (right > left) { + DiffType size = right - left; + if (size > floyd_rivest_constants::kQCap) { + DiffType n = right - left + 1; + DiffType i = k - left + 1; + + double z = log(n); + double s = 0.5 * exp(2 * z / 3); + double sd = 0.5 * sqrt(z * s * (n - s) / n); + if (i < n / 2) { + sd *= -1.0; + } + DiffType new_left = + std::max(left, static_cast<DiffType>(k - i * s / n + sd)); + DiffType new_right = + std::min(right, static_cast<DiffType>(k + (n - i) * s / n + sd)); + floyd_rivest_select_loop<Iter, Compare, DiffType>(begin, new_left, + new_right, k, comp); + } + DiffType i = left; + DiffType j = right; + + std::swap(begin[left], begin[k]); + const bool to_swap = comp(begin[left], begin[right]); + if (to_swap) { + std::swap(begin[left], begin[right]); + } + // Make sure that non copyable types compile. + const auto& t = to_swap ? begin[left] : begin[right]; + while (i < j) { + std::swap(begin[i], begin[j]); + i++; + j--; + while (comp(begin[i], t)) { + i++; + } + while (comp(t, begin[j])) { + j--; + } + } + + if (to_swap) { + std::swap(begin[left], begin[j]); + } else { + j++; + std::swap(begin[right], begin[j]); + } + + if (j <= k) { + left = j + 1; + } + if (k <= j) { + right = j - 1; + } + } +} + +} // namespace floyd_rivest_detail + +template <class Iter, class Compare> +inline void floyd_rivest_partial_sort(Iter begin, Iter mid, Iter end, + Compare comp) { + if (begin == end) return; + if (begin == mid) return; + using CompType = typename floyd_rivest_detail::CompareRefType<Compare>::type; + using DiffType = typename std::iterator_traits<Iter>::difference_type; + floyd_rivest_detail::floyd_rivest_select_loop<Iter, CompType>( + begin, DiffType{0}, static_cast<DiffType>(end - begin - 1), + static_cast<DiffType>(mid - begin - 1), comp); + // std::sort proved to be better than other sorts because of pivoting. + std::sort<Iter, CompType>(begin, mid, comp); +} + +template <class Iter> +inline void floyd_rivest_partial_sort(Iter begin, Iter mid, Iter end) { + typedef typename std::iterator_traits<Iter>::value_type T; + floyd_rivest_partial_sort(begin, mid, end, std::less<T>()); +} + +template <class Iter, class Compare> +inline void floyd_rivest_select(Iter begin, Iter mid, Iter end, Compare comp) { + if (mid == end) return; + using CompType = typename floyd_rivest_detail::CompareRefType<Compare>::type; + using DiffType = typename std::iterator_traits<Iter>::difference_type; + floyd_rivest_detail::floyd_rivest_select_loop<Iter, CompType>( + begin, DiffType{0}, static_cast<DiffType>(end - begin - 1), + static_cast<DiffType>(mid - begin), comp); +} + +template <class Iter> +inline void floyd_rivest_select(Iter begin, Iter mid, Iter end) { + typedef typename std::iterator_traits<Iter>::value_type T; + floyd_rivest_select(begin, mid, end, std::less<T>()); +} + +} // namespace miniselect |