diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /contrib/libs/libxml/timsort.h | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'contrib/libs/libxml/timsort.h')
-rw-r--r-- | contrib/libs/libxml/timsort.h | 601 |
1 files changed, 601 insertions, 0 deletions
diff --git a/contrib/libs/libxml/timsort.h b/contrib/libs/libxml/timsort.h new file mode 100644 index 0000000000..383584fb35 --- /dev/null +++ b/contrib/libs/libxml/timsort.h @@ -0,0 +1,601 @@ +/* + * Taken from https://github.com/swenson/sort + * Revision: 05fd77bfec049ce8b7c408c4d3dd2d51ee061a15 + * Removed all code unrelated to Timsort and made minor adjustments for + * cross-platform compatibility. + */ + +/* + * The MIT License (MIT) + * + * Copyright (c) 2010-2017 Christopher Swenson. + * Copyright (c) 2012 Vojtech Fried. + * Copyright (c) 2012 Google Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#ifdef HAVE_STDINT_H +#include <stdint.h> +#elif defined(_WIN32) +typedef unsigned __int64 uint64_t; +#endif + +#ifndef SORT_NAME +#error "Must declare SORT_NAME" +#endif + +#ifndef SORT_TYPE +#error "Must declare SORT_TYPE" +#endif + +#ifndef SORT_CMP +#define SORT_CMP(x, y) ((x) < (y) ? -1 : ((x) == (y) ? 0 : 1)) +#endif + +#ifndef TIM_SORT_STACK_SIZE +#define TIM_SORT_STACK_SIZE 128 +#endif + +#define SORT_SWAP(x,y) {SORT_TYPE __SORT_SWAP_t = (x); (x) = (y); (y) = __SORT_SWAP_t;} + + +/* Common, type-agnostic functions and constants that we don't want to declare twice. */ +#ifndef SORT_COMMON_H +#define SORT_COMMON_H + +#ifndef MAX +#define MAX(x,y) (((x) > (y) ? (x) : (y))) +#endif + +#ifndef MIN +#define MIN(x,y) (((x) < (y) ? (x) : (y))) +#endif + +static int compute_minrun(const uint64_t); + +#ifndef CLZ +#if defined(__GNUC__) && ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ > 3)) +#define CLZ __builtin_clzll +#else + +static int clzll(uint64_t); + +/* adapted from Hacker's Delight */ +static int clzll(uint64_t x) { + int n; + + if (x == 0) { + return 64; + } + + n = 0; + + if (x <= 0x00000000FFFFFFFFL) { + n = n + 32; + x = x << 32; + } + + if (x <= 0x0000FFFFFFFFFFFFL) { + n = n + 16; + x = x << 16; + } + + if (x <= 0x00FFFFFFFFFFFFFFL) { + n = n + 8; + x = x << 8; + } + + if (x <= 0x0FFFFFFFFFFFFFFFL) { + n = n + 4; + x = x << 4; + } + + if (x <= 0x3FFFFFFFFFFFFFFFL) { + n = n + 2; + x = x << 2; + } + + if (x <= 0x7FFFFFFFFFFFFFFFL) { + n = n + 1; + } + + return n; +} + +#define CLZ clzll +#endif +#endif + +static __inline int compute_minrun(const uint64_t size) { + const int top_bit = 64 - CLZ(size); + const int shift = MAX(top_bit, 6) - 6; + const int minrun = size >> shift; + const uint64_t mask = (1ULL << shift) - 1; + + if (mask & size) { + return minrun + 1; + } + + return minrun; +} + +#endif /* SORT_COMMON_H */ + +#define SORT_CONCAT(x, y) x ## _ ## y +#define SORT_MAKE_STR1(x, y) SORT_CONCAT(x,y) +#define SORT_MAKE_STR(x) SORT_MAKE_STR1(SORT_NAME,x) + +#define BINARY_INSERTION_FIND SORT_MAKE_STR(binary_insertion_find) +#define BINARY_INSERTION_SORT_START SORT_MAKE_STR(binary_insertion_sort_start) +#define BINARY_INSERTION_SORT SORT_MAKE_STR(binary_insertion_sort) +#define REVERSE_ELEMENTS SORT_MAKE_STR(reverse_elements) +#define COUNT_RUN SORT_MAKE_STR(count_run) +#define CHECK_INVARIANT SORT_MAKE_STR(check_invariant) +#define TIM_SORT SORT_MAKE_STR(tim_sort) +#define TIM_SORT_RESIZE SORT_MAKE_STR(tim_sort_resize) +#define TIM_SORT_MERGE SORT_MAKE_STR(tim_sort_merge) +#define TIM_SORT_COLLAPSE SORT_MAKE_STR(tim_sort_collapse) + +#ifndef MAX +#define MAX(x,y) (((x) > (y) ? (x) : (y))) +#endif +#ifndef MIN +#define MIN(x,y) (((x) < (y) ? (x) : (y))) +#endif + +typedef struct { + size_t start; + size_t length; +} TIM_SORT_RUN_T; + + +void BINARY_INSERTION_SORT(SORT_TYPE *dst, const size_t size); +void TIM_SORT(SORT_TYPE *dst, const size_t size); + + +/* Function used to do a binary search for binary insertion sort */ +static __inline size_t BINARY_INSERTION_FIND(SORT_TYPE *dst, const SORT_TYPE x, + const size_t size) { + size_t l, c, r; + SORT_TYPE cx; + l = 0; + r = size - 1; + c = r >> 1; + + /* check for out of bounds at the beginning. */ + if (SORT_CMP(x, dst[0]) < 0) { + return 0; + } else if (SORT_CMP(x, dst[r]) > 0) { + return r; + } + + cx = dst[c]; + + while (1) { + const int val = SORT_CMP(x, cx); + + if (val < 0) { + if (c - l <= 1) { + return c; + } + + r = c; + } else { /* allow = for stability. The binary search favors the right. */ + if (r - c <= 1) { + return c + 1; + } + + l = c; + } + + c = l + ((r - l) >> 1); + cx = dst[c]; + } +} + +/* Binary insertion sort, but knowing that the first "start" entries are sorted. Used in timsort. */ +static void BINARY_INSERTION_SORT_START(SORT_TYPE *dst, const size_t start, const size_t size) { + size_t i; + + for (i = start; i < size; i++) { + size_t j; + SORT_TYPE x; + size_t location; + + /* If this entry is already correct, just move along */ + if (SORT_CMP(dst[i - 1], dst[i]) <= 0) { + continue; + } + + /* Else we need to find the right place, shift everything over, and squeeze in */ + x = dst[i]; + location = BINARY_INSERTION_FIND(dst, x, i); + + for (j = i - 1; j >= location; j--) { + dst[j + 1] = dst[j]; + + if (j == 0) { /* check edge case because j is unsigned */ + break; + } + } + + dst[location] = x; + } +} + +/* Binary insertion sort */ +void BINARY_INSERTION_SORT(SORT_TYPE *dst, const size_t size) { + /* don't bother sorting an array of size <= 1 */ + if (size <= 1) { + return; + } + + BINARY_INSERTION_SORT_START(dst, 1, size); +} + +/* timsort implementation, based on timsort.txt */ + +static __inline void REVERSE_ELEMENTS(SORT_TYPE *dst, size_t start, size_t end) { + while (1) { + if (start >= end) { + return; + } + + SORT_SWAP(dst[start], dst[end]); + start++; + end--; + } +} + +static size_t COUNT_RUN(SORT_TYPE *dst, const size_t start, const size_t size) { + size_t curr; + + if (size - start == 1) { + return 1; + } + + if (start >= size - 2) { + if (SORT_CMP(dst[size - 2], dst[size - 1]) > 0) { + SORT_SWAP(dst[size - 2], dst[size - 1]); + } + + return 2; + } + + curr = start + 2; + + if (SORT_CMP(dst[start], dst[start + 1]) <= 0) { + /* increasing run */ + while (1) { + if (curr == size - 1) { + break; + } + + if (SORT_CMP(dst[curr - 1], dst[curr]) > 0) { + break; + } + + curr++; + } + + return curr - start; + } else { + /* decreasing run */ + while (1) { + if (curr == size - 1) { + break; + } + + if (SORT_CMP(dst[curr - 1], dst[curr]) <= 0) { + break; + } + + curr++; + } + + /* reverse in-place */ + REVERSE_ELEMENTS(dst, start, curr - 1); + return curr - start; + } +} + +static int CHECK_INVARIANT(TIM_SORT_RUN_T *stack, const int stack_curr) { + size_t A, B, C; + + if (stack_curr < 2) { + return 1; + } + + if (stack_curr == 2) { + const size_t A1 = stack[stack_curr - 2].length; + const size_t B1 = stack[stack_curr - 1].length; + + if (A1 <= B1) { + return 0; + } + + return 1; + } + + A = stack[stack_curr - 3].length; + B = stack[stack_curr - 2].length; + C = stack[stack_curr - 1].length; + + if ((A <= B + C) || (B <= C)) { + return 0; + } + + return 1; +} + +typedef struct { + size_t alloc; + SORT_TYPE *storage; +} TEMP_STORAGE_T; + +static void TIM_SORT_RESIZE(TEMP_STORAGE_T *store, const size_t new_size) { + if (store->alloc < new_size) { + SORT_TYPE *tempstore = (SORT_TYPE *)realloc(store->storage, new_size * sizeof(SORT_TYPE)); + + if (tempstore == NULL) { + fprintf(stderr, "Error allocating temporary storage for tim sort: need %lu bytes", + (unsigned long)(sizeof(SORT_TYPE) * new_size)); + exit(1); + } + + store->storage = tempstore; + store->alloc = new_size; + } +} + +static void TIM_SORT_MERGE(SORT_TYPE *dst, const TIM_SORT_RUN_T *stack, const int stack_curr, + TEMP_STORAGE_T *store) { + const size_t A = stack[stack_curr - 2].length; + const size_t B = stack[stack_curr - 1].length; + const size_t curr = stack[stack_curr - 2].start; + SORT_TYPE *storage; + size_t i, j, k; + TIM_SORT_RESIZE(store, MIN(A, B)); + storage = store->storage; + + /* left merge */ + if (A < B) { + memcpy(storage, &dst[curr], A * sizeof(SORT_TYPE)); + i = 0; + j = curr + A; + + for (k = curr; k < curr + A + B; k++) { + if ((i < A) && (j < curr + A + B)) { + if (SORT_CMP(storage[i], dst[j]) <= 0) { + dst[k] = storage[i++]; + } else { + dst[k] = dst[j++]; + } + } else if (i < A) { + dst[k] = storage[i++]; + } else { + break; + } + } + } else { + /* right merge */ + memcpy(storage, &dst[curr + A], B * sizeof(SORT_TYPE)); + i = B; + j = curr + A; + k = curr + A + B; + + while (k > curr) { + k--; + if ((i > 0) && (j > curr)) { + if (SORT_CMP(dst[j - 1], storage[i - 1]) > 0) { + dst[k] = dst[--j]; + } else { + dst[k] = storage[--i]; + } + } else if (i > 0) { + dst[k] = storage[--i]; + } else { + break; + } + } + } +} + +static int TIM_SORT_COLLAPSE(SORT_TYPE *dst, TIM_SORT_RUN_T *stack, int stack_curr, + TEMP_STORAGE_T *store, const size_t size) { + while (1) { + size_t A, B, C, D; + int ABC, BCD, CD; + + /* if the stack only has one thing on it, we are done with the collapse */ + if (stack_curr <= 1) { + break; + } + + /* if this is the last merge, just do it */ + if ((stack_curr == 2) && (stack[0].length + stack[1].length == size)) { + TIM_SORT_MERGE(dst, stack, stack_curr, store); + stack[0].length += stack[1].length; + stack_curr--; + break; + } + /* check if the invariant is off for a stack of 2 elements */ + else if ((stack_curr == 2) && (stack[0].length <= stack[1].length)) { + TIM_SORT_MERGE(dst, stack, stack_curr, store); + stack[0].length += stack[1].length; + stack_curr--; + break; + } else if (stack_curr == 2) { + break; + } + + B = stack[stack_curr - 3].length; + C = stack[stack_curr - 2].length; + D = stack[stack_curr - 1].length; + + if (stack_curr >= 4) { + A = stack[stack_curr - 4].length; + ABC = (A <= B + C); + } else { + ABC = 0; + } + + BCD = (B <= C + D) || ABC; + CD = (C <= D); + + /* Both invariants are good */ + if (!BCD && !CD) { + break; + } + + /* left merge */ + if (BCD && !CD) { + TIM_SORT_MERGE(dst, stack, stack_curr - 1, store); + stack[stack_curr - 3].length += stack[stack_curr - 2].length; + stack[stack_curr - 2] = stack[stack_curr - 1]; + stack_curr--; + } else { + /* right merge */ + TIM_SORT_MERGE(dst, stack, stack_curr, store); + stack[stack_curr - 2].length += stack[stack_curr - 1].length; + stack_curr--; + } + } + + return stack_curr; +} + +static __inline int PUSH_NEXT(SORT_TYPE *dst, + const size_t size, + TEMP_STORAGE_T *store, + const size_t minrun, + TIM_SORT_RUN_T *run_stack, + size_t *stack_curr, + size_t *curr) { + size_t len = COUNT_RUN(dst, *curr, size); + size_t run = minrun; + + if (run > size - *curr) { + run = size - *curr; + } + + if (run > len) { + BINARY_INSERTION_SORT_START(&dst[*curr], len, run); + len = run; + } + + run_stack[*stack_curr].start = *curr; + run_stack[*stack_curr].length = len; + (*stack_curr)++; + *curr += len; + + if (*curr == size) { + /* finish up */ + while (*stack_curr > 1) { + TIM_SORT_MERGE(dst, run_stack, *stack_curr, store); + run_stack[*stack_curr - 2].length += run_stack[*stack_curr - 1].length; + (*stack_curr)--; + } + + if (store->storage != NULL) { + free(store->storage); + store->storage = NULL; + } + + return 0; + } + + return 1; +} + +void TIM_SORT(SORT_TYPE *dst, const size_t size) { + size_t minrun; + TEMP_STORAGE_T _store, *store; + TIM_SORT_RUN_T run_stack[TIM_SORT_STACK_SIZE]; + size_t stack_curr = 0; + size_t curr = 0; + + /* don't bother sorting an array of size 1 */ + if (size <= 1) { + return; + } + + if (size < 64) { + BINARY_INSERTION_SORT(dst, size); + return; + } + + /* compute the minimum run length */ + minrun = compute_minrun(size); + /* temporary storage for merges */ + store = &_store; + store->alloc = 0; + store->storage = NULL; + + if (!PUSH_NEXT(dst, size, store, minrun, run_stack, &stack_curr, &curr)) { + return; + } + + if (!PUSH_NEXT(dst, size, store, minrun, run_stack, &stack_curr, &curr)) { + return; + } + + if (!PUSH_NEXT(dst, size, store, minrun, run_stack, &stack_curr, &curr)) { + return; + } + + while (1) { + if (!CHECK_INVARIANT(run_stack, stack_curr)) { + stack_curr = TIM_SORT_COLLAPSE(dst, run_stack, stack_curr, store, size); + continue; + } + + if (!PUSH_NEXT(dst, size, store, minrun, run_stack, &stack_curr, &curr)) { + return; + } + } +} + +#undef SORT_CONCAT +#undef SORT_MAKE_STR1 +#undef SORT_MAKE_STR +#undef SORT_NAME +#undef SORT_TYPE +#undef SORT_CMP +#undef TEMP_STORAGE_T +#undef TIM_SORT_RUN_T +#undef PUSH_NEXT +#undef SORT_SWAP +#undef SORT_CONCAT +#undef SORT_MAKE_STR1 +#undef SORT_MAKE_STR +#undef BINARY_INSERTION_FIND +#undef BINARY_INSERTION_SORT_START +#undef BINARY_INSERTION_SORT +#undef REVERSE_ELEMENTS +#undef COUNT_RUN +#undef TIM_SORT +#undef TIM_SORT_RESIZE +#undef TIM_SORT_COLLAPSE +#undef TIM_SORT_RUN_T +#undef TEMP_STORAGE_T |