author     vvvv <vvvv@ydb.tech>   2023-07-31 18:21:04 +0300
committer  vvvv <vvvv@ydb.tech>   2023-07-31 18:21:04 +0300
commit     dec41c40e51aa407edef81a3c566a5a15780fc49 (patch)
tree       4f197b596b32f35eca368121f0dff913419da9af /library/cpp
parent     3ca8b54c96e09eb2b65be7f09675623438d559c7 (diff)
download   ydb-dec41c40e51aa407edef81a3c566a5a15780fc49.tar.gz
YQL-16239 Move purecalc to public
Diffstat (limited to 'library/cpp')
286 files changed, 23598 insertions, 0 deletions
diff --git a/library/cpp/CMakeLists.darwin-x86_64.txt b/library/cpp/CMakeLists.darwin-x86_64.txt index 772027a342..5497fd21be 100644 --- a/library/cpp/CMakeLists.darwin-x86_64.txt +++ b/library/cpp/CMakeLists.darwin-x86_64.txt @@ -36,6 +36,9 @@ add_subdirectory(disjoint_sets) add_subdirectory(dns) add_subdirectory(enumbitset) add_subdirectory(execprofile) +add_subdirectory(geo) +add_subdirectory(geobase) +add_subdirectory(geohash) add_subdirectory(getopt) add_subdirectory(grpc) add_subdirectory(histogram) @@ -44,9 +47,11 @@ add_subdirectory(http) add_subdirectory(hyperloglog) add_subdirectory(int128) add_subdirectory(ipmath) +add_subdirectory(ipreg) add_subdirectory(ipv6_address) add_subdirectory(iterator) add_subdirectory(json) +add_subdirectory(langmask) add_subdirectory(lcs) add_subdirectory(lfalloc) add_subdirectory(linear_regression) @@ -55,6 +60,7 @@ add_subdirectory(lua) add_subdirectory(lwtrace) add_subdirectory(malloc) add_subdirectory(messagebus) +add_subdirectory(microbdb) add_subdirectory(mime) add_subdirectory(monlib) add_subdirectory(on_disk) @@ -68,6 +74,8 @@ add_subdirectory(random_provider) add_subdirectory(regex) add_subdirectory(resource) add_subdirectory(retry) +add_subdirectory(reverse_geocoder) +add_subdirectory(robots_txt) add_subdirectory(sanitizer) add_subdirectory(scheme) add_subdirectory(sighandler) @@ -90,6 +98,7 @@ add_subdirectory(unified_agent_client) add_subdirectory(uri) add_subdirectory(xml) add_subdirectory(yaml) +add_subdirectory(yconf) add_subdirectory(yson) add_subdirectory(yson_pull) add_subdirectory(yt) diff --git a/library/cpp/CMakeLists.linux-aarch64.txt b/library/cpp/CMakeLists.linux-aarch64.txt index cd50b0e3a4..5e93629802 100644 --- a/library/cpp/CMakeLists.linux-aarch64.txt +++ b/library/cpp/CMakeLists.linux-aarch64.txt @@ -35,6 +35,9 @@ add_subdirectory(disjoint_sets) add_subdirectory(dns) add_subdirectory(enumbitset) add_subdirectory(execprofile) +add_subdirectory(geo) +add_subdirectory(geobase) +add_subdirectory(geohash) add_subdirectory(getopt) add_subdirectory(grpc) add_subdirectory(histogram) @@ -43,9 +46,11 @@ add_subdirectory(http) add_subdirectory(hyperloglog) add_subdirectory(int128) add_subdirectory(ipmath) +add_subdirectory(ipreg) add_subdirectory(ipv6_address) add_subdirectory(iterator) add_subdirectory(json) +add_subdirectory(langmask) add_subdirectory(lcs) add_subdirectory(lfalloc) add_subdirectory(linear_regression) @@ -54,6 +59,7 @@ add_subdirectory(lua) add_subdirectory(lwtrace) add_subdirectory(malloc) add_subdirectory(messagebus) +add_subdirectory(microbdb) add_subdirectory(mime) add_subdirectory(monlib) add_subdirectory(on_disk) @@ -67,6 +73,8 @@ add_subdirectory(random_provider) add_subdirectory(regex) add_subdirectory(resource) add_subdirectory(retry) +add_subdirectory(reverse_geocoder) +add_subdirectory(robots_txt) add_subdirectory(sanitizer) add_subdirectory(scheme) add_subdirectory(sighandler) @@ -89,6 +97,7 @@ add_subdirectory(unified_agent_client) add_subdirectory(uri) add_subdirectory(xml) add_subdirectory(yaml) +add_subdirectory(yconf) add_subdirectory(yson) add_subdirectory(yson_pull) add_subdirectory(yt) diff --git a/library/cpp/CMakeLists.linux-x86_64.txt b/library/cpp/CMakeLists.linux-x86_64.txt index 772027a342..5497fd21be 100644 --- a/library/cpp/CMakeLists.linux-x86_64.txt +++ b/library/cpp/CMakeLists.linux-x86_64.txt @@ -36,6 +36,9 @@ add_subdirectory(disjoint_sets) add_subdirectory(dns) add_subdirectory(enumbitset) add_subdirectory(execprofile) +add_subdirectory(geo) +add_subdirectory(geobase) 
+add_subdirectory(geohash) add_subdirectory(getopt) add_subdirectory(grpc) add_subdirectory(histogram) @@ -44,9 +47,11 @@ add_subdirectory(http) add_subdirectory(hyperloglog) add_subdirectory(int128) add_subdirectory(ipmath) +add_subdirectory(ipreg) add_subdirectory(ipv6_address) add_subdirectory(iterator) add_subdirectory(json) +add_subdirectory(langmask) add_subdirectory(lcs) add_subdirectory(lfalloc) add_subdirectory(linear_regression) @@ -55,6 +60,7 @@ add_subdirectory(lua) add_subdirectory(lwtrace) add_subdirectory(malloc) add_subdirectory(messagebus) +add_subdirectory(microbdb) add_subdirectory(mime) add_subdirectory(monlib) add_subdirectory(on_disk) @@ -68,6 +74,8 @@ add_subdirectory(random_provider) add_subdirectory(regex) add_subdirectory(resource) add_subdirectory(retry) +add_subdirectory(reverse_geocoder) +add_subdirectory(robots_txt) add_subdirectory(sanitizer) add_subdirectory(scheme) add_subdirectory(sighandler) @@ -90,6 +98,7 @@ add_subdirectory(unified_agent_client) add_subdirectory(uri) add_subdirectory(xml) add_subdirectory(yaml) +add_subdirectory(yconf) add_subdirectory(yson) add_subdirectory(yson_pull) add_subdirectory(yt) diff --git a/library/cpp/CMakeLists.windows-x86_64.txt b/library/cpp/CMakeLists.windows-x86_64.txt index 772027a342..5497fd21be 100644 --- a/library/cpp/CMakeLists.windows-x86_64.txt +++ b/library/cpp/CMakeLists.windows-x86_64.txt @@ -36,6 +36,9 @@ add_subdirectory(disjoint_sets) add_subdirectory(dns) add_subdirectory(enumbitset) add_subdirectory(execprofile) +add_subdirectory(geo) +add_subdirectory(geobase) +add_subdirectory(geohash) add_subdirectory(getopt) add_subdirectory(grpc) add_subdirectory(histogram) @@ -44,9 +47,11 @@ add_subdirectory(http) add_subdirectory(hyperloglog) add_subdirectory(int128) add_subdirectory(ipmath) +add_subdirectory(ipreg) add_subdirectory(ipv6_address) add_subdirectory(iterator) add_subdirectory(json) +add_subdirectory(langmask) add_subdirectory(lcs) add_subdirectory(lfalloc) add_subdirectory(linear_regression) @@ -55,6 +60,7 @@ add_subdirectory(lua) add_subdirectory(lwtrace) add_subdirectory(malloc) add_subdirectory(messagebus) +add_subdirectory(microbdb) add_subdirectory(mime) add_subdirectory(monlib) add_subdirectory(on_disk) @@ -68,6 +74,8 @@ add_subdirectory(random_provider) add_subdirectory(regex) add_subdirectory(resource) add_subdirectory(retry) +add_subdirectory(reverse_geocoder) +add_subdirectory(robots_txt) add_subdirectory(sanitizer) add_subdirectory(scheme) add_subdirectory(sighandler) @@ -90,6 +98,7 @@ add_subdirectory(unified_agent_client) add_subdirectory(uri) add_subdirectory(xml) add_subdirectory(yaml) +add_subdirectory(yconf) add_subdirectory(yson) add_subdirectory(yson_pull) add_subdirectory(yt) diff --git a/library/cpp/containers/CMakeLists.txt b/library/cpp/containers/CMakeLists.txt index 43fcbe8346..40f5013867 100644 --- a/library/cpp/containers/CMakeLists.txt +++ b/library/cpp/containers/CMakeLists.txt @@ -20,5 +20,6 @@ add_subdirectory(ring_buffer) add_subdirectory(sorted_vector) add_subdirectory(stack_array) add_subdirectory(stack_vector) +add_subdirectory(str_hash) add_subdirectory(str_map) add_subdirectory(top_keeper) diff --git a/library/cpp/containers/str_hash/CMakeLists.darwin-x86_64.txt b/library/cpp/containers/str_hash/CMakeLists.darwin-x86_64.txt new file mode 100644 index 0000000000..627814f0ed --- /dev/null +++ b/library/cpp/containers/str_hash/CMakeLists.darwin-x86_64.txt @@ -0,0 +1,19 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. 
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-containers-str_hash) +target_link_libraries(cpp-containers-str_hash PUBLIC + contrib-libs-cxxsupp + yutil + library-cpp-charset + cpp-containers-str_map +) +target_sources(cpp-containers-str_hash PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/containers/str_hash/str_hash.cpp +) diff --git a/library/cpp/containers/str_hash/CMakeLists.linux-aarch64.txt b/library/cpp/containers/str_hash/CMakeLists.linux-aarch64.txt new file mode 100644 index 0000000000..cd723cbea2 --- /dev/null +++ b/library/cpp/containers/str_hash/CMakeLists.linux-aarch64.txt @@ -0,0 +1,20 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-containers-str_hash) +target_link_libraries(cpp-containers-str_hash PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil + library-cpp-charset + cpp-containers-str_map +) +target_sources(cpp-containers-str_hash PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/containers/str_hash/str_hash.cpp +) diff --git a/library/cpp/containers/str_hash/CMakeLists.linux-x86_64.txt b/library/cpp/containers/str_hash/CMakeLists.linux-x86_64.txt new file mode 100644 index 0000000000..cd723cbea2 --- /dev/null +++ b/library/cpp/containers/str_hash/CMakeLists.linux-x86_64.txt @@ -0,0 +1,20 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-containers-str_hash) +target_link_libraries(cpp-containers-str_hash PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil + library-cpp-charset + cpp-containers-str_map +) +target_sources(cpp-containers-str_hash PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/containers/str_hash/str_hash.cpp +) diff --git a/library/cpp/containers/str_hash/CMakeLists.txt b/library/cpp/containers/str_hash/CMakeLists.txt new file mode 100644 index 0000000000..f8b31df0c1 --- /dev/null +++ b/library/cpp/containers/str_hash/CMakeLists.txt @@ -0,0 +1,17 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. 
+ + +if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-aarch64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + include(CMakeLists.darwin-x86_64.txt) +elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA) + include(CMakeLists.windows-x86_64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-x86_64.txt) +endif() diff --git a/library/cpp/containers/str_hash/CMakeLists.windows-x86_64.txt b/library/cpp/containers/str_hash/CMakeLists.windows-x86_64.txt new file mode 100644 index 0000000000..627814f0ed --- /dev/null +++ b/library/cpp/containers/str_hash/CMakeLists.windows-x86_64.txt @@ -0,0 +1,19 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-containers-str_hash) +target_link_libraries(cpp-containers-str_hash PUBLIC + contrib-libs-cxxsupp + yutil + library-cpp-charset + cpp-containers-str_map +) +target_sources(cpp-containers-str_hash PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/containers/str_hash/str_hash.cpp +) diff --git a/library/cpp/containers/str_hash/str_hash.cpp b/library/cpp/containers/str_hash/str_hash.cpp new file mode 100644 index 0000000000..1298638533 --- /dev/null +++ b/library/cpp/containers/str_hash/str_hash.cpp @@ -0,0 +1,60 @@ +#include "str_hash.h" + +#include <library/cpp/charset/ci_string.h> +#include <util/stream/output.h> +#include <util/stream/input.h> + +HashSet::HashSet(const char** array, size_type size) { + Resize(size); + while (*array && **array) + AddPermanent(*array++); +} + +void HashSet::Read(IInputStream* input) { + TString s; + + while (input->ReadLine(s)) { + AddUniq(TCiString(s).c_str()); + } +} + +void HashSet::Write(IOutputStream* output) const { + for (const auto& it : *this) { + *output << it.first << "\n"; + } +} + +#ifdef TEST_STRHASH +#include <ctime> +#include <fstream> +#include <cstdio> +#include <cstdlib> + +using namespace std; + +int main(int argc, char* argv[]) { + if (argc < 2) { + printf("usage: stoplist <stop-words file ...\n"); + exit(EXIT_FAILURE); // FreeBSD: EX_USAGE + } + Hash hash; + hash.Read(cin); + for (--argc, ++argv; argc > 0; --argc, ++argv) { + ifstream input(argv[0]); + if (!input.good()) { + perror(argv[0]); + continue; + } + TCiString s; + while (input >> s) { + if (!hash.Has(s)) + cout << s << "\n"; + else + cout << "[[" << s << "]]" + << "\n"; + } + } + return EXIT_SUCCESS; // EX_OK +} + +#endif diff --git a/library/cpp/containers/str_hash/str_hash.h b/library/cpp/containers/str_hash/str_hash.h new file mode 100644 index 0000000000..25f960dbb5 --- /dev/null +++ b/library/cpp/containers/str_hash/str_hash.h @@ -0,0 +1,181 @@ +#pragma once + +#include <library/cpp/containers/str_map/str_map.h> +#include <library/cpp/charset/ci_string.h> +#include <util/system/yassert.h> +#include <util/memory/tempbuf.h> + +#include <memory> + +class IInputStream; +class IOutputStream; + +template <class T, class Alloc = std::allocator<const char*>> +class Hash; + +struct yvoid { + yvoid() = default; +}; + +template <typename 
T, class Alloc> +class Hash: public string_hash<T, ci_hash, ci_equal_to, Alloc> { + using ci_string_hash = string_hash<T, ci_hash, ci_equal_to, Alloc>; + +protected: + using ci_string_hash::pool; + +public: + using size_type = typename ci_string_hash::size_type; + using const_iterator = typename ci_string_hash::const_iterator; + using iterator = typename ci_string_hash::iterator; + using value_type = typename ci_string_hash::value_type; + using ci_string_hash::begin; + using ci_string_hash::end; + using ci_string_hash::find; + using ci_string_hash::size; + + Hash() + : ci_string_hash() + { + } + explicit Hash(size_type theSize) + : ci_string_hash(theSize, theSize * AVERAGEWORD_BUF) + { + } + Hash(const char** strings, size_type size = 0, T* = 0); // must end with NULL or "\0" + virtual ~Hash(); + bool Has(const char* s, size_t len, T* pp = nullptr) const; + bool Has(const char* s, T* pp = nullptr) const { + const_iterator it; + if ((it = find(s)) == end()) + return false; + else if (pp) + *pp = (*it).second; + return true; + } + void Add(const char* s, T data) { + // in fact it is the same insert_unique as in AddUnique. + // it's impossible to have _FAST_ version of insert() in 'hash_map' + + // you have to use 'hash_mmap' to get the _kind_ of desired effect. + // BUT still there will be "Checks" inside - + // to make the same keys close to each other (see insert_equal()) + this->insert_copy(s, data); + } + bool AddUniq(const char* s, T data) { + return this->insert_copy(s, data).second; + } + // new function to get rid of allocations completely! -- e.g. in constructors + void AddPermanent(const char* s, T data) { + this->insert(value_type(s, data)); + } + T Detach(const char* s) { + iterator it = find(s); + if (it == end()) + return T(); + T data = (*it).second; + this->erase(it); + return data; + } + size_type NumEntries() const { + return size(); + } + bool ForEach(bool (*func)(const char* key, T data, void* cookie), void* cookie = nullptr); + void Resize(size_type theSize) { + this->reserve(theSize); + // no pool resizing here. + } + virtual void Clear(); + char* Pool() { + if (pool.Size() < 2 || pool.End()[-2] != '\0') + pool.Append("\0", 1); + return pool.Begin(); + } +}; + +template <class T, class Alloc> +Hash<T, Alloc>::Hash(const char** array, size_type theSize, T* data) { + // must end with NULL or "\0" + Y_ASSERT(data != nullptr); + Resize(theSize); + while (*array && **array) + AddPermanent(*array++, *data++); +} + +template <class T, class Alloc> +bool Hash<T, Alloc>::Has(const char* s, size_t len, T* pp) const { + TTempArray<char> buf(len + 1); + char* const allocated = buf.Data(); + memcpy(allocated, s, len); + allocated[len] = '\x00'; + return Has(allocated, pp); +} + +template <class T, class Alloc> +Hash<T, Alloc>::~Hash() { + Clear(); +} + +template <class T, class Alloc> +void Hash<T, Alloc>::Clear() { + ci_string_hash::clear_hash(); // to make the key pool empty +} + +template <class T, class Alloc> +bool Hash<T, Alloc>::ForEach(bool (*func)(const char* key, T data, void* cookie), void* cookie) { + for (const_iterator it = begin(); it != end(); ++it) + if (!func((*it).first, (*it).second, cookie)) + return false; + return true; +} + +class HashSet: public Hash<yvoid> { +public: + HashSet(const char** array, size_type size = 0); + HashSet() + : Hash<yvoid>() + { + } + void Read(IInputStream* input); + void Write(IOutputStream* output) const; + void Add(const char* s) { + // in fact it is the same insert_unique as in AddUnique. 
+ // it's impossible to have _FAST_ version of insert() in 'hash_map' + + // you have to use 'hash_mmap' to get the _kind_ of desired effect. + // BUT still there will be "Checks" inside - + // to make the same keys close to each other (see insert_equal()) + insert_copy(s, yvoid()); + } + bool AddUniq(const char* s) { + return insert_copy(s, yvoid()).second; + } + // new function to get rid of allocations completely! -- e.g. in constructors + void AddPermanent(const char* s) { + insert(value_type(s, yvoid())); + } +}; + +template <class T, class HashFcn = THash<T>, class EqualKey = TEqualTo<T>, class Alloc = std::allocator<T>> +class TStaticHash: private THashMap<T, T, HashFcn, EqualKey> { +private: + using TBase = THashMap<T, T, HashFcn, EqualKey>; + +public: + TStaticHash(T arr[][2], size_t size) { + TBase::reserve(size); + while (size) { + TBase::insert(typename TBase::value_type(arr[0][0], arr[0][1])); + arr++; + size--; + } + } + T operator[](const T& key) const { // !!! it is not lvalue nor it used to be + typename TBase::const_iterator it = TBase::find(key); + if (it == TBase::end()) + return nullptr; + return it->second; + } +}; + +using TStHash = TStaticHash<const char*, ci_hash, ci_equal_to>; diff --git a/library/cpp/containers/str_hash/ya.make b/library/cpp/containers/str_hash/ya.make new file mode 100644 index 0000000000..f7e24316b9 --- /dev/null +++ b/library/cpp/containers/str_hash/ya.make @@ -0,0 +1,12 @@ +LIBRARY() + +PEERDIR( + library/cpp/charset + library/cpp/containers/str_map +) + +SRCS( + str_hash.cpp +) + +END() diff --git a/library/cpp/deprecated/CMakeLists.txt b/library/cpp/deprecated/CMakeLists.txt index ad818e3662..765ea6aad7 100644 --- a/library/cpp/deprecated/CMakeLists.txt +++ b/library/cpp/deprecated/CMakeLists.txt @@ -8,6 +8,10 @@ add_subdirectory(accessors) add_subdirectory(atomic) +add_subdirectory(autoarray) +add_subdirectory(datafile) add_subdirectory(enum_codegen) +add_subdirectory(fgood) add_subdirectory(kmp) +add_subdirectory(mapped_file) add_subdirectory(split) diff --git a/library/cpp/deprecated/autoarray/CMakeLists.darwin-x86_64.txt b/library/cpp/deprecated/autoarray/CMakeLists.darwin-x86_64.txt new file mode 100644 index 0000000000..f2a246218c --- /dev/null +++ b/library/cpp/deprecated/autoarray/CMakeLists.darwin-x86_64.txt @@ -0,0 +1,17 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-deprecated-autoarray) +target_link_libraries(cpp-deprecated-autoarray PUBLIC + contrib-libs-cxxsupp + yutil +) +target_sources(cpp-deprecated-autoarray PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/autoarray/autoarray.cpp +) diff --git a/library/cpp/deprecated/autoarray/CMakeLists.linux-aarch64.txt b/library/cpp/deprecated/autoarray/CMakeLists.linux-aarch64.txt new file mode 100644 index 0000000000..2411a48cd3 --- /dev/null +++ b/library/cpp/deprecated/autoarray/CMakeLists.linux-aarch64.txt @@ -0,0 +1,18 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). 
These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-deprecated-autoarray) +target_link_libraries(cpp-deprecated-autoarray PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil +) +target_sources(cpp-deprecated-autoarray PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/autoarray/autoarray.cpp +) diff --git a/library/cpp/deprecated/autoarray/CMakeLists.linux-x86_64.txt b/library/cpp/deprecated/autoarray/CMakeLists.linux-x86_64.txt new file mode 100644 index 0000000000..2411a48cd3 --- /dev/null +++ b/library/cpp/deprecated/autoarray/CMakeLists.linux-x86_64.txt @@ -0,0 +1,18 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-deprecated-autoarray) +target_link_libraries(cpp-deprecated-autoarray PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil +) +target_sources(cpp-deprecated-autoarray PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/autoarray/autoarray.cpp +) diff --git a/library/cpp/deprecated/autoarray/CMakeLists.txt b/library/cpp/deprecated/autoarray/CMakeLists.txt new file mode 100644 index 0000000000..f8b31df0c1 --- /dev/null +++ b/library/cpp/deprecated/autoarray/CMakeLists.txt @@ -0,0 +1,17 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-aarch64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + include(CMakeLists.darwin-x86_64.txt) +elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA) + include(CMakeLists.windows-x86_64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-x86_64.txt) +endif() diff --git a/library/cpp/deprecated/autoarray/CMakeLists.windows-x86_64.txt b/library/cpp/deprecated/autoarray/CMakeLists.windows-x86_64.txt new file mode 100644 index 0000000000..f2a246218c --- /dev/null +++ b/library/cpp/deprecated/autoarray/CMakeLists.windows-x86_64.txt @@ -0,0 +1,17 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. 
+ + + +add_library(cpp-deprecated-autoarray) +target_link_libraries(cpp-deprecated-autoarray PUBLIC + contrib-libs-cxxsupp + yutil +) +target_sources(cpp-deprecated-autoarray PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/autoarray/autoarray.cpp +) diff --git a/library/cpp/deprecated/autoarray/README.md b/library/cpp/deprecated/autoarray/README.md new file mode 100644 index 0000000000..1d83147cee --- /dev/null +++ b/library/cpp/deprecated/autoarray/README.md @@ -0,0 +1,3 @@ +Pre-C++11 vector-like container. + +Just use std::vector. If you need to fill your vector with custom-constructed data, use reserve+emplace_back (but make sure that your elements are movable). diff --git a/library/cpp/deprecated/autoarray/autoarray.cpp b/library/cpp/deprecated/autoarray/autoarray.cpp new file mode 100644 index 0000000000..15167f27f6 --- /dev/null +++ b/library/cpp/deprecated/autoarray/autoarray.cpp @@ -0,0 +1 @@ +#include "autoarray.h" diff --git a/library/cpp/deprecated/autoarray/autoarray.h b/library/cpp/deprecated/autoarray/autoarray.h new file mode 100644 index 0000000000..2aa12c5916 --- /dev/null +++ b/library/cpp/deprecated/autoarray/autoarray.h @@ -0,0 +1,264 @@ +#pragma once + +#include <util/system/compat.h> +#include <util/system/yassert.h> +#include <util/system/defaults.h> +#include <util/system/sys_alloc.h> + +#include <util/generic/typetraits.h> +#include <utility> + +#include <new> +#include <util/generic/noncopyable.h> + +struct autoarray_getindex { + autoarray_getindex() = default; +}; + +struct aarr_b0 { + aarr_b0() = default; +}; + +struct aarr_nofill { + aarr_nofill() = default; +}; + +template <typename T> +struct ynd_type_traits { + enum { + empty_destructor = TTypeTraits<T>::IsPod, + }; +}; + +template <class T> +class autoarray : TNonCopyable { +protected: + T* arr; + size_t _size; + +private: + void AllocBuf(size_t siz) { + arr = nullptr; + _size = 0; + if (siz) { + arr = (T*)y_allocate(sizeof(T) * siz); + _size = siz; + } + } + +public: + using value_type = T; + using iterator = T*; + using const_iterator = const T*; + + autoarray() + : arr(nullptr) + , _size(0) + { + } + autoarray(size_t siz) { + AllocBuf(siz); + T* curr = arr; + try { + for (T* end = arr + _size; curr != end; ++curr) + new (curr) T(); + } catch (...) { + for (--curr; curr >= arr; --curr) + curr->~T(); + y_deallocate(arr); + throw; + } + } + template <class A> + explicit autoarray(size_t siz, A& fill) { + AllocBuf(siz); + T* curr = arr; + try { + for (T* end = arr + _size; curr != end; ++curr) + new (curr) T(fill); + } catch (...) { + for (--curr; curr >= arr; --curr) + curr->~T(); + y_deallocate(arr); + throw; + } + } + explicit autoarray(size_t siz, autoarray_getindex) { + AllocBuf(siz); + size_t nCurrent = 0; + try { + for (nCurrent = 0; nCurrent < _size; ++nCurrent) + new (&arr[nCurrent]) T(nCurrent); + } catch (...) { + for (size_t n = 0; n < nCurrent; ++n) + arr[n].~T(); + y_deallocate(arr); + throw; + } + } + explicit autoarray(size_t siz, aarr_b0) { + AllocBuf(siz); + memset(arr, 0, _size * sizeof(T)); + } + explicit autoarray(size_t siz, aarr_nofill) { + AllocBuf(siz); + } + template <class A> + explicit autoarray(const A* fill, size_t siz) { + AllocBuf(siz); + size_t nCurrent = 0; + try { + for (nCurrent = 0; nCurrent < _size; ++nCurrent) + new (&arr[nCurrent]) T(fill[nCurrent]); + } catch (...) 
{ + for (size_t n = 0; n < nCurrent; ++n) + arr[n].~T(); + y_deallocate(arr); + throw; + } + } + template <class A, class B> + explicit autoarray(const A* fill, const B* cfill, size_t siz) { + AllocBuf(siz); + size_t nCurrent = 0; + try { + for (nCurrent = 0; nCurrent < _size; ++nCurrent) + new (&arr[nCurrent]) T(fill[nCurrent], cfill); + } catch (...) { + for (size_t n = 0; n < nCurrent; ++n) + arr[n].~T(); + y_deallocate(arr); + throw; + } + } + template <class A> + explicit autoarray(const A* fill, size_t initsiz, size_t fullsiz) { + AllocBuf(fullsiz); + size_t nCurrent = 0; + try { + for (nCurrent = 0; nCurrent < ((initsiz < _size) ? initsiz : _size); ++nCurrent) + new (&arr[nCurrent]) T(fill[nCurrent]); + for (; nCurrent < _size; ++nCurrent) + new (&arr[nCurrent]) T(); + } catch (...) { + for (size_t n = 0; n < nCurrent; ++n) + arr[n].~T(); + y_deallocate(arr); + throw; + } + } + template <class A> + explicit autoarray(const A* fill, size_t initsiz, size_t fullsiz, const T& dummy) { + AllocBuf(fullsiz); + size_t nCurrent = 0; + try { + for (nCurrent = 0; nCurrent < ((initsiz < _size) ? initsiz : _size); ++nCurrent) + new (&arr[nCurrent]) T(fill[nCurrent]); + for (; nCurrent < _size; ++nCurrent) + new (&arr[nCurrent]) T(dummy); + } catch (...) { + for (size_t n = 0; n < nCurrent; ++n) + arr[n].~T(); + y_deallocate(arr); + throw; + } + } + + template <class... R> + explicit autoarray(size_t siz, R&&... fill) { + AllocBuf(siz); + T* curr = arr; + try { + for (T* end = arr + _size; curr != end; ++curr) + new (curr) T(std::forward<R>(fill)...); + } catch (...) { + for (--curr; curr >= arr; --curr) + curr->~T(); + y_deallocate(arr); + throw; + } + } + ~autoarray() { + if (_size) { + if (!ynd_type_traits<T>::empty_destructor) + for (T *curr = arr, *end = arr + _size; curr != end; ++curr) + curr->~T(); + y_deallocate(arr); + } + } + T& operator[](size_t pos) { + Y_ASSERT(pos < _size); + return arr[pos]; + } + const T& operator[](size_t pos) const { + Y_ASSERT(pos < _size); + return arr[pos]; + } + size_t size() const { + return _size; + } + void swap(autoarray& with) { + T* tmp_arr = arr; + size_t tmp_size = _size; + arr = with.arr; + _size = with._size; + with.arr = tmp_arr; + with._size = tmp_size; + } + void resize(size_t siz) { + autoarray<T> tmp(arr, _size, siz); + swap(tmp); + } + void resize(size_t siz, const T& dummy) { + autoarray<T> tmp(arr, _size, siz, dummy); + swap(tmp); + } + T* rawpointer() { + return arr; + } + const T* operator~() const { + return arr; + } + T* begin() { + return arr; + } + T* end() { + return arr + _size; + } + T& back() { + Y_ASSERT(_size); + return arr[_size - 1]; + } + bool empty() const { + return !_size; + } + bool operator!() const { + return !_size; + } + size_t operator+() const { + return _size; + } + const T* begin() const { + return arr; + } + const T* end() const { + return arr + _size; + } + const T& back() const { + Y_ASSERT(_size); + return arr[_size - 1]; + } + //operator T*() { return arr; } +}; + +template <class T> +inline bool operator==(const autoarray<T>& a, const autoarray<T>& b) { + size_t count = a.size(); + if (count != b.size()) + return false; + for (size_t i = 0; i < count; ++i) { + if (a[i] != b[i]) + return false; + } + return true; +} diff --git a/library/cpp/deprecated/autoarray/ya.make b/library/cpp/deprecated/autoarray/ya.make new file mode 100644 index 0000000000..4b055f8c29 --- /dev/null +++ b/library/cpp/deprecated/autoarray/ya.make @@ -0,0 +1,7 @@ +LIBRARY() + +SRCS( + autoarray.cpp +) + +END() diff --git 
a/library/cpp/deprecated/datafile/CMakeLists.darwin-x86_64.txt b/library/cpp/deprecated/datafile/CMakeLists.darwin-x86_64.txt new file mode 100644 index 0000000000..3f88f788da --- /dev/null +++ b/library/cpp/deprecated/datafile/CMakeLists.darwin-x86_64.txt @@ -0,0 +1,19 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-deprecated-datafile) +target_link_libraries(cpp-deprecated-datafile PUBLIC + contrib-libs-cxxsupp + yutil + cpp-deprecated-mapped_file +) +target_sources(cpp-deprecated-datafile PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/datafile/datafile.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/datafile/loadmode.cpp +) diff --git a/library/cpp/deprecated/datafile/CMakeLists.linux-aarch64.txt b/library/cpp/deprecated/datafile/CMakeLists.linux-aarch64.txt new file mode 100644 index 0000000000..43da9ae45a --- /dev/null +++ b/library/cpp/deprecated/datafile/CMakeLists.linux-aarch64.txt @@ -0,0 +1,20 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-deprecated-datafile) +target_link_libraries(cpp-deprecated-datafile PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil + cpp-deprecated-mapped_file +) +target_sources(cpp-deprecated-datafile PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/datafile/datafile.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/datafile/loadmode.cpp +) diff --git a/library/cpp/deprecated/datafile/CMakeLists.linux-x86_64.txt b/library/cpp/deprecated/datafile/CMakeLists.linux-x86_64.txt new file mode 100644 index 0000000000..43da9ae45a --- /dev/null +++ b/library/cpp/deprecated/datafile/CMakeLists.linux-x86_64.txt @@ -0,0 +1,20 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-deprecated-datafile) +target_link_libraries(cpp-deprecated-datafile PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil + cpp-deprecated-mapped_file +) +target_sources(cpp-deprecated-datafile PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/datafile/datafile.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/datafile/loadmode.cpp +) diff --git a/library/cpp/deprecated/datafile/CMakeLists.txt b/library/cpp/deprecated/datafile/CMakeLists.txt new file mode 100644 index 0000000000..f8b31df0c1 --- /dev/null +++ b/library/cpp/deprecated/datafile/CMakeLists.txt @@ -0,0 +1,17 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. 
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-aarch64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + include(CMakeLists.darwin-x86_64.txt) +elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA) + include(CMakeLists.windows-x86_64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-x86_64.txt) +endif() diff --git a/library/cpp/deprecated/datafile/CMakeLists.windows-x86_64.txt b/library/cpp/deprecated/datafile/CMakeLists.windows-x86_64.txt new file mode 100644 index 0000000000..3f88f788da --- /dev/null +++ b/library/cpp/deprecated/datafile/CMakeLists.windows-x86_64.txt @@ -0,0 +1,19 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-deprecated-datafile) +target_link_libraries(cpp-deprecated-datafile PUBLIC + contrib-libs-cxxsupp + yutil + cpp-deprecated-mapped_file +) +target_sources(cpp-deprecated-datafile PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/datafile/datafile.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/datafile/loadmode.cpp +) diff --git a/library/cpp/deprecated/datafile/README.md b/library/cpp/deprecated/datafile/README.md new file mode 100644 index 0000000000..7f8547108e --- /dev/null +++ b/library/cpp/deprecated/datafile/README.md @@ -0,0 +1,3 @@ +A wrapper on top of some user-defined custom file format. + +Just write your own if you need it. It's going to be way easier than figuring out how to use this one. 
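The datafile.h header added further down in this diff documents the intended usage pattern: declare a POD header struct with a FileSize() method and derive from TDataFile<TFileHeader>. Below is a minimal sketch of that pattern under those assumptions; TMyHeader, TMyIndex and the Magic/DataSize fields are hypothetical names invented for illustration, not part of this commit.

```cpp
// Sketch only: TMyHeader/TMyIndex are hypothetical, shown to illustrate the
// contract described in the datafile.h doc comment added by this commit.
#include <library/cpp/deprecated/datafile/datafile.h>

struct TMyHeader {
    ui64 Magic;
    ui64 DataSize; // payload size, not counting the header itself

    // TDataFile requires a FileSize() method returning the expected total file length.
    size_t FileSize() const {
        return sizeof(TMyHeader) + DataSize;
    }
};

class TMyIndex: public TDataFile<TMyHeader> {
public:
    explicit TMyIndex(const char* fname) {
        // DLM_DEFAULT == DLM_MMAP_PRC | DLM_EXACT_SIZE (see loadmode.h below):
        // mmap the file, precharge it, fail if its size disagrees with the header.
        Load(fname, DLM_DEFAULT);
    }
    const char* Payload() const {
        // Payload starts right after the header; Hdr() is the mapped TMyHeader.
        return reinterpret_cast<const char*>(&Hdr() + 1);
    }
};
```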
diff --git a/library/cpp/deprecated/datafile/datafile.cpp b/library/cpp/deprecated/datafile/datafile.cpp new file mode 100644 index 0000000000..ff93f11c6b --- /dev/null +++ b/library/cpp/deprecated/datafile/datafile.cpp @@ -0,0 +1,42 @@ +#include "datafile.h" + +void TDataFileBase::DoLoad(const char* fname, int loadMode) { + Destroy(); + TFile f(fname, RdOnly); + DoLoad(f, loadMode, nullptr, 0); +} + +void TDataFileBase::DoLoad(TFile& f, int loadMode, void* hdrPtr, size_t hdrSize) { + if (hdrPtr) { + if (loadMode & DLM_EXACT_SIZE && f.GetLength() != (i64)Length) + throw yexception() << f.GetName() << " size does not match its header value"; + } else { + Length = f.GetLength(); + hdrSize = 0; + } + if ((loadMode & DLM_LD_TYPE_MASK) == DLM_READ) { + MemData = TVector<char>(Length); + memcpy(MemData.begin(), hdrPtr, hdrSize); + f.Load(MemData.begin() + hdrSize, Length - hdrSize); + Start = MemData.begin(); + } else { + FileData.init(f); + if (FileData.getSize() < Length) + throw yexception() << f.GetName() << " is smaller than what its header value says"; + if ((loadMode & DLM_LD_TYPE_MASK) == DLM_MMAP_PRC) + FileData.precharge(); + Start = (const char*)FileData.getData(); + } +} + +void TDataFileBase::Destroy() { + TVector<char>().swap(MemData); + FileData.term(); + Start = nullptr; + Length = 0; +} + +void TDataFileBase::Precharge() const { + if (Length && Start == (char*)FileData.getData()) + FileData.precharge(); +} diff --git a/library/cpp/deprecated/datafile/datafile.h b/library/cpp/deprecated/datafile/datafile.h new file mode 100644 index 0000000000..a438baceca --- /dev/null +++ b/library/cpp/deprecated/datafile/datafile.h @@ -0,0 +1,88 @@ +#pragma once + +#include "loadmode.h" + +#include <library/cpp/deprecated/mapped_file/mapped_file.h> + +#include <util/generic/vector.h> +#include <util/system/file.h> +#include <util/system/filemap.h> + +/** Simple helper that allows a file to be either mapped or read into malloc'ed memory. + This behaviour is controlled by EDataLoadMode enum defined in loadmode.h. + Unlike TBlob it provides Precharge() function and simple file size - based integrity check. + + To use this code, inherit your class from TDataFile<TFileHeader>. + TFileHeader must be a pod-type structure with byte layout of the file header. + File must start with that header. + TFileHeader must have FileSize() member function that determines expected file size or + length of data that need to be read from the beginning of file. 
+ */ + +class TDataFileBase { +protected: + TVector<char> MemData; + TMappedFile FileData; + + const char* Start; + size_t Length; + + TDataFileBase() + : Start(nullptr) + , Length(0) + { + } + + void DoLoad(TFile& f, int loadMode, void* hdrPtr, size_t hdrSize); + void DoLoad(const char* fname, int loadMode); // just whole file + void Destroy(); + void swap(TDataFileBase& with) { + MemData.swap(with.MemData); + FileData.swap(with.FileData); + DoSwap(Start, with.Start); + DoSwap(Length, with.Length); + } + +public: + void Precharge() const; +}; + +template <class TFileHeader> +class TDataFile: public TDataFileBase { +protected: + void Load(const char* fname, EDataLoadMode loadMode) { + Destroy(); + TFile f(fname, RdOnly | Seq); + TFileHeader hdr; + f.Load(&hdr, sizeof(hdr)); + Length = hdr.FileSize(); + DoLoad(f, (int)loadMode, &hdr, sizeof(hdr)); + } + const TFileHeader& Hdr() const { + return *(TFileHeader*)Start; + } +}; + +// Use: class TFoo: public TDataFileEx<Foo> {...}; +// Additional requrement: TFileHeader must have Validate(fname) function that throws exception. +// Class TUser itself must have Init(fname) function +// Adds Load() function to your class (TUser) +template <class TUser, class TFileHeader> +class TDataFileEx: public TDataFile<TFileHeader> { +private: + using TBase = TDataFile<TFileHeader>; + TUser& User() const { + return *(TUser*)this; + } + +public: + TDataFileEx(const char* fname, EDataLoadMode loadMode = DLM_DEFAULT) { + if (fname) + Load(fname, loadMode); + } + void Load(const char* fname, EDataLoadMode loadMode = DLM_DEFAULT) { + TBase::Load(fname, loadMode); + TBase::Hdr().Validate(fname); + User().Init(fname); + } +}; diff --git a/library/cpp/deprecated/datafile/loadmode.cpp b/library/cpp/deprecated/datafile/loadmode.cpp new file mode 100644 index 0000000000..a857830326 --- /dev/null +++ b/library/cpp/deprecated/datafile/loadmode.cpp @@ -0,0 +1 @@ +#include "loadmode.h" diff --git a/library/cpp/deprecated/datafile/loadmode.h b/library/cpp/deprecated/datafile/loadmode.h new file mode 100644 index 0000000000..f04054dd64 --- /dev/null +++ b/library/cpp/deprecated/datafile/loadmode.h @@ -0,0 +1,20 @@ +#pragma once + +// It is recommended to support all reasonal value combinations via this enum, +// to let Load() function argument be of EDataLoadMode type, not just int type + +enum EDataLoadMode { + DLM_READ = 0, + DLM_MMAP_PRC = 1, // precharge + DLM_MMAP = 2, // w/o precharge + DLM_MMAP_AUTO_PRC = 3, // precharge automatically (same as DLM_MMAP unless specifically supported) + DLM_LD_TYPE_MASK = 15, + DLM_EXACT_SIZE = 16, // fail if input file is larger than what header says + + DLM_READ_ESZ = DLM_READ | DLM_EXACT_SIZE, + DLM_MMAP_PRC_ESZ = DLM_MMAP_PRC | DLM_EXACT_SIZE, + DLM_MMAP_ESZ = DLM_MMAP | DLM_EXACT_SIZE, + DLM_MMAP_APRC_ESZ = DLM_MMAP_AUTO_PRC | DLM_EXACT_SIZE, + + DLM_DEFAULT = DLM_MMAP_PRC_ESZ, +}; diff --git a/library/cpp/deprecated/datafile/ya.make b/library/cpp/deprecated/datafile/ya.make new file mode 100644 index 0000000000..1ad4fe9bc7 --- /dev/null +++ b/library/cpp/deprecated/datafile/ya.make @@ -0,0 +1,12 @@ +LIBRARY() + +SRCS( + datafile.cpp + loadmode.cpp +) + +PEERDIR( + library/cpp/deprecated/mapped_file +) + +END() diff --git a/library/cpp/deprecated/fgood/CMakeLists.darwin-x86_64.txt b/library/cpp/deprecated/fgood/CMakeLists.darwin-x86_64.txt new file mode 100644 index 0000000000..a82750e559 --- /dev/null +++ b/library/cpp/deprecated/fgood/CMakeLists.darwin-x86_64.txt @@ -0,0 +1,18 @@ + +# This file was generated by the build system 
used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-deprecated-fgood) +target_link_libraries(cpp-deprecated-fgood PUBLIC + contrib-libs-cxxsupp + yutil +) +target_sources(cpp-deprecated-fgood PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/fgood/ffb.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/fgood/fgood.cpp +) diff --git a/library/cpp/deprecated/fgood/CMakeLists.linux-aarch64.txt b/library/cpp/deprecated/fgood/CMakeLists.linux-aarch64.txt new file mode 100644 index 0000000000..52e29348fd --- /dev/null +++ b/library/cpp/deprecated/fgood/CMakeLists.linux-aarch64.txt @@ -0,0 +1,19 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-deprecated-fgood) +target_link_libraries(cpp-deprecated-fgood PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil +) +target_sources(cpp-deprecated-fgood PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/fgood/ffb.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/fgood/fgood.cpp +) diff --git a/library/cpp/deprecated/fgood/CMakeLists.linux-x86_64.txt b/library/cpp/deprecated/fgood/CMakeLists.linux-x86_64.txt new file mode 100644 index 0000000000..52e29348fd --- /dev/null +++ b/library/cpp/deprecated/fgood/CMakeLists.linux-x86_64.txt @@ -0,0 +1,19 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-deprecated-fgood) +target_link_libraries(cpp-deprecated-fgood PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil +) +target_sources(cpp-deprecated-fgood PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/fgood/ffb.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/fgood/fgood.cpp +) diff --git a/library/cpp/deprecated/fgood/CMakeLists.txt b/library/cpp/deprecated/fgood/CMakeLists.txt new file mode 100644 index 0000000000..f8b31df0c1 --- /dev/null +++ b/library/cpp/deprecated/fgood/CMakeLists.txt @@ -0,0 +1,17 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. 
+ + +if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-aarch64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + include(CMakeLists.darwin-x86_64.txt) +elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA) + include(CMakeLists.windows-x86_64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-x86_64.txt) +endif() diff --git a/library/cpp/deprecated/fgood/CMakeLists.windows-x86_64.txt b/library/cpp/deprecated/fgood/CMakeLists.windows-x86_64.txt new file mode 100644 index 0000000000..a82750e559 --- /dev/null +++ b/library/cpp/deprecated/fgood/CMakeLists.windows-x86_64.txt @@ -0,0 +1,18 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-deprecated-fgood) +target_link_libraries(cpp-deprecated-fgood PUBLIC + contrib-libs-cxxsupp + yutil +) +target_sources(cpp-deprecated-fgood PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/fgood/ffb.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/fgood/fgood.cpp +) diff --git a/library/cpp/deprecated/fgood/README.md b/library/cpp/deprecated/fgood/README.md new file mode 100644 index 0000000000..4f66289657 --- /dev/null +++ b/library/cpp/deprecated/fgood/README.md @@ -0,0 +1,15 @@ +Some ancient wrappers on top of FILE*, and some string manupulation functions. + +Alternatives are as follows. + +For TFILEPtr. Use TIFStream or TOFStream if you need IO. For some rare use cases a TFileMap might also do. + +For fput/fget/getline. Use streams API. + +For struct ffb and struct prnstr. Just don't use them. Even if you can figure out what they do. + +For sf family of functions and TLineSplitter. Just use Split* from util/string/split.h + +For TSFReader. Use TMapTsvFile. + +For read_or_die family of functions. Use streams API. 
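The ffb.h header added below declares the sf() splitters and the TSFReader line reader. A small sketch of how TSFReader is driven, based only on the declarations in this commit; "input.tsv" is a placeholder file name, and for new code the README above points to library/cpp/map_text_file and util/string/split.h instead.

```cpp
// Sketch only: exercises the TSFReader interface declared in ffb.h below;
// "input.tsv" is a made-up path.
#include <library/cpp/deprecated/fgood/ffb.h>

#include <util/stream/output.h>

int main() {
    // Tab-separated file; the third argument requires exactly 3 fields per line
    // (NextLine() throws a yexception if the field count differs).
    TSFReader reader("input.tsv", '\t', 3);
    while (reader.NextLine()) {
        // operator int() yields the field count, operator[] a zero-terminated field,
        // Line() the current line number.
        Cout << reader.Line() << ": " << reader[0] << " -> " << reader[2] << Endl;
    }
    return 0;
}
```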
diff --git a/library/cpp/deprecated/fgood/ffb.cpp b/library/cpp/deprecated/fgood/ffb.cpp new file mode 100644 index 0000000000..aa9da861a6 --- /dev/null +++ b/library/cpp/deprecated/fgood/ffb.cpp @@ -0,0 +1,407 @@ +#include "ffb.h" + +#include <util/string/util.h> // str_spn +#include <util/system/compat.h> +#include <util/generic/yexception.h> + +#include <cstdio> +#include <algorithm> + +#include <ctype.h> + +#ifdef _win_ +#include <io.h> +#else +#include <unistd.h> +#endif + +ffb::ffb(FILE* file) + : TFILEPtr(file) +{ + if (file && !isatty(fileno(file)) && BUFSIZ < 512 * 1024) + setvbuf(file, nullptr, _IOFBF, 512 * 1024); +} + +void ffb::operator=(FILE* f) { + TFILEPtr::operator=(f); + if (f && !isatty(fileno(f)) && BUFSIZ < 512 * 1024) + setvbuf(f, nullptr, _IOFBF, 512 * 1024); +} + +void ffb::open(const char* name, const char* mode) { + TFILEPtr::open(name, mode); + if (!isatty(fileno(*this)) && BUFSIZ < 512 * 1024) + setvbuf(*this, nullptr, _IOFBF, 512 * 1024); +} + +int sf(char** fb, char* buf) { //don't want to call sf(fb, buf, 32) + if (!(*buf && *buf != 10)) { + *fb = nullptr; + return 0; + } + int n = 1; + fb[0] = buf; + while (*buf && *buf != 10 && n < 31) { + if (*buf == '\t') { + *buf++ = 0; + fb[n++] = buf; + continue; + } + buf++; + } + if (*buf == 10 && buf[-1] == 13) + buf[-1] = 0; + *buf = 0; + fb[n] = nullptr; + return n; +} + +int sf(char** fb, char* buf, size_t fb_sz) { + if (!(*buf && *buf != 10)) { + *fb = nullptr; + return 0; + } + fb_sz--; + int n = 1; + fb[0] = buf; + while (*buf && *buf != 10 && n < (int)fb_sz) { + if (*buf == '\t') { + *buf++ = 0; + fb[n++] = buf; + continue; + } + buf++; + } + if (*buf == 10 && buf[-1] == 13) + buf[-1] = 0; + *buf = 0; + fb[n] = nullptr; + return n; +} + +inline int sf_blank(char** fb, char* buf, size_t fb_sz) { + while (isspace((ui8)*buf)) + buf++; + if (!*buf) { + *fb = nullptr; + return 0; + } + fb_sz--; + int n = 1; + fb[0] = buf; + while (*buf && *buf != 10 && n < (int)fb_sz) { + if (isspace((ui8)*buf)) { + *buf++ = 0; + while (isspace((ui8)*buf)) + buf++; + if (*buf) + fb[n++] = buf; + continue; + } + buf++; + } + if (*buf == 10 && buf[-1] == 13) + buf[-1] = 0; + *buf = 0; + fb[n] = nullptr; + return n; +} + +int sf(char fs, char** fb, char* buf, size_t fb_sz) { + if (fs == ' ') + return sf_blank(fb, buf, fb_sz); + while (*buf == fs) + buf++; + if (!(*buf && *buf != 10)) { + *fb = nullptr; + return 0; + } + fb_sz--; + int n = 1; + fb[0] = buf; + while (*buf && *buf != 10 && n < (int)fb_sz) { + if (*buf == fs) { + *buf++ = 0; + while (*buf == fs) + buf++; + fb[n++] = buf; + continue; + } + buf++; + } + if (*buf == 10 && buf[-1] == 13) + buf[-1] = 0; + *buf = 0; + fb[n] = nullptr; + return n; +} + +int sf(const char* fs, char** fb, char* buf, size_t fb_sz) { + if (!(*buf && *buf != 10)) { + *fb = nullptr; + return 0; + } + int fs_len = strlen(fs); + fb_sz--; + int n = 1; + fb[0] = buf; + while (*buf && *buf != 10 && n < (int)fb_sz) { + if (*buf == *fs && !strncmp(buf + 1, fs + 1, fs_len - 1)) { + *buf = 0; + buf += fs_len; + fb[n++] = buf; + continue; + } + buf++; + } + if (*buf == 10 && buf[-1] == 13) + buf[-1] = 0; + *buf = 0; + fb[n] = nullptr; + return n; +} + +inline bool is_end(const char* p) { + return !p || !p[0]; +} + +int sf(const char* seps, char* buf, char** fb, size_t fb_sz) { + if (fb_sz < 1 || is_end(buf)) { + *fb = nullptr; + return 0; + } + str_spn sseps(seps); + fb[0] = nullptr; + int n = 0; + // skip leading delimeters + buf = sseps.cbrk(buf); + if (is_end(buf)) + return 0; + // store fields + while (n < 
(int)fb_sz) { + fb[n++] = buf; + // find delimeters + buf = sseps.brk(buf + 1); + if (is_end(buf)) + break; + *buf = 0; + // skip delimiters + buf = sseps.cbrk(buf + 1); + if (is_end(buf)) + break; + } + fb[n] = nullptr; + return n; +} + +void TLineSplitter::operator()(char* p, TVector<char*>& fields) const { + if (!p || !*p) + return; + char* q = p; + while (1) { + p = Sep.brk(p); + if (q && (p - q || !SkipEmpty())) + fields.push_back(q); + q = nullptr; + if (!*p) + break; + if (SepStrLen == 1 || (SepStrLen > 1 && !strncmp(p + 1, SepStr + 1, SepStrLen - 1))) { + *p = 0; + p += SepStrLen; + q = p; + } else + p++; + } +} + +void TLineSplitter::operator()(const char* p, TVector<std::pair<const char*, size_t>>& fields) const { + if (!p || !*p) + return; + const char* q = p; + while (1) { + p = Sep.brk(p); + if (q && (p - q || !SkipEmpty())) + fields.push_back(std::make_pair(q, p - q)); + q = nullptr; + if (!*p) + break; + if (SepStrLen == 1 || (SepStrLen > 1 && !strncmp(p + 1, SepStr + 1, SepStrLen - 1))) { + p += SepStrLen; + q = p; + } else + p++; + } +} + +TSFReader::TSFReader(const char* fname, char sep, i32 nfrq) // if sep == ' ' isspace will be imitated (for compat) + : Split(str_spn(sep == ' ' ? "\t\n\v\f\r " : TString(1, sep).data()), sep == ' ') + , OpenPipe(false) +{ + Open(fname, nfrq); +} + +TSFReader::TSFReader(const char* fname, const char* sep, i32 nfrq) + : Split(sep, false) + , OpenPipe(false) +{ + Open(fname, nfrq); +} + +TSFReader::TSFReader(const char* fname, const TLineSplitter& spl, i32 nfrq) + : Split(spl) + , OpenPipe(false) +{ + Open(fname, nfrq); +} + +void TSFReader::Open(const char* fname, i32 nfrq, size_t vbuf_size) { + FieldsRequired = nfrq; + NF = NR = 0; + + if (IsOpen()) + File.close(); + + if (!fname) + return; + + if (!strcmp(fname, "/dev/stdin")) { + File.assign(stdin, "/dev/stdin"); + } else { + if (OpenPipe) + File.popen(fname, "r"); + else + File.open(fname, "r"); + } + OpenPipe = false; + if (!isatty(fileno(File))) + setvbuf(File, nullptr, _IOFBF, vbuf_size); +} + +void TSFReader::Popen(const char* pname, i32 nfrq, size_t vbuf_size) { + OpenPipe = true; + Open(pname, nfrq, vbuf_size); +} + +bool TSFReader::NextLine(segmented_string_pool* pool) { + size_t line_len = 0; + +#ifdef __FreeBSD__ + char* ptr = fgetln(File, &line_len); + if (!ptr) + return false; + if (!line_len || ptr[line_len - 1] != '\n') { // last line w/o newline + Buf.AssignNoAlias(ptr, line_len); + ptr = Buf.begin(); + } else { + // can safely replace newline with \0 + ptr[line_len - 1] = 0; + --line_len; + } +#else + if (!getline(File, Buf)) + return false; + char* ptr = Buf.begin(); + line_len = Buf.size(); +#endif + if (line_len && ptr[line_len - 1] == '\r') + ptr[line_len - 1] = 0; + + if (pool) { + char* nptr = pool->append(ptr); + Y_ASSERT(!strcmp(ptr, nptr)); + ptr = nptr; + } + + ++NR; + Fields.clear(); + Split(ptr, Fields); + NF = Fields.size(); + + if (FieldsRequired != -1 && FieldsRequired != (int)NF) + ythrow yexception() << File.name() << " line " << NR << ": " << NF << " fields, expected " << FieldsRequired; + + return true; +} + +int prnstr::f(const char* c, ...) 
{ + va_list params; + int n = asize - pos, k; + va_start(params, c); + while ((k = vsnprintf(buf + pos, n, c, params)) >= n) { + n += asize, asize *= 2; + while (k + pos >= n) + n += asize, asize *= 2; + char* t = new char[asize]; + memcpy(t, buf, pos); + delete[] buf; + buf = t; + va_end(params); + va_start(params, c); + } + pos += k; + va_end(params); + return k; +} +int prnstr::s(const char* c, size_t k) { + if (!c) + return 0; + size_t n = asize - pos; + if (k >= n) { + n += asize, asize *= 2; + while (k + pos >= n) + n += asize, asize *= 2; + char* t = new char[asize]; + memcpy(t, buf, pos); + delete[] buf; + buf = t; + } + memcpy(buf + pos, c, k); + pos += k; + buf[pos] = 0; + return k; +} +void prnstr::clear() { + pos = 0; + if (asize > 32768) { + asize = 32768; + delete[] buf; + buf = new char[asize]; + } +} + +void prnstr::swap(prnstr& w) { + std::swap(buf, w.buf); + std::swap(pos, w.pos); + std::swap(asize, w.asize); +} + +FILE* read_or_die(const char* fname) { + FILE* f = fopen(fname, "rb"); + if (!f) + err(1, "%s", fname); + return f; +} +FILE* write_or_die(const char* fname) { + FILE* f = fopen(fname, "wb"); + if (!f) + err(1, "%s", fname); + return f; +} +FILE* fopen_or_die(const char* fname, const char* mode) { + FILE* f = fopen(fname, mode); + if (!f) + err(1, "%s (mode '%s')", fname, mode); + return f; +} + +FILE* fopen_chk(const char* fname, const char* mode) { + FILE* f = fopen(fname, mode); + if (!f) + ythrow yexception() << fname << " (mode '" << mode << "'): " << LastSystemErrorText(); + return f; +} + +void fclose_chk(FILE* f, const char* fname) { + if (fclose(f)) + ythrow yexception() << "file " << fname << ": " << LastSystemErrorText(); +} diff --git a/library/cpp/deprecated/fgood/ffb.h b/library/cpp/deprecated/fgood/ffb.h new file mode 100644 index 0000000000..ca229eb65a --- /dev/null +++ b/library/cpp/deprecated/fgood/ffb.h @@ -0,0 +1,264 @@ +#pragma once + +#include "fgood.h" + +#include <util/string/util.h> // str_spn +#include <util/string/split.h> // str_spn +#include <util/memory/segmented_string_pool.h> +#include <util/generic/string.h> +#include <util/generic/vector.h> +#include <util/generic/noncopyable.h> + +#include <utility> + +#include <cstdarg> +#include <cstring> + +struct ffb: public TFILEPtr { + ffb() { + } + ffb(FILE* file); + ffb(const char* name, const char* mode) { + open(name, mode); + } + void operator=(FILE* f); // take ownership + void open(const char* name, const char* mode); + int f(const char* c, ...) { + va_list args; + va_start(args, c); + return vfprintf(*this, c, args); + } + void s(const char* c) { + fsput(c, strlen(c)); + } + void b(const void* cc, int n) { + fsput((const char*)cc, n); + } + void B(const void* cc, int N) { + fsput((const char*)cc, N); + } + void c(char c) { + fputc(c); + } + void cbe(wchar16 c) { // big endian utf-16 + fputc(char(c >> 8)); //Hi8 + fputc(char(c & 255)); //Lo8 + } + void sbe(const wchar16* c) { + for (; *c; c++) + cbe(*c); + } + void fclose() { + close(); + } +}; + +// split fields of tab-delimited line of text +// here and below fb actual size must be fb_sz + 1 to allow fb[fb_sz] be zero +int sf(char** fb, char* buf, size_t fb_sz); +int sf(char** fb, char* buf /* fb_sz == 32 */); + +// split fields of char-delimited line of text +// Achtung: delim = ' ' imitates awk: initial separators are skipped, +// repeated seps treated as one, all chars less than ' ' treated as separators. 
+int sf(char fs, char** fb, char* buf, size_t fb_sz = 32); + +// split fields of string-delimited line of text (fs is NOT a regexp) +// (usually fs is "@@") +int sf(const char* fs, char** fb, char* buf, size_t fb_sz = 32); + +// split fields of char-delimited line of text, set of char-separators is given +// Achtung: repeated seps treated as one, initial seps are skipped +// newlines are NOT ignored. +int sf(const char* seps, char* buf, char** fb, size_t fb_sz = 32); + +inline char* chomp(char* buf) { + char* c = buf + strlen(buf); + if (c > buf && c[-1] == '\n') { + *--c = 0; +#ifdef _win32_ + if (c > buf && c[-1] == '\r') + *--c = 0; +#endif + } + return buf; +} + +inline char* chomp_cr(char* buf) { + char* c = buf + strlen(buf); + if (c > buf && c[-1] == '\n') + *--c = 0; + if (c > buf && c[-1] == '\r') + *--c = 0; + return buf; +} + +class TLineSplitter { +protected: + enum { // Default: Split string by SepStr + SplitByAnySep = 1, // Split string by Sep + NoEmptyFields = 2 // Skip all empty fields between separators + }; + +private: + ui32 Flags; + const str_spn Sep; // collection of separators + const char* SepStr; // pointer exact string to separate by + size_t SepStrLen; // length of separator string + +public: + TLineSplitter(const char* sep, bool noEmpty) + : Flags(noEmpty ? NoEmptyFields : 0) + , Sep(TString(sep, 1).data()) + , SepStr(sep) + , SepStrLen(strlen(sep)) + { + } + TLineSplitter(const str_spn& sep, bool noEmpty = false) + : Flags(SplitByAnySep | (noEmpty ? NoEmptyFields : 0)) + , Sep(sep) + , SepStr(nullptr) + , SepStrLen(1) + { + } + bool AnySep() const { + return Flags & SplitByAnySep; + } + bool SkipEmpty() const { + return Flags & NoEmptyFields; + } + /// Separates string onto tokens + /// Expecting a zero-terminated string + /// By default returns empty fields between sequential separators + void operator()(char* p, TVector<char*>& fields) const; + /// Same, but for const string - fills vector of pairs (pointer, length) + void operator()(const char* p, TVector<std::pair<const char*, size_t>>& fields) const; +}; + +/** + * Use library/cpp/map_text_file/map_tsv_file.h instead. + */ +class TSFReader { + TString Buf; // buffer used for non-'\n'-terminated string and for non-freebsd work + TLineSplitter Split; + TVector<char*> Fields; + size_t NF; // Fields.size() + size_t NR; + + TFILEPtr File; + + bool OpenPipe; // internal flag that turns open() to popen() + + i32 FieldsRequired; // if != -1, != nf, terminate program + +public: + // char separator + // Achtung: delim = ' ' imitates awk: initial separators are skipped, + // all chars less than ' ' treated as separators. 
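+    // Typical usage with the char-separator constructor declared below
+    // (the file name and field count are placeholders for the example):
+    //   TSFReader in("data.tsv", '\t', 3); // throws if a line has != 3 fields
+    //   while (in.NextLine()) {
+    //       const char* key = in[0];       // first field of the current line
+    //   }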
+ TSFReader(const char* fname = nullptr, char sep = '\t', i32 nf_reqired = -1); + // exact string separator + TSFReader(const char* fname, const char* sep, i32 nf_reqired = -1); + // fully customizable + TSFReader(const char* fname, const TLineSplitter& spl, i32 nf_reqired = -1); + + void Open(const char* fname, i32 nf_reqired = -1, size_t vbufsize = 1u << 21); // use "/dev/stdin" for stdin + void Popen(const char* pname, i32 nf_reqired = -1, size_t vbufsize = 1u << 21); + + bool NextLine(segmented_string_pool* pool = nullptr); + + bool IsOpen() const { + return (FILE*)File != nullptr; + } + bool IsEof() const { + return feof(File); + } + void Close() { + File.close(); + } + void Rewind() { + File.seek(0, SEEK_SET); + } + void Seek(i64 offset, int mode = SEEK_SET) { + File.seek(offset, mode); + } + i64 Tell() const { + return ftell(File); + } + char*& operator[](size_t ind) { + //if (ind >= NF) + // throw yexception("Can't return reference to unexisting field %" PRISZT, ind); + return Fields[ind]; + } + const char* operator[](size_t ind) const { + if (ind >= NF) + return nullptr; + return Fields[ind]; + } + operator int() const { // note: empty input line makes 0 fields + return (int)NF; + } + const char* Name() const { + return File.name().data(); + } + size_t Line() const { + return NR; + } + const TVector<char*>& GetFields() const { + return Fields; + } +}; + +struct prnstr { + char* buf; + int pos; + int asize; + prnstr() + : pos(0) + { + asize = 32; + buf = new char[asize]; + } + explicit prnstr(int asz) + : pos(0) + { + asize = asz; + buf = new char[asize]; + } + int f(const char* c, ...); + int s(const char* c1, const char* c2); + int s(const char* c1, const char* c2, const char* c3); + int s(const char* c, size_t len); + //int s(const char *c); + int s(const char* c) { + return c ? 
s(c, strlen(c)) : 0; + } + int s(const TString& c); + int s_htmesc(const char* c, bool enc_utf = false); + int s_htmesc_w(const char* c); + int c(char c); + int cu(wchar32 c); //for utf-8 + void restart() { + *buf = 0; + pos = 0; + } + const char* operator~() const { + return buf; + } + int operator+() const { + return pos; + } + ~prnstr() { + delete[] buf; + } + void clear(); + void swap(prnstr& w); +}; + +// functions that terminate program upon failure +FILE* read_or_die(const char* fname); +FILE* write_or_die(const char* fname); +FILE* fopen_or_die(const char* fname, const char* mode); + +// functions that throw upon failure +FILE* fopen_chk(const char* fname, const char* mode); +void fclose_chk(FILE* f, const char* fname_dbg); diff --git a/library/cpp/deprecated/fgood/fgood.cpp b/library/cpp/deprecated/fgood/fgood.cpp new file mode 100644 index 0000000000..5d4725bfae --- /dev/null +++ b/library/cpp/deprecated/fgood/fgood.cpp @@ -0,0 +1,70 @@ +#include "fgood.h" + +#include <util/generic/cast.h> +#include <util/string/cast.h> +#include <util/system/fstat.h> + +#ifdef _win32_ +#include <io.h> +#endif + +i64 TFILEPtr::length() const { +#ifdef _win32_ + FHANDLE fd = (FHANDLE)_get_osfhandle(fileno(m_file)); +#else + FHANDLE fd = fileno(m_file); +#endif + i64 rv = GetFileLength(fd); + if (rv < 0) + ythrow yexception() << "TFILEPtr::length() " << Name.data() << ": " << LastSystemErrorText(); + return rv; +} + +FILE* OpenFILEOrFail(const TString& name, const char* mode) { + FILE* res = ::fopen(name.data(), mode); + if (!res) { + ythrow yexception() << "can't open \'" << name << "\' with mode \'" << mode << "\': " << LastSystemErrorText(); + } + return res; +} + +void TFILECloser::Destroy(FILE* file) { + ::fclose(file); +} + +#ifdef _freebsd_ // fgetln +#define getline getline_alt_4test +#endif // _freebsd_ + +bool getline(TFILEPtr& f, TString& s) { + char buf[4096]; + char* buf_ptr; + if (s.capacity() > sizeof(buf)) { + s.resize(s.capacity()); + if ((buf_ptr = fgets(s.begin(), IntegerCast<int>(s.capacity()), f)) == nullptr) + return false; + } else { + if ((buf_ptr = fgets(buf, sizeof(buf), f)) == nullptr) + return false; + } + size_t buf_len = strlen(buf_ptr); + bool line_complete = buf_len && buf_ptr[buf_len - 1] == '\n'; + if (line_complete) + buf_len--; + if (buf_ptr == s.begin()) + s.resize(buf_len); + else + s.AssignNoAlias(buf, buf_len); + if (line_complete) + return true; + while (fgets(buf, sizeof(buf), f)) { + size_t buf_len2 = strlen(buf); + if (buf_len2 && buf[buf_len2 - 1] == '\n') { + buf[buf_len2 - 1] = 0; + s.append(buf, buf_len2 - 1); + return true; + } + s.append(buf, buf_len2); + } + return true; +} diff --git a/library/cpp/deprecated/fgood/fgood.h b/library/cpp/deprecated/fgood/fgood.h new file mode 100644 index 0000000000..0aaf910c0f --- /dev/null +++ b/library/cpp/deprecated/fgood/fgood.h @@ -0,0 +1,328 @@ +#pragma once + +#include <util/system/yassert.h> +#include <util/system/defaults.h> +#include <util/generic/string.h> +#include <util/generic/yexception.h> +#include <util/generic/ptr.h> + +#include "fput.h" + +#include <cstdio> + +#include <fcntl.h> + +#ifdef _unix_ +extern "C" int __ungetc(int, FILE*); +#endif + +#if (!defined(__FreeBSD__) && !defined(__linux__) && !defined(_darwin_) && !defined(_cygwin_)) || defined(_bionic_) +#define feof_unlocked(_stream) feof(_stream) +#define ferror_unlocked(_stream) ferror(_stream) +#endif + +#ifndef _unix_ +#if defined(_MSC_VER) && (_MSC_VER < 1900) +#define getc_unlocked(_stream) (--(_stream)->_cnt >= 0 ? 
0xff & *(_stream)->_ptr++ : _filbuf(_stream)) +#define putc_unlocked(_c, _stream) (--(_stream)->_cnt >= 0 ? 0xff & (*(_stream)->_ptr++ = (char)(_c)) : _flsbuf((_c), (_stream))) +#else +#define getc_unlocked(_stream) getc(_stream) +#define putc_unlocked(_c, _stream) putc(_c, _stream) +#endif +#endif + +inline bool fgood(FILE* f) { + return !feof_unlocked(f) && !ferror_unlocked(f); +} + +#ifdef _win32_ +// These functions will work only with static MSVC runtime linkage. For dynamic linkage, +// fseeki64.c and ftelli64.c from CRT sources should be included in project +extern "C" int __cdecl _fseeki64(FILE*, __int64, int); +extern "C" __int64 __cdecl _ftelli64(FILE*); + +inline i64 ftello(FILE* stream) { + return _ftelli64(stream); +} + +inline int fseeko(FILE* stream, i64 offset, int origin) { + return _fseeki64(stream, offset, origin); +} +#endif + +class TFILEPtr { +private: + enum { SHOULD_CLOSE = 1, + IS_PIPE = 2 }; + FILE* m_file; + int m_Flags; + TString Name; + +public: + TFILEPtr() noexcept { + m_file = nullptr; + m_Flags = 0; + } + TFILEPtr(const TString& name, const char* mode) { + m_file = nullptr; + m_Flags = 0; + open(name, mode); + } + TFILEPtr(const TFILEPtr& src) noexcept { + m_file = src.m_file; + m_Flags = 0; + } + TFILEPtr& operator=(const TFILEPtr& src) { + if (src.m_file != m_file) { + close(); + m_file = src.m_file; + m_Flags = 0; + } + return *this; + } + explicit TFILEPtr(FILE* f) noexcept { // take ownership + m_file = f; + m_Flags = SHOULD_CLOSE; + } + TFILEPtr& operator=(FILE* f) { // take ownership + if (f != m_file) { + close(); + m_file = f; + m_Flags = SHOULD_CLOSE; + } + return *this; + } + const TString& name() const { + return Name; + } + operator FILE*() const noexcept { + return m_file; + } + FILE* operator->() const noexcept { + return m_file; + } + bool operator!() const noexcept { + return m_file == nullptr; + } + bool operator!=(FILE* f) const noexcept { + return m_file != f; + } + bool operator==(FILE* f) const noexcept { + return m_file == f; + } + ~TFILEPtr() { + close(); + } + void Y_PRINTF_FORMAT(2, 3) check(const char* message, ...) const { + if (Y_UNLIKELY(!fgood(m_file))) { + va_list args; + va_start(args, message); + char buf[512]; + vsnprintf(buf, 512, message, args); + // XXX: errno is undefined here + ythrow yexception() << buf << ": " << LastSystemErrorText() << ", " << Name.data() << " at offset " << (i64)ftell(); + } + } + TFILEPtr& assign(FILE* f, const char* name = nullptr) { // take ownership and have a name + *this = f; + if (name) + Name = name; + return *this; + } + void open(const TString& name, const char* mode) { + Y_ASSERT(!name.empty()); + Y_ASSERT(m_file == nullptr); + m_file = ::fopen(name.data(), mode); + if (!m_file) + ythrow yexception() << "can't open \'" << name << "\' with mode \'" << mode << "\': " << LastSystemErrorText(); + m_Flags = SHOULD_CLOSE; + Name = name; + } + void popen(const TString& command, const char* mode) { + Y_ASSERT(!command.empty()); + Y_ASSERT(m_file == nullptr); + m_file = ::popen(command.data(), mode); + if (!m_file) + ythrow yexception() << "can't execute \'" << command << "\' with mode \'" << mode << "\': " << LastSystemErrorText(); + m_Flags = IS_PIPE | SHOULD_CLOSE; + Name = command; + } + void close() { + if (m_file != nullptr && (m_Flags & SHOULD_CLOSE)) { + if ((m_Flags & IS_PIPE) ? 
::pclose(m_file) : ::fclose(m_file)) { + m_file = nullptr; + m_Flags = 0; + if (!UncaughtException()) + ythrow yexception() << "can't close file " << Name.data() << ": " << LastSystemErrorText(); + } + } + m_file = nullptr; + m_Flags = 0; + Name.clear(); + } + size_t write(const void* buffer, size_t size, size_t count) const { + Y_ASSERT(m_file != nullptr); + size_t r = ::fwrite(buffer, size, count, m_file); + check("can't write %lu bytes", (unsigned long)size * count); + return r; + } + size_t read(void* buffer, size_t size, size_t count) const { + Y_ASSERT(m_file != nullptr); + size_t r = ::fread(buffer, size, count, m_file); + if (ferror_unlocked(m_file)) + ythrow yexception() << "can't read " << (unsigned long)size * count << " bytes: " << LastSystemErrorText() << ", " << Name.data() << " at offset " << (i64)ftell(); + return r; + } + char* fgets(char* buffer, int size) const { + Y_ASSERT(m_file != nullptr); + char* r = ::fgets(buffer, size, m_file); + if (ferror_unlocked(m_file)) + ythrow yexception() << "can't read string of maximum size " << size << ": " << LastSystemErrorText() << ", " << Name.data() << " at offset " << (i64)ftell(); + return r; + } + void Y_PRINTF_FORMAT(2, 3) fprintf(const char* format, ...) { + Y_ASSERT(m_file != nullptr); + va_list args; + va_start(args, format); + vfprintf(m_file, format, args); + check("can't write"); + } + void seek(i64 offset, int origin) const { + Y_ASSERT(m_file != nullptr); +#if defined(_unix_) || defined(_win32_) + if (fseeko(m_file, offset, origin) != 0) +#else + Y_ASSERT(offset == (i64)(i32)offset); + if (::fseek(m_file, (long)offset, origin) != 0) +#endif + ythrow yexception() << "can't seek " << Name.data() << " by " << offset << ": " << LastSystemErrorText(); + } + i64 length() const; // uses various system headers -> in fileptr.cpp + + void setDirect() const { +#if !defined(_win_) && !defined(_darwin_) + if (!m_file) + ythrow yexception() << "file not open"; + if (fcntl(fileno(m_file), F_SETFL, O_DIRECT) == -1) + ythrow yexception() << "Cannot set O_DIRECT flag"; +#endif + } + + // for convenience + + i64 ftell() const noexcept { +#if defined(_unix_) || defined(_win32_) + return ftello(m_file); +#else + return ftell(m_file); +#endif + } + bool eof() const noexcept { + Y_ASSERT(m_file != nullptr); + return feof_unlocked(m_file) != 0; + } + int fputc(int c) { + Y_ASSERT(m_file != nullptr); + return putc_unlocked(c, m_file); + } + size_t fputs(const char* buffer) const { + return write(buffer, strlen(buffer), 1); + } + int fgetc() { + Y_ASSERT(m_file != nullptr); + return getc_unlocked(m_file); + } + int ungetc(int c) { + Y_ASSERT(m_file != nullptr); + return ::ungetc(c, m_file); + } + template <class T> + size_t fput(const T& a) { + Y_ASSERT(m_file != nullptr); + return ::fput(m_file, a); + } + template <class T> + size_t fget(T& a) { + Y_ASSERT(m_file != nullptr); + return ::fget(m_file, a); + } + size_t fsput(const char* s, size_t l) { + Y_ASSERT(m_file != nullptr); + return ::fsput(m_file, s, l); + } + size_t fsget(char* s, size_t l) { + Y_ASSERT(m_file != nullptr); + return ::fsget(m_file, s, l); + } + + void fflush() { + ::fflush(m_file); + } + + /* This block contains some TFile/TStream - compatible names */ + size_t Read(void* bufferIn, size_t numBytes) { + size_t r = fsget((char*)bufferIn, numBytes); + if (Y_UNLIKELY(ferror_unlocked(m_file))) + ythrow yexception() << "can't read " << numBytes << " bytes: " << LastSystemErrorText() << ", " << Name << " at offset " << (i64)ftell(); + return r; + } + void Write(const void* 
buffer, size_t numBytes) { + write(buffer, 1, numBytes); + } + i64 Seek(i64 offset, int origin /*SeekDir*/) { + seek(offset, origin); + return ftell(); + } + i64 GetPosition() const noexcept { + return ftell(); + } + i64 GetLength() const noexcept { + return length(); + } + bool ReadLine(TString& st); + + /* Similar to TAutoPtr::Release - return pointer and forget about it. */ + FILE* Release() noexcept { + FILE* result = m_file; + m_file = nullptr; + m_Flags = 0; + Name.clear(); + return result; + } +}; + +inline void fclose(TFILEPtr& F) { + F.close(); +} + +inline void fseek(const TFILEPtr& F, i64 offset, int whence) { + F.seek(offset, whence); +} + +#ifdef _freebsd_ // fgetln +inline bool getline(TFILEPtr& f, TString& s) { + size_t len; + char* buf = fgetln(f, &len); + if (!buf) + return false; + if (len && buf[len - 1] == '\n') + len--; + s.AssignNoAlias(buf, len); + return true; +} +#else +bool getline(TFILEPtr& f, TString& s); +#endif //_freebsd_ + +inline bool TFILEPtr::ReadLine(TString& st) { + return getline(*this, st); +} + +FILE* OpenFILEOrFail(const TString& name, const char* mode); + +//Should be used with THolder +struct TFILECloser { + static void Destroy(FILE* file); +}; + +using TFILEHolder = THolder<FILE, TFILECloser>; diff --git a/library/cpp/deprecated/fgood/fput.h b/library/cpp/deprecated/fgood/fput.h new file mode 100644 index 0000000000..690b06332d --- /dev/null +++ b/library/cpp/deprecated/fgood/fput.h @@ -0,0 +1,79 @@ +#pragma once + +#include <util/system/defaults.h> +#include <util/system/valgrind.h> + +#include <cstdio> + +#ifdef __FreeBSD__ +#include <cstring> + +template <class T> +Y_FORCE_INLINE size_t fput(FILE* F, const T& a) { + if (Y_LIKELY(F->_w >= int(sizeof(a)))) { + memcpy(F->_p, &a, sizeof(a)); + F->_p += sizeof(a); + F->_w -= sizeof(a); + return 1; + } else { + return fwrite(&a, sizeof(a), 1, F); + } +} + +template <class T> +Y_FORCE_INLINE size_t fget(FILE* F, T& a) { + if (Y_LIKELY(F->_r >= int(sizeof(a)))) { + memcpy(&a, F->_p, sizeof(a)); + F->_p += sizeof(a); + F->_r -= sizeof(a); + return 1; + } else { + return fread(&a, sizeof(a), 1, F); + } +} + +inline size_t fsput(FILE* F, const char* s, size_t l) { + VALGRIND_CHECK_READABLE(s, l); + + if ((size_t)F->_w >= l) { + memcpy(F->_p, s, l); + F->_p += l; + F->_w -= l; + return l; + } else { + return fwrite(s, 1, l, F); + } +} + +inline size_t fsget(FILE* F, char* s, size_t l) { + if ((size_t)F->_r >= l) { + memcpy(s, F->_p, l); + F->_p += l; + F->_r -= l; + return l; + } else { + return fread(s, 1, l, F); + } +} +#else +template <class T> +Y_FORCE_INLINE size_t fput(FILE* F, const T& a) { + return fwrite(&a, sizeof(a), 1, F); +} + +template <class T> +Y_FORCE_INLINE size_t fget(FILE* F, T& a) { + return fread(&a, sizeof(a), 1, F); +} + +inline size_t fsput(FILE* F, const char* s, size_t l) { +#ifdef WITH_VALGRIND + VALGRIND_CHECK_READABLE(s, l); +#endif + return fwrite(s, 1, l, F); +} + +inline size_t fsget(FILE* F, char* s, size_t l) { + return fread(s, 1, l, F); +} +#endif diff --git a/library/cpp/deprecated/fgood/ya.make b/library/cpp/deprecated/fgood/ya.make new file mode 100644 index 0000000000..2394f9ad7a --- /dev/null +++ b/library/cpp/deprecated/fgood/ya.make @@ -0,0 +1,8 @@ +LIBRARY() + +SRCS( + ffb.cpp + fgood.cpp +) + +END() diff --git a/library/cpp/deprecated/mapped_file/CMakeLists.darwin-x86_64.txt b/library/cpp/deprecated/mapped_file/CMakeLists.darwin-x86_64.txt new file mode 100644 index 0000000000..a00407491d --- /dev/null +++ 
b/library/cpp/deprecated/mapped_file/CMakeLists.darwin-x86_64.txt @@ -0,0 +1,17 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-deprecated-mapped_file) +target_link_libraries(cpp-deprecated-mapped_file PUBLIC + contrib-libs-cxxsupp + yutil +) +target_sources(cpp-deprecated-mapped_file PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/mapped_file/mapped_file.cpp +) diff --git a/library/cpp/deprecated/mapped_file/CMakeLists.linux-aarch64.txt b/library/cpp/deprecated/mapped_file/CMakeLists.linux-aarch64.txt new file mode 100644 index 0000000000..2bb5db017b --- /dev/null +++ b/library/cpp/deprecated/mapped_file/CMakeLists.linux-aarch64.txt @@ -0,0 +1,18 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-deprecated-mapped_file) +target_link_libraries(cpp-deprecated-mapped_file PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil +) +target_sources(cpp-deprecated-mapped_file PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/mapped_file/mapped_file.cpp +) diff --git a/library/cpp/deprecated/mapped_file/CMakeLists.linux-x86_64.txt b/library/cpp/deprecated/mapped_file/CMakeLists.linux-x86_64.txt new file mode 100644 index 0000000000..2bb5db017b --- /dev/null +++ b/library/cpp/deprecated/mapped_file/CMakeLists.linux-x86_64.txt @@ -0,0 +1,18 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-deprecated-mapped_file) +target_link_libraries(cpp-deprecated-mapped_file PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil +) +target_sources(cpp-deprecated-mapped_file PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/mapped_file/mapped_file.cpp +) diff --git a/library/cpp/deprecated/mapped_file/CMakeLists.txt b/library/cpp/deprecated/mapped_file/CMakeLists.txt new file mode 100644 index 0000000000..f8b31df0c1 --- /dev/null +++ b/library/cpp/deprecated/mapped_file/CMakeLists.txt @@ -0,0 +1,17 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. 
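+# The platform-specific listfiles included below all define the same
+# cpp-deprecated-mapped_file target from mapped_file.cpp; only the linked
+# libraries (e.g. contrib-libs-linux-headers on Linux) differ per platform.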
+ + +if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-aarch64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + include(CMakeLists.darwin-x86_64.txt) +elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA) + include(CMakeLists.windows-x86_64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-x86_64.txt) +endif() diff --git a/library/cpp/deprecated/mapped_file/CMakeLists.windows-x86_64.txt b/library/cpp/deprecated/mapped_file/CMakeLists.windows-x86_64.txt new file mode 100644 index 0000000000..a00407491d --- /dev/null +++ b/library/cpp/deprecated/mapped_file/CMakeLists.windows-x86_64.txt @@ -0,0 +1,17 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-deprecated-mapped_file) +target_link_libraries(cpp-deprecated-mapped_file PUBLIC + contrib-libs-cxxsupp + yutil +) +target_sources(cpp-deprecated-mapped_file PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/mapped_file/mapped_file.cpp +) diff --git a/library/cpp/deprecated/mapped_file/mapped_file.cpp b/library/cpp/deprecated/mapped_file/mapped_file.cpp new file mode 100644 index 0000000000..b0e4511299 --- /dev/null +++ b/library/cpp/deprecated/mapped_file/mapped_file.cpp @@ -0,0 +1,64 @@ +#include "mapped_file.h" + +#include <util/generic/yexception.h> +#include <util/system/defaults.h> +#include <util/system/hi_lo.h> +#include <util/system/filemap.h> + +TMappedFile::TMappedFile(TFileMap* map, const char* dbgName) { + Map_ = map; + i64 len = Map_->Length(); + if (Hi32(len) != 0 && sizeof(size_t) <= sizeof(ui32)) + ythrow yexception() << "File '" << dbgName << "' mapping error: " << len << " too large"; + + Map_->Map(0, static_cast<size_t>(len)); +} + +TMappedFile::TMappedFile(const TFile& file, TFileMap::EOpenMode om, const char* dbgName) + : Map_(nullptr) +{ + init(file, om, dbgName); +} + +void TMappedFile::precharge(size_t off, size_t size) const { + if (!Map_) + return; + + Map_->Precharge(off, size); +} + +void TMappedFile::init(const TString& name) { + THolder<TFileMap> map(new TFileMap(name)); + TMappedFile newFile(map.Get(), name.data()); + Y_UNUSED(map.Release()); + newFile.swap(*this); + newFile.term(); +} + +void TMappedFile::init(const TString& name, size_t length, TFileMap::EOpenMode om) { + THolder<TFileMap> map(new TFileMap(name, length, om)); + TMappedFile newFile(map.Get(), name.data()); + Y_UNUSED(map.Release()); + newFile.swap(*this); + newFile.term(); +} + +void TMappedFile::init(const TFile& file, TFileMap::EOpenMode om, const char* dbgName) { + THolder<TFileMap> map(new TFileMap(file, om)); + TMappedFile newFile(map.Get(), dbgName); + Y_UNUSED(map.Release()); + newFile.swap(*this); + newFile.term(); +} + +void TMappedFile::init(const TString& name, TFileMap::EOpenMode om) { + THolder<TFileMap> map(new TFileMap(name, om)); + TMappedFile newFile(map.Get(), name.data()); + Y_UNUSED(map.Release()); + newFile.swap(*this); + newFile.term(); +} + +void TMappedFile::flush() { + Map_->Flush(); +} diff --git 
a/library/cpp/deprecated/mapped_file/ya.make b/library/cpp/deprecated/mapped_file/ya.make new file mode 100644 index 0000000000..309341f1da --- /dev/null +++ b/library/cpp/deprecated/mapped_file/ya.make @@ -0,0 +1,7 @@ +LIBRARY() + +SRCS( + mapped_file.cpp +) + +END() diff --git a/library/cpp/geo/CMakeLists.darwin-x86_64.txt b/library/cpp/geo/CMakeLists.darwin-x86_64.txt new file mode 100644 index 0000000000..87e48b4a71 --- /dev/null +++ b/library/cpp/geo/CMakeLists.darwin-x86_64.txt @@ -0,0 +1,24 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(library-cpp-geo) +target_link_libraries(library-cpp-geo PUBLIC + contrib-libs-cxxsupp + yutil +) +target_sources(library-cpp-geo PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/geo/bbox.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/geo/geo.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/geo/point.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/geo/polygon.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/geo/load_save_helper.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/geo/size.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/geo/util.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/geo/window.cpp +) diff --git a/library/cpp/geo/CMakeLists.linux-aarch64.txt b/library/cpp/geo/CMakeLists.linux-aarch64.txt new file mode 100644 index 0000000000..cdad35989a --- /dev/null +++ b/library/cpp/geo/CMakeLists.linux-aarch64.txt @@ -0,0 +1,25 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(library-cpp-geo) +target_link_libraries(library-cpp-geo PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil +) +target_sources(library-cpp-geo PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/geo/bbox.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/geo/geo.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/geo/point.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/geo/polygon.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/geo/load_save_helper.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/geo/size.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/geo/util.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/geo/window.cpp +) diff --git a/library/cpp/geo/CMakeLists.linux-x86_64.txt b/library/cpp/geo/CMakeLists.linux-x86_64.txt new file mode 100644 index 0000000000..cdad35989a --- /dev/null +++ b/library/cpp/geo/CMakeLists.linux-x86_64.txt @@ -0,0 +1,25 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. 
+ + + +add_library(library-cpp-geo) +target_link_libraries(library-cpp-geo PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil +) +target_sources(library-cpp-geo PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/geo/bbox.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/geo/geo.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/geo/point.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/geo/polygon.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/geo/load_save_helper.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/geo/size.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/geo/util.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/geo/window.cpp +) diff --git a/library/cpp/geo/CMakeLists.txt b/library/cpp/geo/CMakeLists.txt new file mode 100644 index 0000000000..f8b31df0c1 --- /dev/null +++ b/library/cpp/geo/CMakeLists.txt @@ -0,0 +1,17 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-aarch64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + include(CMakeLists.darwin-x86_64.txt) +elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA) + include(CMakeLists.windows-x86_64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-x86_64.txt) +endif() diff --git a/library/cpp/geo/CMakeLists.windows-x86_64.txt b/library/cpp/geo/CMakeLists.windows-x86_64.txt new file mode 100644 index 0000000000..87e48b4a71 --- /dev/null +++ b/library/cpp/geo/CMakeLists.windows-x86_64.txt @@ -0,0 +1,24 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. 
+ + + +add_library(library-cpp-geo) +target_link_libraries(library-cpp-geo PUBLIC + contrib-libs-cxxsupp + yutil +) +target_sources(library-cpp-geo PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/geo/bbox.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/geo/geo.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/geo/point.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/geo/polygon.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/geo/load_save_helper.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/geo/size.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/geo/util.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/geo/window.cpp +) diff --git a/library/cpp/geo/bbox.cpp b/library/cpp/geo/bbox.cpp new file mode 100644 index 0000000000..aa4258ac22 --- /dev/null +++ b/library/cpp/geo/bbox.cpp @@ -0,0 +1 @@ +#include "bbox.h" diff --git a/library/cpp/geo/bbox.h b/library/cpp/geo/bbox.h new file mode 100644 index 0000000000..7ec7e6f7d6 --- /dev/null +++ b/library/cpp/geo/bbox.h @@ -0,0 +1,59 @@ +#pragma once + +#include <util/generic/utility.h> + +#include "point.h" + +namespace NGeo { + + class TGeoBoundingBox { + public: + TGeoBoundingBox() + + = default; + + TGeoBoundingBox(const TGeoPoint& p1, const TGeoPoint& p2) { + MinX_ = Min(p1.Lon(), p2.Lon()); + MaxX_ = Max(p1.Lon(), p2.Lon()); + MinY_ = Min(p1.Lat(), p2.Lat()); + MaxY_ = Max(p1.Lat(), p2.Lat()); + } + + const double& GetMinX() const { + return MinX_; + } + + const double& GetMaxX() const { + return MaxX_; + } + + const double& GetMinY() const { + return MinY_; + } + + const double& GetMaxY() const { + return MaxY_; + } + + double Width() const { + return MaxX_ - MinX_; + } + + double Height() const { + return MaxY_ - MinY_; + } + + private: + double MinX_{std::numeric_limits<double>::quiet_NaN()}; + double MaxX_{std::numeric_limits<double>::quiet_NaN()}; + double MinY_{std::numeric_limits<double>::quiet_NaN()}; + double MaxY_{std::numeric_limits<double>::quiet_NaN()}; + }; + + inline bool operator==(const TGeoBoundingBox& a, const TGeoBoundingBox& b) { + return a.GetMinX() == b.GetMinX() && + a.GetMinY() == b.GetMinY() && + a.GetMaxX() == b.GetMaxX() && + a.GetMaxY() == b.GetMaxY(); + } +} // namespace NGeo diff --git a/library/cpp/geo/geo.cpp b/library/cpp/geo/geo.cpp new file mode 100644 index 0000000000..37adc5c62c --- /dev/null +++ b/library/cpp/geo/geo.cpp @@ -0,0 +1 @@ +#include "geo.h" diff --git a/library/cpp/geo/geo.h b/library/cpp/geo/geo.h new file mode 100644 index 0000000000..1aebacab5c --- /dev/null +++ b/library/cpp/geo/geo.h @@ -0,0 +1,8 @@ +#pragma once + +#include "bbox.h" +#include "point.h" +#include "polygon.h" +#include "size.h" +#include "util.h" +#include "window.h" diff --git a/library/cpp/geo/load_save_helper.cpp b/library/cpp/geo/load_save_helper.cpp new file mode 100644 index 0000000000..13fa7ac6df --- /dev/null +++ b/library/cpp/geo/load_save_helper.cpp @@ -0,0 +1,49 @@ +#include "load_save_helper.h" +#include <util/stream/input.h> + +void TSerializer<NGeo::TGeoPoint>::Save(IOutputStream* out, const NGeo::TGeoPoint& point) { + double lon = static_cast<double>(point.Lon()); + double lat = static_cast<double>(point.Lat()); + ::Save(out, lon); + ::Save(out, lat); +} + +void TSerializer<NGeo::TGeoPoint>::Load(IInputStream* in, NGeo::TGeoPoint& point) { + double lon = std::numeric_limits<double>::quiet_NaN(); + double lat = std::numeric_limits<double>::quiet_NaN(); + ::Load(in, lon); + ::Load(in, lat); + point = {lon, lat}; +} + +void TSerializer<NGeo::TGeoWindow>::Save(IOutputStream* out, const NGeo::TGeoWindow& window) { + const auto& center = window.GetCenter(); + const auto& size = 
window.GetSize(); + ::Save(out, center); + ::Save(out, size); +} + +void TSerializer<NGeo::TGeoWindow>::Load(IInputStream* in, NGeo::TGeoWindow& window) { + NGeo::TSize size{}; + NGeo::TGeoPoint center{}; + + ::Load(in, center); + ::Load(in, size); + + window = {center, size}; +} + +void TSerializer<NGeo::TSize>::Save(IOutputStream* out, const NGeo::TSize& size) { + double width = static_cast<double>(size.GetWidth()); + double height = static_cast<double>(size.GetHeight()); + ::Save(out, width); + ::Save(out, height); +} + +void TSerializer<NGeo::TSize>::Load(IInputStream* in, NGeo::TSize& size) { + double width = std::numeric_limits<double>::quiet_NaN(); + double height = std::numeric_limits<double>::quiet_NaN(); + ::Load(in, width); + ::Load(in, height); + size = {width, height}; +} diff --git a/library/cpp/geo/load_save_helper.h b/library/cpp/geo/load_save_helper.h new file mode 100644 index 0000000000..4a5fceea18 --- /dev/null +++ b/library/cpp/geo/load_save_helper.h @@ -0,0 +1,23 @@ +#pragma once + +#include <library/cpp/geo/window.h> +#include <util/stream/input.h> +#include <util/ysaveload.h> + +template <> +struct TSerializer<NGeo::TGeoPoint> { + static void Save(IOutputStream*, const NGeo::TGeoPoint&); + static void Load(IInputStream*, NGeo::TGeoPoint&); +}; + +template <> +struct TSerializer<NGeo::TGeoWindow> { + static void Save(IOutputStream*, const NGeo::TGeoWindow&); + static void Load(IInputStream*, NGeo::TGeoWindow&); +}; + +template <> +struct TSerializer<NGeo::TSize> { + static void Save(IOutputStream*, const NGeo::TSize&); + static void Load(IInputStream*, NGeo::TSize&); +}; diff --git a/library/cpp/geo/point.cpp b/library/cpp/geo/point.cpp new file mode 100644 index 0000000000..1d227c967f --- /dev/null +++ b/library/cpp/geo/point.cpp @@ -0,0 +1,146 @@ +#include "point.h" +#include "util.h" + +#include <util/generic/ylimits.h> +#include <util/generic/ymath.h> + +#include <cstdlib> +#include <utility> + +namespace NGeo { + namespace { + bool IsNonDegeneratePoint(double lon, double lat) { + return (MIN_LONGITUDE - WORLD_WIDTH < lon && lon < MAX_LONGITUDE + WORLD_WIDTH) && + (MIN_LATITUDE < lat && lat < MAX_LATITUDE); + } + } // namespace + + float TGeoPoint::Distance(const TGeoPoint& p) const noexcept { + auto dp = p - (*this); + return sqrtf(Sqr(GetWidthAtEquator(dp.GetWidth(), (Lat_ + p.Lat()) * 0.5)) + Sqr(dp.GetHeight())); + } + + bool TGeoPoint::IsPole() const noexcept { + return Lat_ <= MIN_LATITUDE || MAX_LATITUDE <= Lat_; + } + + bool TGeoPoint::IsVisibleOnMap() const noexcept { + return -VISIBLE_LATITUDE_BOUND <= Lat_ && Lat_ <= VISIBLE_LATITUDE_BOUND; + } + + TGeoPoint TGeoPoint::Parse(TStringBuf s, TStringBuf delimiter) { + const auto& [lon, lat] = PairFromString(s, delimiter); + Y_ENSURE_EX(IsNonDegeneratePoint(lon, lat), TBadCastException() << "Invalid point: (" << lon << ", " << lat << ")"); + return {lon, lat}; + } + + TMaybe<TGeoPoint> TGeoPoint::TryParse(TStringBuf s, TStringBuf delimiter) { + std::pair<double, double> lonLat; + if (!TryPairFromString(lonLat, s, delimiter)) { + return {}; + } + if (!IsNonDegeneratePoint(lonLat.first, lonLat.second)) { + return {}; + } + return TGeoPoint(lonLat.first, lonLat.second); + } + + TSize operator-(const TGeoPoint& p1, const TGeoPoint& p2) { + return {p1.Lon() - p2.Lon(), p1.Lat() - p2.Lat()}; + } + + /* + Conversion code was imported from http://wiki.yandex-team.ru/YandexMobile/maps/Algorithm/mapengine/coordtransforms + */ + namespace WGS84 { + /* Isometric to geodetic latitude parameters, default to WGS 84 */ + 
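+        // These series coefficients are consumed by MercatorToLL() below to
+        // recover geodetic latitude from the isometric latitude xphi:
+        //   lat = xphi + ab*sin(2*xphi) + bb*sin(4*xphi) + cb*sin(6*xphi) + db*sin(8*xphi)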
const double ab = 0.00335655146887969400; + const double bb = 0.00000657187271079536; + const double cb = 0.00000001764564338702; + const double db = 0.00000000005328478445; + + const double _a = R; + const double _f = 1.0 / 298.257223563; + const double _b = _a - _f * _a; + const double _e = sqrt(1 - pow(_b / _a, 2)); + const double _e2 = _e * _e; + const double _g = sqrt(1.0 - _e2); + const double _gR2 = _g * R * 2.0; + } // namespace WGS84 + + TGeoPoint MercatorToLL(TMercatorPoint pt) { + using namespace WGS84; + + // Y_ENSURE(pt.IsDefined(), "Point is not defined"); + + /* Isometric latitude*/ + const double xphi = PI / 2.0 - 2.0 * atan(exp(-pt.Y_ / R)); + + double latitude = xphi + ab * sin(2.0 * xphi) + bb * sin(4.0 * xphi) + cb * sin(6.0 * xphi) + db * sin(8.0 * xphi); + double longitude = pt.X_ / R; + + return TGeoPoint{Rad2deg(longitude), Rad2deg(latitude)}; + } + + double GetMercatorY(const TGeoPoint& ll) { + if (Y_UNLIKELY(ll.Lat() == 0.)) { + // shortcut for common case, avoiding floating point errors + return 0.; + } + if (Y_UNLIKELY(ll.Lat() == MIN_LATITUDE)) { + return -std::numeric_limits<double>::infinity(); + } + if (Y_UNLIKELY(ll.Lat() == MAX_LATITUDE)) { + return +std::numeric_limits<double>::infinity(); + } + double lat = Deg2rad(ll.Lat()); + double esinLat = WGS84::_e * sin(lat); + + double tan_temp = tan(PI / 4.e0 + lat / 2.e0); + double pow_temp = pow(tan(PI / 4.e0 + asin(esinLat) / 2), WGS84::_e); + double U = tan_temp / pow_temp; + return WGS84::R * log(U); + } + + TMercatorPoint LLToMercator(TGeoPoint ll) { + // Y_ENSURE(ll.IsValid(), "Point is not defined"); + + // Y_ENSURE(-90. <= ll.Lat() && ll.Lat() <= +90., "Latitude is out of range [-90, 90]"); + + double lon = Deg2rad(ll.Lon()); + double x = WGS84::R * lon; + double y = GetMercatorY(ll); + + return TMercatorPoint{x, y}; + } + + double GeodeticDistance(TGeoPoint p1, TGeoPoint p2) { + using namespace WGS84; + + constexpr double deg2HalfRad = PI / 360.0; + + const double lon1Half = p1.Lon() * deg2HalfRad; + const double lon2Half = p2.Lon() * deg2HalfRad; + + const double lat1Half = p1.Lat() * deg2HalfRad; + const double lat2Half = p2.Lat() * deg2HalfRad; + + const double diffLatHalf = fabs(lat1Half - lat2Half); + const double diffLonHalf = fabs(lon1Half - lon2Half); + + if (diffLatHalf < 0.5e-8 && diffLonHalf < 0.5e-8) { + return 0; + } + + double s = sin(lat1Half + lat2Half); + double s2 = s * s; + double m = _gR2 / (1.0 - _e2 * s2); + + const double w = sin(diffLatHalf); + const double w2 = w * w; + const double cc = Max(1.0 - s2 - w2, 0.0); // cos(lat1Half * 2) * cos(lat2Half * 2) + const double z = sin(diffLonHalf); + + return m * asin(sqrt(w2 + cc * z * z)); + } +} // namespace NGeo diff --git a/library/cpp/geo/point.h b/library/cpp/geo/point.h new file mode 100644 index 0000000000..70c91ab2dd --- /dev/null +++ b/library/cpp/geo/point.h @@ -0,0 +1,198 @@ +#pragma once + +#include <util/generic/string.h> +#include <util/stream/output.h> +#include <util/string/cast.h> +#include <util/generic/maybe.h> + +#include <algorithm> +#include <cmath> + +namespace NGeo { + class TSize; + + class TGeoPoint { + public: + TGeoPoint(double lon, double lat) noexcept + : Lon_(lon) + , Lat_(lat) + { + } + + TGeoPoint() noexcept + : Lon_(BadX) + , Lat_(BadY) + { + } + + double Lon() const noexcept { + return Lon_; + } + + double Lat() const noexcept { + return Lat_; + } + + float Distance(const TGeoPoint& p) const noexcept; + + void swap(TGeoPoint& p) noexcept { + std::swap(Lon_, p.Lon_); + std::swap(Lat_, p.Lat_); + } + + 
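+        // A default-constructed point carries the BadX/BadY sentinels, so the
+        // check below distinguishes it from any point built from real coordinates.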
bool IsValid() const { + return (Lon_ != BadX) && (Lat_ != BadY); + } + + /// Returns true if the point represents either North or South Pole + bool IsPole() const noexcept; + + /// Returns true if the point may be shown on the Yandex Map (fits into the valid range of latitudes) + bool IsVisibleOnMap() const noexcept; + + bool operator!() const { + return !IsValid(); + } + + TString ToCgiStr() const { + return ToString(); + } + + TString ToString(const char* delimiter = ",") const { + return TString::Join(::ToString(Lon_), delimiter, ::ToString(Lat_)); + } + + /** + * \note Parsing functions work is safe way. They discard invalid points: + * 1) on the Poles and 'beyond' the Poles; + * 2) not belonging to the 'main' world and +/-1 world to the left or to the right. + * If you need such cases, construct the TGeoPoint manually. + */ + + /// Throws TBadCastException on error + static TGeoPoint Parse(TStringBuf s, TStringBuf delimiter = TStringBuf(",")); + + /// Returns Nothing() on error + static TMaybe<TGeoPoint> TryParse(TStringBuf s, TStringBuf delimiter = TStringBuf(",")); + + private: + double Lon_; + double Lat_; + + static constexpr double BadX{361.}; + static constexpr double BadY{181.}; + }; + + double GeodeticDistance(TGeoPoint p1, TGeoPoint p2); + + /** + * \class TMercatorPoint + * + * Represents a point in EPSG:3395 projection + * (WGS 84 / World Mercator) + */ + class TMercatorPoint { + public: + friend class TMercatorWindow; + friend TGeoPoint MercatorToLL(TMercatorPoint); + + /** + * Constructs a point with the given coordinates. + */ + constexpr TMercatorPoint(double x, double y) noexcept + : X_{x} + , Y_{y} + { + } + + /** + * Constructs a point with two NaN coordinates. + * + * Should not be called directly. + * If your `point` variable might be undefined, + * declare it explicitly as TMaybe<TMercatorPoint>. + */ + constexpr TMercatorPoint() noexcept + : X_{std::numeric_limits<double>::quiet_NaN()} + , Y_{std::numeric_limits<double>::quiet_NaN()} + { + } + + /** + * Returns the X_ coordinate. + * + * The line X_ == 0 corresponds to the Prime meridian. + */ + constexpr double X() const noexcept { + return X_; + } + + /** + * Returns the Y_ coordinate. + * + * The line Y_ == 0 corresponds to the Equator. 
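+         * Y grows toward the North Pole; GetMercatorY() in point.cpp maps the
+         * extreme latitudes to +/- infinity.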
+ */ + constexpr double Y() const noexcept { + return Y_; + } + + private: + bool IsDefined() const noexcept { + return !std::isnan(X_) && !std::isnan(Y_); + } + + private: + double X_; + double Y_; + }; + + /** + * Operators + */ + + inline bool operator==(const TGeoPoint& p1, const TGeoPoint& p2) { + return p1.Lon() == p2.Lon() && p1.Lat() == p2.Lat(); + } + + inline bool operator==(const TMercatorPoint& p1, const TMercatorPoint& p2) { + return p1.X() == p2.X() && p1.Y() == p2.Y(); + } + + inline bool operator<(const TGeoPoint& p1, const TGeoPoint& p2) { + if (p1.Lon() != p2.Lon()) { + return p1.Lon() < p2.Lon(); + } + return p1.Lat() < p2.Lat(); + } + + /** + * Conversion + */ + + namespace WGS84 { + /* Radius of reference ellipsoid, default to WGS 84 */ + const double R = 6378137.0; + } // namespace WGS84 + + using TPointLL = TGeoPoint; + using TPointXY = TMercatorPoint; + + TGeoPoint MercatorToLL(TMercatorPoint); + TMercatorPoint LLToMercator(TGeoPoint); + + /** + * Input/output + */ + + TSize operator-(const TGeoPoint& p1, const TGeoPoint& p2); +} // namespace NGeo + +template <> +inline void Out<NGeo::TGeoPoint>(IOutputStream& o, const NGeo::TGeoPoint& p) { + o << '[' << p.Lon() << ", " << p.Lat() << ']'; +} + +template <> +inline void Out<NGeo::TMercatorPoint>(IOutputStream& o, const NGeo::TMercatorPoint& p) { + o << '[' << p.X() << ", " << p.Y() << ']'; +} diff --git a/library/cpp/geo/polygon.cpp b/library/cpp/geo/polygon.cpp new file mode 100644 index 0000000000..44e5c38b5f --- /dev/null +++ b/library/cpp/geo/polygon.cpp @@ -0,0 +1,28 @@ +#include "polygon.h" +namespace NGeo { + TMaybe<TGeoPolygon> TGeoPolygon::TryParse(TStringBuf s, TStringBuf llDelimiter, TStringBuf pointsDelimiter) { + TVector<TGeoPoint> points; + + for (const auto& pointString : StringSplitter(s).SplitByString(pointsDelimiter).SkipEmpty()) { + auto curPoint = TGeoPoint::TryParse(pointString.Token(), llDelimiter); + if (!curPoint) { + return {}; + } + points.push_back(*curPoint); + } + + if (points.size() < 3) { + return {}; + } + + return TGeoPolygon(points); + } + + TGeoPolygon TGeoPolygon::Parse(TStringBuf s, TStringBuf llDelimiter, TStringBuf pointsDelimiter) { + auto res = TGeoPolygon::TryParse(s, llDelimiter, pointsDelimiter); + if (!res) { + ythrow yexception() << "Can't parse polygon from input string: " << s; + } + return *res; + } +} // namespace NGeo diff --git a/library/cpp/geo/polygon.h b/library/cpp/geo/polygon.h new file mode 100644 index 0000000000..1528345fec --- /dev/null +++ b/library/cpp/geo/polygon.h @@ -0,0 +1,90 @@ +#pragma once + +#include "point.h" +#include "window.h" + +#include <util/ysaveload.h> +#include <util/generic/algorithm.h> +#include <util/generic/string.h> +#include <util/generic/vector.h> +#include <util/generic/yexception.h> +#include <util/stream/output.h> +#include <util/string/cast.h> +#include <util/string/join.h> +#include <util/string/split.h> + +#include <algorithm> +#include <functional> + +namespace NGeo { + class TGeoPolygon { + private: + TVector<TGeoPoint> Points_; + TGeoWindow Window_; + + public: + TGeoPolygon() = default; + + explicit TGeoPolygon(const TVector<TGeoPoint>& points) + : Points_(points) + { + CalcWindow(); + } + + const TVector<TGeoPoint>& GetPoints() const { + return Points_; + } + + const TGeoWindow& GetWindow() const { + return Window_; + } + + void swap(TGeoPolygon& o) noexcept { + Points_.swap(o.Points_); + Window_.swap(o.Window_); + } + + bool IsValid() const noexcept { + return !Points_.empty() && Window_.IsValid(); + } + + bool 
operator!() const { + return !IsValid(); + } + + /** + * try to parse TGeoPolygon from string which stores points + * coords are separated by llDelimiter, points are separated by pointsDelimiter + * return parsed TGeoPolygon on success, otherwise throw exception + */ + static TGeoPolygon Parse(TStringBuf s, TStringBuf llDelimiter = ",", TStringBuf pointsDelimiter = TStringBuf(" ")); + + /** + * try to parse TGeoPolygon from string which stores points + * coords are separated by llDelimiter, points are separated by pointsDelimiter + * return TMaybe of parsed TGeoPolygon on success, otherwise return empty TMaybe + */ + static TMaybe<TGeoPolygon> TryParse(TStringBuf s, TStringBuf llDelimiter = ",", TStringBuf pointsDelimiter = TStringBuf(" ")); + + private: + void CalcWindow() { + auto getLon = std::mem_fn(&TGeoPoint::Lon); + double lowerX = MinElementBy(Points_.begin(), Points_.end(), getLon)->Lon(); + double upperX = MaxElementBy(Points_.begin(), Points_.end(), getLon)->Lon(); + + auto getLat = std::mem_fn(&TGeoPoint::Lat); + double lowerY = MinElementBy(Points_.begin(), Points_.end(), getLat)->Lat(); + double upperY = MaxElementBy(Points_.begin(), Points_.end(), getLat)->Lat(); + + Window_ = TGeoWindow{TGeoPoint{lowerX, lowerY}, TGeoPoint{upperX, upperY}}; + } + }; + + inline bool operator==(const TGeoPolygon& p1, const TGeoPolygon& p2) { + return p1.GetPoints() == p2.GetPoints(); + } + + inline bool operator!=(const TGeoPolygon& p1, const TGeoPolygon& p2) { + return !(p1 == p2); + } +} // namespace NGeo diff --git a/library/cpp/geo/size.cpp b/library/cpp/geo/size.cpp new file mode 100644 index 0000000000..f1bd8ab763 --- /dev/null +++ b/library/cpp/geo/size.cpp @@ -0,0 +1,31 @@ +#include "size.h" + +#include "util.h" + +namespace NGeo { + const double TSize::BadWidth = -1.; + const double TSize::BadHeight = -1.; + + namespace { + bool IsNonNegativeSize(double width, double height) { + return width >= 0. 
&& height >= 0.; + } + } // namespace + + TSize TSize::Parse(TStringBuf s, TStringBuf delimiter) { + const auto& [width, height] = PairFromString(s, delimiter); + Y_ENSURE_EX(IsNonNegativeSize(width, height), TBadCastException() << "Negative window size"); + return {width, height}; + } + + TMaybe<TSize> TSize::TryParse(TStringBuf s, TStringBuf delimiter) { + std::pair<double, double> lonLat; + if (!TryPairFromString(lonLat, s, delimiter)) { + return {}; + } + if (!IsNonNegativeSize(lonLat.first, lonLat.second)) { + return {}; + } + return TSize{lonLat.first, lonLat.second}; + } +} // namespace NGeo diff --git a/library/cpp/geo/size.h b/library/cpp/geo/size.h new file mode 100644 index 0000000000..b619c6d899 --- /dev/null +++ b/library/cpp/geo/size.h @@ -0,0 +1,93 @@ +#pragma once + +#include <util/generic/string.h> +#include <util/stream/output.h> +#include <util/string/cast.h> + +namespace NGeo { + class TSize { + public: + TSize(double width, double height) noexcept + : Width_(width) + , Height_(height) + { + } + + explicit TSize(double size) noexcept + : Width_(size) + , Height_(size) + { + } + + TSize() noexcept + : Width_(BadWidth) + , Height_(BadHeight) + { + } + + double GetWidth() const noexcept { + return Width_; + } + + double GetHeight() const noexcept { + return Height_; + } + + void swap(TSize& s) noexcept { + std::swap(Width_, s.Width_); + std::swap(Height_, s.Height_); + } + + bool IsValid() const { + return (Width_ != BadWidth) && (Height_ != BadHeight); + } + + void Stretch(double multiplier) { + Width_ *= multiplier; + Height_ *= multiplier; + } + + void Inflate(double additionX, double additionY) { + Width_ += additionX; + Height_ += additionY; + } + + bool operator!() const { + return !IsValid(); + } + + TString ToCgiStr() const { + TString s = ToString(Width_); + s.append(','); + s.append(ToString(Height_)); + return s; + } + + /** + * try to parse TSize + * return parsed TSize on success, otherwise throw exception + */ + static TSize Parse(TStringBuf s, TStringBuf delimiter = TStringBuf(",")); + + /** + * try to parse TSize + * return TMaybe of parsed TSize on success, otherwise return empty TMaybe + */ + static TMaybe<TSize> TryParse(TStringBuf s, TStringBuf delimiter = TStringBuf(",")); + + private: + double Width_; + double Height_; + static const double BadWidth; + static const double BadHeight; + }; + + inline bool operator==(const TSize& p1, const TSize& p2) { + return p1.GetHeight() == p2.GetHeight() && p1.GetWidth() == p2.GetWidth(); + } +} // namespace NGeo + +template <> +inline void Out<NGeo::TSize>(IOutputStream& o, const NGeo::TSize& s) { + o << '<' << s.GetWidth() << ", " << s.GetHeight() << '>'; +} diff --git a/library/cpp/geo/style/ya.make b/library/cpp/geo/style/ya.make new file mode 100644 index 0000000000..f72d50f27e --- /dev/null +++ b/library/cpp/geo/style/ya.make @@ -0,0 +1,8 @@ +CPP_STYLE_TEST_14() + +STYLE( + library/cpp/geo/**/*.cpp + library/cpp/geo/**/*.h +) + +END() diff --git a/library/cpp/geo/ut/load_save_helper_ut.cpp b/library/cpp/geo/ut/load_save_helper_ut.cpp new file mode 100644 index 0000000000..f251f56630 --- /dev/null +++ b/library/cpp/geo/ut/load_save_helper_ut.cpp @@ -0,0 +1,90 @@ +#include "load_save_helper.h" +#include "point.h" + +#include <library/cpp/testing/unittest/registar.h> +#include <util/stream/str.h> +#include <util/ysaveload.h> + +namespace { + void CheckSave(const NGeo::TGeoPoint& point) { + TStringStream output; + ::Save(&output, point); + TStringStream answer; + ::Save(&answer, static_cast<double>(point.Lon())); 
+ ::Save(&answer, static_cast<double>(point.Lat())); + UNIT_ASSERT_EQUAL(output.Str(), answer.Str()); + } + + void CheckLoad(const double x, const double y) { + TStringStream input; + ::Save(&input, x); + ::Save(&input, y); + NGeo::TGeoPoint output; + ::Load(&input, output); + + const double eps = 1.E-8; + UNIT_ASSERT_DOUBLES_EQUAL(static_cast<double>(output.Lon()), x, eps); + UNIT_ASSERT_DOUBLES_EQUAL(static_cast<double>(output.Lat()), y, eps); + } + + void CheckLoadAfterSavePointLL(double x, double y) { + NGeo::TGeoPoint answer = {x, y}; + TStringStream iostream; + ::Save(&iostream, answer); + NGeo::TGeoPoint output; + ::Load(&iostream, output); + + const double eps = 1.E-8; + UNIT_ASSERT_DOUBLES_EQUAL(static_cast<double>(output.Lon()), x, eps); + UNIT_ASSERT_DOUBLES_EQUAL(static_cast<double>(output.Lat()), y, eps); + } + + void CheckLoadAfterSaveWindowLL(NGeo::TGeoPoint center, NGeo::TSize size) { + NGeo::TGeoWindow answer = {center, size}; + TStringStream iostream; + ::Save(&iostream, answer); + NGeo::TGeoWindow output; + ::Load(&iostream, output); + UNIT_ASSERT_EQUAL(output.GetCenter(), answer.GetCenter()); + UNIT_ASSERT_EQUAL(output.GetSize(), answer.GetSize()); + } +} // namespace + +Y_UNIT_TEST_SUITE(TSaveLoadForPointLL) { + Y_UNIT_TEST(TestSave) { + // {27.561481, 53.902496} Minsk Lon and Lat + CheckSave({27.561481, 53.902496}); + CheckSave({-27.561481, 53.902496}); + CheckSave({27.561481, -53.902496}); + CheckSave({-27.561481, -53.902496}); + } + + Y_UNIT_TEST(TestLoad) { + CheckLoad(27.561481, 53.902496); + CheckLoad(-27.561481, 53.902496); + CheckLoad(27.561481, -53.902496); + CheckLoad(-27.561481, -53.902496); + } + + Y_UNIT_TEST(TestSaveLoad) { + CheckLoadAfterSavePointLL(27.561481, 53.902496); + CheckLoadAfterSavePointLL(-27.561481, 53.902496); + CheckLoadAfterSavePointLL(27.561481, -53.902496); + CheckLoadAfterSavePointLL(-27.561481, -53.902496); + CheckLoadAfterSavePointLL(0, 0); + } +} + +Y_UNIT_TEST_SUITE(TSaveLoadForWindowLL) { + Y_UNIT_TEST(TestSave) { + CheckLoadAfterSaveWindowLL({27.561481, 53.902496}, {1, 2}); + CheckLoadAfterSaveWindowLL({27.561481, 53.902496}, {2, 1}); + CheckLoadAfterSaveWindowLL({-27.561481, 53.902496}, {1, 2}); + CheckLoadAfterSaveWindowLL({-27.561481, 53.902496}, {2, 1}); + CheckLoadAfterSaveWindowLL({27.561481, -53.902496}, {1, 2}); + CheckLoadAfterSaveWindowLL({27.561481, -53.902496}, {2, 1}); + CheckLoadAfterSaveWindowLL({-27.561481, -53.902496}, {1, 2}); + CheckLoadAfterSaveWindowLL({-27.561481, -53.902496}, {2, 1}); + CheckLoadAfterSaveWindowLL({0, 0}, {0, 0}); + } +} diff --git a/library/cpp/geo/ut/point_ut.cpp b/library/cpp/geo/ut/point_ut.cpp new file mode 100644 index 0000000000..bbf8f32cea --- /dev/null +++ b/library/cpp/geo/ut/point_ut.cpp @@ -0,0 +1,171 @@ +#include "point.h" + +#include <library/cpp/testing/unittest/registar.h> + +using namespace NGeo; + +namespace { + void CheckMercator(TGeoPoint input, TMercatorPoint answer, double eps = 1.e-8) { + auto output = LLToMercator(input); + UNIT_ASSERT_DOUBLES_EQUAL(output.X(), answer.X(), eps); + UNIT_ASSERT_DOUBLES_EQUAL(output.Y(), answer.Y(), eps); + } + + void CheckGeo(TMercatorPoint input, TGeoPoint answer, double eps = 1.e-8) { + auto output = MercatorToLL(input); + UNIT_ASSERT_DOUBLES_EQUAL(output.Lon(), answer.Lon(), eps); + UNIT_ASSERT_DOUBLES_EQUAL(output.Lat(), answer.Lat(), eps); + } +} // namespace + +Y_UNIT_TEST_SUITE(TPointTest) { + Y_UNIT_TEST(TestGeoPointFromString) { + UNIT_ASSERT_EQUAL(TGeoPoint::Parse("0.15,0.67"), + TGeoPoint(0.15, 0.67)); + 
UNIT_ASSERT_EQUAL(TGeoPoint::Parse("-52.,-27."), + TGeoPoint(-52., -27.)); + UNIT_ASSERT_EQUAL(TGeoPoint::Parse("0.15 0.67", " "), + TGeoPoint(0.15, 0.67)); + UNIT_ASSERT_EQUAL(TGeoPoint::Parse("-27. -52", " "), + TGeoPoint(-27., -52.)); + UNIT_ASSERT_EQUAL(TGeoPoint::Parse("182,55"), + TGeoPoint(182., 55.)); + + // current behavior + UNIT_ASSERT(TGeoPoint::TryParse(TString{}).Empty()); + UNIT_ASSERT_EXCEPTION(TGeoPoint::Parse("Hello,world"), TBadCastException); + UNIT_ASSERT_EXCEPTION(TGeoPoint::Parse("640 17", " "), TBadCastException); + UNIT_ASSERT_EXCEPTION(TGeoPoint::Parse("50.,100"), TBadCastException); + UNIT_ASSERT_EQUAL(TGeoPoint::Parse(" 0.01, 0.01"), TGeoPoint(0.01, 0.01)); + UNIT_ASSERT_EXCEPTION(TGeoPoint::Parse("0.01 , 0.01"), TBadCastException); + UNIT_ASSERT_EXCEPTION(TGeoPoint::Parse("0.01, 0.01 "), TBadCastException); + } +} + +Y_UNIT_TEST_SUITE(TConversionTest) { + Y_UNIT_TEST(TestConversionGeoToMercator) { + // test data is obtained using PostGIS: + // SELECT ST_AsText(ST_Transform(ST_SetSRID(ST_MakePoint(lon, lat), 4326), 3395)) + + CheckMercator({27.547028, 53.893962}, {3066521.12982805, 7115552.47353991}); + CheckMercator({-70.862782, -53.002613}, {-7888408.80843475, -6949331.55685883}); + CheckMercator({37.588536, 55.734004}, {4184336.68718463, 7470303.90973406}); + CheckMercator({0., 0.}, {0, 0}); + } + + Y_UNIT_TEST(TestConversionMercatorToGeo) { + // test data is obtained using PostGIS: + // SELECT ST_AsText(ST_Transform(ST_SetSRID(ST_MakePoint(X, Y), 3395), 4326)) + + CheckGeo({3066521, 7115552}, {27.5470268337348, 53.8939594873943}); + CheckGeo({-7888409, -6949332}, {-70.8627837208599, -53.0026154014032}); + CheckGeo({4184336, 7470304}, {37.5885298269154, 55.734004457522}); + CheckGeo({0, 0}, {0., 0.}); + } + + Y_UNIT_TEST(TestExactConversion) { + // Zero maps to zero with no epsilons + UNIT_ASSERT_VALUES_EQUAL(LLToMercator({0., 0.}).X(), 0.); + UNIT_ASSERT_VALUES_EQUAL(LLToMercator({0., 0.}).Y(), 0.); + UNIT_ASSERT_VALUES_EQUAL(MercatorToLL({0., 0.}).Lon(), 0.); + UNIT_ASSERT_VALUES_EQUAL(MercatorToLL({0., 0.}).Lat(), 0.); + } + + Y_UNIT_TEST(TestPoles) { + UNIT_ASSERT_VALUES_EQUAL(LLToMercator({0, 90}).Y(), std::numeric_limits<double>::infinity()); + UNIT_ASSERT_VALUES_EQUAL(LLToMercator({0, -90}).Y(), -std::numeric_limits<double>::infinity()); + + UNIT_ASSERT_VALUES_EQUAL(MercatorToLL({0, std::numeric_limits<double>::infinity()}).Lat(), 90.); + UNIT_ASSERT_VALUES_EQUAL(MercatorToLL({0, -std::numeric_limits<double>::infinity()}).Lat(), -90.); + } + + Y_UNIT_TEST(TestNearPoles) { + // Reference values were obtained using mpmath library (floating-point arithmetic with arbitrary precision) + CheckMercator({0., 89.9}, {0., 44884542.157175040}, 1.e-6); + CheckMercator({0., 89.99}, {0., 59570746.872518855}, 1.e-5); + CheckMercator({0., 89.999}, {0., 74256950.065173316}, 1.e-4); + CheckMercator({0., 89.9999}, {0., 88943153.242600886}, 1.e-3); + CheckMercator({0., 89.99999}, {0., 103629356.41987618}, 1.e-1); + CheckMercator({0., 89.999999}, {0., 118315559.59714996}, 1.e-1); + CheckMercator({0., 89.9999999}, {0., 133001762.77442373}, 1.e-0); + CheckMercator({0., 89.99999999}, {0., 147687965.95169749}, 1.e+1); + CheckMercator({0., 89.9999999999999857891452847979962825775146484375}, {0., 233563773.75716050}, 1.e+7); + + CheckGeo({0., 233563773.75716050}, {0., 89.9999999999999857891452847979962825775146484375}, 1.e-15); + CheckGeo({0., 147687965.95169749}, {0., 89.99999999}, 1.e-13); + CheckGeo({0., 133001762.77442373}, {0., 89.9999999}, 1.e-13); + CheckGeo({0., 
118315559.59714996}, {0., 89.999999}, 1.e-13); + CheckGeo({0., 103629356.41987618}, {0., 89.99999}, 1.e-13); + CheckGeo({0., 88943153.242600886}, {0., 89.9999}, 1.e-13); + CheckGeo({0., 74256950.065173316}, {0., 89.999}, 1.e-13); + CheckGeo({0., 59570746.872518855}, {0., 89.99}, 1.e-13); + CheckGeo({0., 44884542.157175040}, {0., 89.9}, 1.e-13); + } + + Y_UNIT_TEST(TestVisibleRange) { + UNIT_ASSERT(TGeoPoint(37., 55.).IsVisibleOnMap()); + UNIT_ASSERT(!TGeoPoint(37., 86.).IsVisibleOnMap()); + UNIT_ASSERT(TGeoPoint(37., -85.).IsVisibleOnMap()); + UNIT_ASSERT(!TGeoPoint(37., -90.).IsVisibleOnMap()); + } + + Y_UNIT_TEST(TestRoundTripGeoMercatorGeo) { + auto check = [](double longitude, double latitude) { + auto pt = MercatorToLL(LLToMercator(TGeoPoint{longitude, latitude})); + UNIT_ASSERT_DOUBLES_EQUAL_C(longitude, pt.Lon(), 1.e-12, "longitude for point (" << longitude << ", " << latitude << ")"); + UNIT_ASSERT_DOUBLES_EQUAL_C(latitude, pt.Lat(), 1.e-8, "latitude for point (" << longitude << ", " << latitude << ")"); + }; + + check(37., 55.); + check(0.1, 0.1); + check(0.2, 89.9); + check(181., -42.); + check(362., -43.); + check(-183., -87.); + check(1000., -77.); + } + + Y_UNIT_TEST(TestRoundTripMercatorGeoMercator) { + auto check = [](double x, double y) { + auto pt = LLToMercator(MercatorToLL(TMercatorPoint{x, y})); + UNIT_ASSERT_DOUBLES_EQUAL_C(x, pt.X(), 1.e-4, "x for point (" << x << ", " << y << ")"); + UNIT_ASSERT_DOUBLES_EQUAL_C(y, pt.Y(), 1.e-4, "y for point (" << x << ", " << y << ")"); + }; + + check(100., 200.); + check(-123456., 654321.); + check(5.e7, 1.23456789); + check(1.e8, -2.e7); + } +} + +Y_UNIT_TEST_SUITE(TestDistance) { + Y_UNIT_TEST(TestGeodeticDistance) { + const TGeoPoint minsk(27.55, 53.916667); + const TGeoPoint moscow(37.617778, 55.755833); + const TGeoPoint newYork(-73.994167, 40.728333); + const TGeoPoint sydney(151.208333, -33.869444); + + const double eps = 1.E-6; // absolute error + + UNIT_ASSERT_DOUBLES_EQUAL(GeodeticDistance(minsk, minsk), 0.0, eps); + UNIT_ASSERT_DOUBLES_EQUAL(GeodeticDistance(minsk, moscow), 677190.08871321136, eps); + UNIT_ASSERT_DOUBLES_EQUAL(GeodeticDistance(minsk, newYork), 7129091.7536358498, eps); + UNIT_ASSERT_DOUBLES_EQUAL(GeodeticDistance(minsk, sydney), 15110861.267782301, eps); + + UNIT_ASSERT_DOUBLES_EQUAL(GeodeticDistance(moscow, minsk), 677190.08871321136, eps); + UNIT_ASSERT_DOUBLES_EQUAL(GeodeticDistance(moscow, moscow), 0.0, eps); + UNIT_ASSERT_DOUBLES_EQUAL(GeodeticDistance(moscow, newYork), 7519517.2469277605, eps); + UNIT_ASSERT_DOUBLES_EQUAL(GeodeticDistance(moscow, sydney), 14467193.188083574, eps); + + UNIT_ASSERT_DOUBLES_EQUAL(GeodeticDistance(newYork, minsk), 7129091.7536358498, eps); + UNIT_ASSERT_DOUBLES_EQUAL(GeodeticDistance(newYork, moscow), 7519517.2469277605, eps); + UNIT_ASSERT_DOUBLES_EQUAL(GeodeticDistance(newYork, newYork), 0.0, eps); + UNIT_ASSERT_DOUBLES_EQUAL(GeodeticDistance(newYork, sydney), 15954603.669226252, eps); + + UNIT_ASSERT_DOUBLES_EQUAL(GeodeticDistance(sydney, minsk), 15110861.267782301, eps); + UNIT_ASSERT_DOUBLES_EQUAL(GeodeticDistance(sydney, moscow), 14467193.188083574, eps); + UNIT_ASSERT_DOUBLES_EQUAL(GeodeticDistance(sydney, newYork), 15954603.669226252, eps); + UNIT_ASSERT_DOUBLES_EQUAL(GeodeticDistance(sydney, sydney), 0.0, eps); + } +} diff --git a/library/cpp/geo/ut/polygon_ut.cpp b/library/cpp/geo/ut/polygon_ut.cpp new file mode 100644 index 0000000000..cd9dee9759 --- /dev/null +++ b/library/cpp/geo/ut/polygon_ut.cpp @@ -0,0 +1,34 @@ +#include "polygon.h" + +#include 
<library/cpp/testing/unittest/registar.h> + +using namespace NGeo; + +Y_UNIT_TEST_SUITE(TGeoPolygonTest) { + Y_UNIT_TEST(TestEmptyPolygon) { + TGeoPolygon empty; + UNIT_ASSERT(!empty); + UNIT_ASSERT(!empty.IsValid()); + } + + Y_UNIT_TEST(TestPolygon) { + TGeoPolygon polygon({{1., 2.}, {2., 1.}, {2., 4.}, {1., 3.}}); + UNIT_ASSERT(polygon.IsValid()); + UNIT_ASSERT_EQUAL(polygon.GetWindow(), + TGeoWindow(TGeoPoint(1., 1.), TGeoPoint(2., 4.))); + } + + Y_UNIT_TEST(TestParse) { + UNIT_ASSERT_EQUAL(TGeoPolygon::Parse(TString{"1.23,5.67 7.89,10.11 11.10,9.87"}), + NGeo::TGeoPolygon({{1.23, 5.67}, {7.89, 10.11}, {11.10, 9.87}})); + UNIT_ASSERT_EQUAL(TGeoPolygon::Parse(TString{"1.23,5.67 7.89,10.11 11.10,9.87 6.54,3.21"}), + NGeo::TGeoPolygon({{1.23, 5.67}, {7.89, 10.11}, {11.10, 9.87}, {6.54, 3.21}})); + + UNIT_ASSERT(TGeoPolygon::TryParse(TString{"1.23,5.67 7.89,10.11"}).Empty()); + UNIT_ASSERT_EQUAL(TGeoPolygon::Parse(TString{"1.23+5.67~7.89+10.11~11.10+9.87"}, "+", "~"), + NGeo::TGeoPolygon({{1.23, 5.67}, {7.89, 10.11}, {11.10, 9.87}})); + + UNIT_ASSERT_EQUAL(TGeoPolygon::Parse(TString{"1.23+5.67+~7.89+10.11+~11.10+9.87"}, "+", "+~"), + NGeo::TGeoPolygon({{1.23, 5.67}, {7.89, 10.11}, {11.10, 9.87}})); + } +} diff --git a/library/cpp/geo/ut/size_ut.cpp b/library/cpp/geo/ut/size_ut.cpp new file mode 100644 index 0000000000..41b4a2c257 --- /dev/null +++ b/library/cpp/geo/ut/size_ut.cpp @@ -0,0 +1,29 @@ +#include "size.h" + +#include <library/cpp/testing/unittest/registar.h> +#include <util/generic/maybe.h> + +using namespace NGeo; + +Y_UNIT_TEST_SUITE(TSizeTest) { + Y_UNIT_TEST(TestFromString) { + UNIT_ASSERT_EQUAL(TSize::Parse("0.15,0.67"), TSize(0.15, 0.67)); + UNIT_ASSERT_EQUAL(TSize::Parse("0.15 0.67", " "), TSize(0.15, 0.67)); + + UNIT_ASSERT_EXCEPTION(TSize::Parse(""), TBadCastException); + UNIT_ASSERT_EXCEPTION(TSize::Parse("Hello,world"), TBadCastException); + UNIT_ASSERT_EXCEPTION(TSize::Parse("-1,-1"), TBadCastException); + + UNIT_ASSERT_EQUAL(TSize::Parse("424242 50", " "), TSize(424242., 50.)); + UNIT_ASSERT_EQUAL(TSize::Parse("50.,424242"), TSize(50., 424242.)); + UNIT_ASSERT_EQUAL(TSize::Parse(" 0.01, 0.01"), TSize(0.01, 0.01)); + UNIT_ASSERT_EXCEPTION(TSize::Parse("0.01 ,0.01"), TBadCastException); + UNIT_ASSERT_EXCEPTION(TSize::Parse("0.01,0.01 "), TBadCastException); + } + + Y_UNIT_TEST(TestTryFromString) { + UNIT_ASSERT(TSize::TryParse("1,2")); + UNIT_ASSERT(!TSize::TryParse("-1,-2")); + UNIT_ASSERT(!TSize::TryParse("1,2a")); + } +} diff --git a/library/cpp/geo/ut/util_ut.cpp b/library/cpp/geo/ut/util_ut.cpp new file mode 100644 index 0000000000..ebd86cfbd8 --- /dev/null +++ b/library/cpp/geo/ut/util_ut.cpp @@ -0,0 +1,36 @@ +#include <library/cpp/geo/util.h> + +#include <library/cpp/testing/unittest/registar.h> + +using namespace NGeo; + +Y_UNIT_TEST_SUITE(TGeoUtilTest) { + Y_UNIT_TEST(TestPointFromString) { + UNIT_ASSERT_EQUAL(PairFromString("27.56,53.90"), (std::pair<double, double>(27.56, 53.90))); + UNIT_ASSERT_EQUAL(PairFromString("27.56 53.90", " "), (std::pair<double, double>(27.56, 53.90))); + UNIT_ASSERT_EQUAL(PairFromString("27.56@@53.90", "@@"), (std::pair<double, double>(27.56, 53.90))); + UNIT_ASSERT_EXCEPTION(PairFromString("27.56@@53.90", "@"), TBadCastException); + UNIT_ASSERT_EXCEPTION(PairFromString(""), TBadCastException); + } + + Y_UNIT_TEST(TestTryPointFromString) { + std::pair<double, double> point; + + UNIT_ASSERT(TryPairFromString(point, "27.56,53.90")); + UNIT_ASSERT_EQUAL(point, (std::pair<double, double>(27.56, 53.90))); + + 
UNIT_ASSERT(TryPairFromString(point, "27.56 53.90", " ")); + UNIT_ASSERT_EQUAL(point, (std::pair<double, double>(27.56, 53.90))); + + UNIT_ASSERT(TryPairFromString(point, "27.56@@53.90", "@@")); + UNIT_ASSERT_EQUAL(point, (std::pair<double, double>(27.56, 53.90))); + + UNIT_ASSERT(!TryPairFromString(point, "27.56@@53.90", "@")); + UNIT_ASSERT(!TryPairFromString(point, "")); + } + + Y_UNIT_TEST(TestVisibleMapBound) { + const double expectedLat = MercatorToLL(TMercatorPoint(0., LLToMercator(TGeoPoint(180., 0.)).X())).Lat(); + UNIT_ASSERT_DOUBLES_EQUAL(VISIBLE_LATITUDE_BOUND, expectedLat, 1.e-14); + } +} diff --git a/library/cpp/geo/ut/window_ut.cpp b/library/cpp/geo/ut/window_ut.cpp new file mode 100644 index 0000000000..194fb4e735 --- /dev/null +++ b/library/cpp/geo/ut/window_ut.cpp @@ -0,0 +1,547 @@ +#include "window.h" +#include <library/cpp/testing/unittest/registar.h> +#include <util/generic/ymath.h> + +using namespace NGeo; + +namespace { + constexpr double DEFAULT_EPS = 1.E-5; + + bool CheckGeoPointEqual(const TGeoPoint& found, const TGeoPoint& expected, const double eps = DEFAULT_EPS) { + if (std::isnan(found.Lon()) || std::isnan(found.Lat())) { + Cerr << "NaNs found: (" << found.Lon() << ", " << found.Lat() << ")" << Endl; + return false; + } + if (Abs(found.Lon() - expected.Lon()) > eps) { + Cerr << "longitude differs: " << found.Lon() << " found, " << expected.Lon() << " expected" << Endl; + return false; + } + if (Abs(found.Lat() - expected.Lat()) > eps) { + Cerr << "latitude differs: " << found.Lat() << " found, " << expected.Lat() << " expected" << Endl; + return false; + } + return true; + } + + bool CheckSizeEqual(const TSize& found, const TSize& expected, const double eps = DEFAULT_EPS) { + if (std::isnan(found.GetWidth()) || std::isnan(found.GetHeight())) { + Cerr << "NaNs found: (" << found.GetWidth() << ", " << found.GetHeight() << ")" << Endl; + return false; + } + if (Abs(found.GetWidth() - expected.GetWidth()) > eps) { + Cerr << "width differs: " << found.GetWidth() << " found, " << expected.GetWidth() << " expected" << Endl; + return false; + } + if (Abs(found.GetHeight() - expected.GetHeight()) > eps) { + Cerr << "height differs: " << found.GetHeight() << " found, " << expected.GetHeight() << " expected" << Endl; + return false; + } + return true; + } + + bool CheckGeoWindowEqual(const TGeoWindow& lhs, const TGeoWindow& rhs, const double eps = DEFAULT_EPS) { + return CheckGeoPointEqual(lhs.GetCenter(), rhs.GetCenter(), eps) && CheckSizeEqual(lhs.GetSize(), rhs.GetSize(), eps); + } +} // namespace + +/** + * TGeoWindow + */ +Y_UNIT_TEST_SUITE(TGeoWindowTest) { + Y_UNIT_TEST(TestParser) { + UNIT_ASSERT_EQUAL(TGeoWindow::ParseFromCornersPoints("1.23,5.67", "7.65,3.21"), + TGeoWindow(TGeoPoint(1.23, 3.21), TGeoPoint(7.65, 5.67))); + UNIT_ASSERT_EQUAL(TGeoWindow::ParseFromCornersPoints("1.23~5.67", "7.65~3.21", "~"), + TGeoWindow(TGeoPoint(1.23, 3.21), TGeoPoint(7.65, 5.67))); + UNIT_ASSERT_EXCEPTION(TGeoWindow::ParseFromCornersPoints("1.23~5.67", "7.65~3.21"), TBadCastException); + + UNIT_ASSERT(TGeoWindow::TryParseFromCornersPoints("1.23~5.67", "7.65~3.21").Empty()); + UNIT_ASSERT(TGeoWindow::TryParseFromCornersPoints("1.23,5.67", "7.65,3.21").Defined()); + UNIT_ASSERT_EQUAL(TGeoWindow::TryParseFromCornersPoints("1.23,5.67", "7.65,3.21").GetRef(), + TGeoWindow(TGeoPoint(1.23, 3.21), TGeoPoint(7.65, 5.67))); + UNIT_ASSERT(TGeoWindow::TryParseFromCornersPoints("1.23+++5.67+", "7.65+++3.21+", "+++").Empty()); + + UNIT_ASSERT_EQUAL(TGeoWindow::ParseFromLlAndSpn("1.23,5.67", 
"0.1,0.2"), + TGeoWindow(TGeoPoint(1.23, 5.67), TSize(0.1, 0.2))); + UNIT_ASSERT_EQUAL(TGeoWindow::ParseFromLlAndSpn("1.23~5.67", "0.1~0.2", "~"), + TGeoWindow(TGeoPoint(1.23, 5.67), TSize(0.1, 0.2))); + UNIT_ASSERT_EXCEPTION(TGeoWindow::ParseFromLlAndSpn("1.23~5.67", "0.1~0.2"), TBadCastException); + UNIT_ASSERT(TGeoWindow::TryParseFromLlAndSpn("1.23~5.67", "0.1~0.2").Empty()); + UNIT_ASSERT(TGeoWindow::TryParseFromLlAndSpn("1.23~5.67", "0.1~0.2", "~").Defined()); + UNIT_ASSERT_EQUAL(TGeoWindow::TryParseFromLlAndSpn("1.23~5.67", "0.1~0.2", "~").GetRef(), + TGeoWindow(TGeoPoint(1.23, 5.67), TSize(0.1, 0.2))); + } + + Y_UNIT_TEST(TestConstructor) { + TGeoPoint center{55.50, 82.50}; + TSize size{5.00, 3.00}; + TGeoWindow window(center, size); + + UNIT_ASSERT_EQUAL(window.GetCenter(), center); + UNIT_ASSERT_EQUAL(window.GetSize(), size); + } + + Y_UNIT_TEST(TestPoles) { + { + TGeoWindow northPole{TGeoPoint{180., 90.}, TSize{1.5, 1.5}}; + UNIT_ASSERT(CheckGeoPointEqual(northPole.GetCenter(), TGeoPoint{180., 90.})); + UNIT_ASSERT(CheckGeoPointEqual(northPole.GetLowerLeftCorner(), TGeoPoint{179.25, 88.5})); + UNIT_ASSERT(CheckGeoPointEqual(northPole.GetUpperRightCorner(), TGeoPoint{180.75, 90.0})); + } + { + TGeoWindow tallWindow{TGeoPoint{37., 55.}, TSize{10., 180.}}; + UNIT_ASSERT(CheckGeoPointEqual(tallWindow.GetCenter(), TGeoPoint{37., 55.})); + UNIT_ASSERT(CheckGeoPointEqual(tallWindow.GetLowerLeftCorner(), TGeoPoint{32., -90.})); + UNIT_ASSERT(CheckGeoPointEqual(tallWindow.GetUpperRightCorner(), TGeoPoint{42., 90.})); + } + { + TGeoWindow world{TGeoPoint{0., 0.}, TSize{360., 180.}}; + UNIT_ASSERT(CheckGeoPointEqual(world.GetCenter(), TGeoPoint{0., 0.})); + UNIT_ASSERT(CheckGeoPointEqual(world.GetLowerLeftCorner(), TGeoPoint{-180., -90.})); + UNIT_ASSERT(CheckGeoPointEqual(world.GetUpperRightCorner(), TGeoPoint{180., 90.})); + } + { + TGeoWindow world{TGeoPoint{0., 0.}, TSize{360., 360.}}; + UNIT_ASSERT(CheckGeoPointEqual(world.GetCenter(), TGeoPoint{0., 0.})); + UNIT_ASSERT(CheckGeoPointEqual(world.GetLowerLeftCorner(), TGeoPoint{-180., -90.})); + UNIT_ASSERT(CheckGeoPointEqual(world.GetUpperRightCorner(), TGeoPoint{180., 90.})); + } + } + + Y_UNIT_TEST(TestBigSize) { + { + TGeoWindow w{TGeoPoint{37., 55.}, TSize{100., 179.}}; + UNIT_ASSERT(CheckGeoPointEqual(w.GetCenter(), TGeoPoint{37., 55.})); + UNIT_ASSERT(CheckGeoPointEqual(w.GetLowerLeftCorner(), TGeoPoint{-13., -89.09540675})); + UNIT_ASSERT(CheckGeoPointEqual(w.GetUpperRightCorner(), TGeoPoint{87., 89.90907637})); + } + } + + Y_UNIT_TEST(TestCenterWhenInitWithCorners) { + UNIT_ASSERT(CheckGeoPointEqual(TGeoWindow(TGeoPoint{5.00, 40.00}, TGeoPoint{25.00, 80.00}).GetCenter(), TGeoPoint{15.00, 67.17797})); + UNIT_ASSERT(CheckGeoPointEqual(TGeoWindow(TGeoPoint{-5.00, -40.00}, TGeoPoint{-25.00, -80.00}).GetCenter(), TGeoPoint{-15.00, -67.17797})); + } + + Y_UNIT_TEST(TestCornersWhenInitWithCenter) { + // check lat calc + UNIT_ASSERT_DOUBLES_EQUAL(TGeoWindow(TGeoPoint{25.00, 50.00}, TSize{10.00, 10.00}).GetLowerLeftCorner().Lat(), 44.73927, DEFAULT_EPS); + + // lat equals to 90 + UNIT_ASSERT_DOUBLES_EQUAL(TGeoWindow(TGeoPoint{25.00, 50.00}, TSize{10.00, 179.99999}).GetUpperRightCorner().Lat(), 90, DEFAULT_EPS); + + // lat equals to -90 + UNIT_ASSERT_DOUBLES_EQUAL(TGeoWindow(TGeoPoint{25.00, -50.00}, TSize{10.00, -179.99999}).GetUpperRightCorner().Lat(), -90, DEFAULT_EPS); + + // check naive lon calc + UNIT_ASSERT_DOUBLES_EQUAL(TGeoWindow(TGeoPoint{10, 10}, TSize{10, 5}).GetLowerLeftCorner().Lon(), 5, DEFAULT_EPS); + + // check lon equals 
to 190 (no wrapping) + UNIT_ASSERT_DOUBLES_EQUAL(TGeoWindow(TGeoPoint{20, 0}, TSize{340, 5}).GetUpperRightCorner().Lon(), 190, DEFAULT_EPS); + + UNIT_ASSERT_DOUBLES_EQUAL(TGeoWindow(TGeoPoint{-40, 0}, TSize{-280, 5}).GetUpperRightCorner().Lon(), -180, DEFAULT_EPS); + + // naive calculating when point is (0, 0) + UNIT_ASSERT(CheckGeoPointEqual(TGeoWindow(TGeoPoint{0, 0}, TSize{160, 160}).GetLowerLeftCorner(), TGeoPoint{-80, -80}, DEFAULT_EPS)); + UNIT_ASSERT(CheckGeoPointEqual(TGeoWindow(TGeoPoint{0, 0}, TSize{160, 160}).GetUpperRightCorner(), TGeoPoint{80, 80}, DEFAULT_EPS)); + } + + Y_UNIT_TEST(TestCenterSetter) { + TGeoPoint center{27.56, 53.90}; + TGeoWindow window{}; + window.SetCenter(center); + UNIT_ASSERT_EQUAL(window.GetCenter(), center); + } + + Y_UNIT_TEST(TestEqualOperator) { + TGeoWindow window{TGeoPoint{27.56, 53.90}, TGeoPoint{30.35, 56.89}}; + UNIT_ASSERT(window == window); + + TGeoWindow anotherWindow{TGeoPoint{60.10, 57.90}, TGeoPoint{60.70, 58.25}}; + UNIT_ASSERT(!(window == anotherWindow)); + } + + Y_UNIT_TEST(TestAssignmentOperator) { + TGeoWindow lhs{TGeoPoint{27.56, 53.90}, TGeoPoint{30.35, 53.89}}; + TGeoWindow rhs{}; + rhs = lhs; + UNIT_ASSERT_EQUAL(lhs, rhs); + } + + Y_UNIT_TEST(TestContainsMethod) { + // you could see cases here https://tech.yandex.ru/maps/jsbox/2.1/rectangle + // (pay attention that the first coord is lat and the second one is lon) + TGeoWindow window{TGeoPoint{27.45, 53.82}, TGeoPoint{27.65, 53.97}}; + + // point is inside the window + UNIT_ASSERT(window.Contains(TGeoPoint{27.55, 53.90})); + + // point is to the right of the window + UNIT_ASSERT(!window.Contains(TGeoPoint{27.66, 53.95})); + + // point is to the left of the window + UNIT_ASSERT(!window.Contains(TGeoPoint{27.44, 53.95})); + + // point is under the window + UNIT_ASSERT(!window.Contains(TGeoPoint{27.50, 53.81})); + + // point is above the window + UNIT_ASSERT(!window.Contains(TGeoPoint{27.50, 53.98})); + + // point is on border + UNIT_ASSERT(window.Contains(TGeoPoint{27.45, 53.86})); + UNIT_ASSERT(window.Contains(TGeoPoint{27.65, 53.86})); + UNIT_ASSERT(window.Contains(TGeoPoint{27.55, 53.82})); + UNIT_ASSERT(window.Contains(TGeoPoint{27.55, 53.97})); + + // negate coord + UNIT_ASSERT(TGeoWindow(TGeoPoint{-72.17, -38.82}, TGeoPoint{-68.95, -36.70}).Contains(TGeoPoint{-70.40, -37.75})); + + // special cases + UNIT_ASSERT(!TGeoWindow{}.Contains(TGeoPoint{60.09, 57.90})); + + UNIT_ASSERT(TGeoWindow(TGeoPoint{}, TGeoPoint{27.55, 53.90}).Contains(TGeoPoint{27.55, 53.90})); + UNIT_ASSERT(TGeoWindow(TGeoPoint{27.55, 53.90}, TGeoPoint{}).Contains(TGeoPoint{27.55, 53.90})); + } + + Y_UNIT_TEST(TestIntersectsMethod) { + // intersect only by lat + UNIT_ASSERT( + !Intersects( + TGeoWindow{TGeoPoint{27.60, 53.90}, TGeoPoint{27.80, 53.95}}, + TGeoWindow{TGeoPoint{27.30, 53.88}, TGeoPoint{27.50, 53.98}})); + + // intersect only by lon + UNIT_ASSERT( + !Intersects( + TGeoWindow{TGeoPoint{27.35, 54}, TGeoPoint{27.45, 54.10}}, + TGeoWindow{TGeoPoint{27.30, 53.88}, TGeoPoint{27.50, 53.98}})); + + // one inside another + UNIT_ASSERT( + Intersects( + TGeoWindow{TGeoPoint{27.35, 53.90}, TGeoPoint{27.45, 53.95}}, + TGeoWindow{TGeoPoint{27.30, 53.88}, TGeoPoint{27.50, 53.98}})); + + // intersection is point + UNIT_ASSERT( + !Intersects( + TGeoWindow{TGeoPoint{27.50, 53.98}, TGeoPoint{27.70, 54.00}}, + TGeoWindow{TGeoPoint{27.30, 53.88}, TGeoPoint{27.50, 53.98}})); + + // intersection is segment + UNIT_ASSERT( + !Intersects( + TGeoWindow{TGeoPoint{27.40, 53.98}, TGeoPoint{27.70, 54.00}}, + 
TGeoWindow{TGeoPoint{27.30, 53.88}, TGeoPoint{27.50, 53.98}})); + + // intersection is area + UNIT_ASSERT( + Intersects( + TGeoWindow{TGeoPoint{27.40, 53.90}, TGeoPoint{27.70, 54.00}}, + TGeoWindow{TGeoPoint{27.30, 53.88}, TGeoPoint{27.50, 53.98}})); + + // equal windows + TGeoWindow window{TGeoPoint{27.60, 53.88}, TGeoPoint{27.80, 53.98}}; + UNIT_ASSERT(Intersects(window, window)); + } + + Y_UNIT_TEST(TestIntersectionMethod) { + // non-intersecting window + UNIT_ASSERT( + !(Intersection( + TGeoWindow{TGeoPoint{37.66, 55.66}, TGeoPoint{37.53, 55.64}}, + TGeoWindow{TGeoPoint{37.67, 55.66}, TGeoPoint{37.69, 55.71}}))); + + // one inside another + UNIT_ASSERT(CheckGeoWindowEqual( + Intersection( + TGeoWindow{TGeoPoint{37.00, 55.00}, TSize{10.00, 10.00}}, + TGeoWindow{TGeoPoint{37.00, 55.00}, TSize{2.00, 2.00}}) + .GetRef(), + (TGeoWindow{TGeoPoint{37.00, 55.00}, TSize{2.00, 2.00}}))); + + // cross + UNIT_ASSERT(CheckGeoWindowEqual( + Intersection( + TGeoWindow{TGeoPoint{37.00, 55.00}, TSize{10.00, 2.00}}, + TGeoWindow{TGeoPoint{37.00, 55.00}, TSize{2.00, 10.00}}) + .GetRef(), + (TGeoWindow{TGeoPoint{37.00, 55.00}, TSize{2.00, 2.00}}))); + + // intersection is a point + UNIT_ASSERT(CheckGeoWindowEqual( + Intersection( + TGeoWindow{TGeoPoint{27.50, 53.98}, TGeoPoint{27.70, 54.00}}, + TGeoWindow{TGeoPoint{27.30, 53.88}, TGeoPoint{27.50, 53.98}}) + .GetRef(), + (TGeoWindow{TGeoPoint{27.50, 53.98}, TSize{0, 0}}))); + + // intersection is a segment + UNIT_ASSERT(CheckGeoWindowEqual( + Intersection( + TGeoWindow{TGeoPoint{27.40, 53.98}, TGeoPoint{27.70, 54.00}}, + TGeoWindow{TGeoPoint{27.30, 53.88}, TGeoPoint{27.50, 53.98}}) + .GetRef(), + (TGeoWindow{TGeoPoint{27.45, 53.98}, TSize{0.10, 0}}))); + + // intersection is area + UNIT_ASSERT(CheckGeoWindowEqual( + Intersection( + TGeoWindow{TGeoPoint{27.40, 53.90}, TGeoPoint{27.70, 54.00}}, + TGeoWindow{TGeoPoint{27.30, 53.88}, TGeoPoint{27.50, 53.98}}) + .GetRef(), + (TGeoWindow{TGeoPoint{27.40, 53.90}, TGeoPoint{27.50, 53.98}}))); + + // special cases + UNIT_ASSERT( + !(Intersection( + TGeoWindow{TGeoPoint{27.30, 53.88}, TGeoPoint{27.50, 53.98}}, + TGeoWindow{}))); + } + + Y_UNIT_TEST(TestDistanceMethod) { + // one window inside another + UNIT_ASSERT_DOUBLES_EQUAL( + (TGeoWindow{TGeoPoint{27.50, 53.98}, TGeoPoint{27.80, 54.10}}) + .Distance(TGeoWindow{TGeoPoint{27.55, 54.00}, TGeoPoint{27.70, 54.07}}), + 0, + 1.E-5); + + // gap only by lon + UNIT_ASSERT_DOUBLES_EQUAL( + (TGeoWindow{TGeoPoint{27.50, 53.98}, TGeoPoint{27.60, 54.10}}) + .Distance(TGeoWindow{TGeoPoint{27.69, 54.10}, TGeoPoint{27.90, 54.20}}), + 0.052773, + 1.E-5); + + // gap only by lat + UNIT_ASSERT_DOUBLES_EQUAL( + (TGeoWindow{TGeoPoint{27.50, 53.98}, TGeoPoint{27.60, 54.10}}) + .Distance(TGeoWindow{TGeoPoint{27.50, 54.20}, TGeoPoint{27.70, 54.30}}), + 0.1, + 1.E-5); + + // gap by lot and lat, you can calculate answer using two previous tests + UNIT_ASSERT_DOUBLES_EQUAL( + (TGeoWindow{TGeoPoint{27.50, 53.98}, TGeoPoint{27.60, 54.10}} + .Distance(TGeoWindow{TGeoPoint{27.69, 54.20}, TGeoPoint{27.70, 54.30}})), + 0.11304, + 1.E-5); + + // negate coord + UNIT_ASSERT_DOUBLES_EQUAL( + (TGeoWindow{TGeoPoint{-27.50, -53.98}, TGeoPoint{-27.60, -54.10}} + .Distance(TGeoWindow{TGeoPoint{-27.69, -54.20}, TGeoPoint{-27.70, -54.30}})), + 0.11304, + 1.E-5); + } + + Y_UNIT_TEST(TestApproxDistanceMethod) { + // point inside + UNIT_ASSERT_DOUBLES_EQUAL( + (TGeoWindow{TGeoPoint{27.50, 53.98}, TGeoPoint{27.80, 54.10}}) + .GetApproxDistance(TGeoPoint{27.60, 54.05}), + 0, + 1.E-5); + + // gap only by lon + 
UNIT_ASSERT_DOUBLES_EQUAL( + (TGeoWindow{TGeoPoint{27.50, 54.00}, TGeoPoint{27.60, 54.10}}) + .GetApproxDistance(TGeoPoint{27.70, 54.05}), + 6535.3, + 0.1); + + // gap only by lat + UNIT_ASSERT_DOUBLES_EQUAL( + (TGeoWindow{TGeoPoint{27.50, 54.00}, TGeoPoint{27.60, 54.10}}) + .GetApproxDistance(TGeoPoint{27.55, 53.95}), + 5566.0, + 0.1); + + // gap by lot and lat + UNIT_ASSERT_DOUBLES_EQUAL( + (TGeoWindow{TGeoPoint{27.50, 54.00}, TGeoPoint{27.60, 54.10}}) + .GetApproxDistance(TGeoPoint{27.70, 54.20}), + 12900.6, + 0.1); + + // negate coord + UNIT_ASSERT_DOUBLES_EQUAL( + (TGeoWindow{TGeoPoint{-27.50, -54.00}, TGeoPoint{-27.60, -54.10}}) + .GetApproxDistance(TGeoPoint{-27.70, -54.20}), + 12900.6, + 0.1); + } + + Y_UNIT_TEST(TestUnionMethod) { + // one inside another + UNIT_ASSERT(CheckGeoWindowEqual( + Union( + TGeoWindow{TGeoPoint{37.00, 55.00}, TSize{2.00, 3.00}}, + TGeoWindow{TGeoPoint{37.10, 55.20}, TSize{1.50, 1.00}}), + TGeoWindow(TGeoPoint{37.00, 55.00}, TSize{2.00, 3.00}))); + + // non-intersecting windows + UNIT_ASSERT(CheckGeoWindowEqual( + Union( + TGeoWindow{TGeoPoint{37.00, 55.00}, TGeoPoint{37.10, 55.10}}, + TGeoWindow{TGeoPoint{37.20, 55.20}, TGeoPoint{37.30, 55.30}}), + TGeoWindow(TGeoPoint{37.00, 55.00}, TGeoPoint{37.30, 55.30}))); + + // negate coords, one inside another + UNIT_ASSERT(CheckGeoWindowEqual( + Union( + TGeoWindow{TGeoPoint{-57.62, -20.64}, TSize{2.00, 4.00}}, + TGeoWindow{TGeoPoint{-57.62, -20.64}, TSize{12.00, 10.00}}), + TGeoWindow(TGeoPoint{-57.62, -20.64}, TSize{12.00, 10.00}), 1.E-2)); + + // cross + UNIT_ASSERT(CheckGeoWindowEqual( + Union( + TGeoWindow{TGeoPoint{-3.82, 5.52}, TGeoPoint{0.10, 6.50}}, + TGeoWindow{TGeoPoint{-1.5, 4.20}, TGeoPoint{-0.5, 7.13}}), + TGeoWindow(TGeoPoint{-3.82, 4.20}, TGeoPoint{0.10, 7.13}))); + + // special cases + UNIT_ASSERT(CheckGeoWindowEqual( + Union( + TGeoWindow{TGeoPoint{-3.82, 5.52}, TGeoPoint{0.10, 6.50}}, + TGeoWindow{}), + TGeoWindow(TGeoPoint{-3.82, 5.52}, TGeoPoint{361., 181.}))); + + UNIT_ASSERT(CheckGeoWindowEqual( + Union( + TGeoWindow{}, + TGeoWindow{TGeoPoint{-3.82, 5.52}, TGeoPoint{0.10, 6.50}}), + TGeoWindow(TGeoPoint{-3.82, 5.52}, TGeoPoint{361., 181.}))); + } + + Y_UNIT_TEST(TestStretchMethod) { + TSize size{0.5, 1}; + TGeoPoint center{27.40, 53.90}; + TGeoWindow window{}; + double multiplier = 0; + + // multiplier is less than 1. + window = {center, size}; + multiplier = 0.5; + + UNIT_ASSERT(CheckGeoPointEqual(window.GetLowerLeftCorner(), TGeoPoint{27.14999, 53.39699})); + UNIT_ASSERT(CheckGeoPointEqual(window.GetUpperRightCorner(), TGeoPoint{27.65000, 54.39699})); + + window.Stretch(multiplier); + UNIT_ASSERT(CheckGeoWindowEqual(window, TGeoWindow{center, TSize{0.25, 0.5}})); + UNIT_ASSERT(CheckGeoPointEqual(window.GetLowerLeftCorner(), TGeoPoint{27.27499, 53.64925})); + UNIT_ASSERT(CheckGeoPointEqual(window.GetUpperRightCorner(), TGeoPoint{27.52500, 54.14924})); + + // multiplier is greater than 1. 
+ window = {center, size}; + multiplier = 2.2; + + window.Stretch(multiplier); + UNIT_ASSERT(CheckGeoWindowEqual(window, TGeoWindow{center, TSize{1.1, 2.2}})); + UNIT_ASSERT(CheckGeoPointEqual(window.GetLowerLeftCorner(), TGeoPoint{26.84999, 52.78545})); + UNIT_ASSERT(CheckGeoPointEqual(window.GetUpperRightCorner(), TGeoPoint{27.95000, 54.98545})); + + // invalid multiplier + window = {center, size}; + multiplier = 100.; + + window.Stretch(multiplier); + UNIT_ASSERT(CheckGeoWindowEqual(window, TGeoWindow{center, TSize{50, 100}})); + UNIT_ASSERT(CheckGeoPointEqual(window.GetLowerLeftCorner(), TGeoPoint{2.40000, -18.88352})); + UNIT_ASSERT(CheckGeoPointEqual(window.GetUpperRightCorner(), TGeoPoint{52.39999, 81.26212})); + + // invalid multiplier + window = {center, size}; + multiplier = 0; + + window.Stretch(multiplier); + UNIT_ASSERT(CheckGeoWindowEqual(window, TGeoWindow{center, TSize{0, 0}})); + UNIT_ASSERT(CheckGeoPointEqual(window.GetLowerLeftCorner(), TGeoPoint{27.39999, 53.90000})); + UNIT_ASSERT(CheckGeoPointEqual(window.GetUpperRightCorner(), TGeoPoint{27.39999, 53.90000})); + + // invalid multiplier + window = {center, size}; + multiplier = -5.; + + window.Stretch(multiplier); + UNIT_ASSERT(CheckGeoWindowEqual(window, TGeoWindow{center, TSize{-2.5, -5}})); + UNIT_ASSERT(CheckGeoPointEqual(window.GetLowerLeftCorner(), TGeoPoint{28.64999, 56.32495})); + UNIT_ASSERT(CheckGeoPointEqual(window.GetUpperRightCorner(), TGeoPoint{26.15000, 51.32491})); + } +} + +/** + * TMercatorWindow + */ +Y_UNIT_TEST_SUITE(TMercatorWindowTest) { + Y_UNIT_TEST(TestConstructor) { + // init with two corners + TMercatorPoint lowerLeft{5, 3}; + TMercatorPoint upperRight{10, 20}; + TMercatorWindow window{lowerLeft, upperRight}; + + UNIT_ASSERT_EQUAL(window.GetWidth(), 5.); + UNIT_ASSERT_EQUAL(window.GetHeight(), 17.); + UNIT_ASSERT_EQUAL(window.GetCenter(), (TMercatorPoint{7.5, 11.5})); + + TMercatorPoint center{8, 12}; + TSize size{5, 17}; + window = {center, size}; + UNIT_ASSERT_EQUAL(window.GetUpperRightCorner().X(), 10.5); + UNIT_ASSERT_EQUAL(window.GetUpperRightCorner().Y(), 20.5); + UNIT_ASSERT_EQUAL(window.GetLowerLeftCorner().X(), 5.5); + UNIT_ASSERT_EQUAL(window.GetLowerLeftCorner().Y(), 3.5); + } + + Y_UNIT_TEST(TestInflateMethod) { + TSize size{200, 500}; + TMercatorPoint center{441, 688}; + TMercatorWindow window{}; + int add = 10; + + window = {center, size}; + UNIT_ASSERT_EQUAL(window.GetLowerLeftCorner(), TMercatorPoint(341, 438)); + UNIT_ASSERT_EQUAL(window.GetUpperRightCorner(), TMercatorPoint(541, 938)); + window.Inflate(add); + UNIT_ASSERT_EQUAL(window, TMercatorWindow(center, TSize{220, 520})); + UNIT_ASSERT_EQUAL(window.GetLowerLeftCorner(), TMercatorPoint(331, 428)); + UNIT_ASSERT_EQUAL(window.GetUpperRightCorner(), TMercatorPoint(551, 948)); + + // negate coords + center = {-441, -688}; + window = {center, size}; + UNIT_ASSERT_EQUAL(window.GetLowerLeftCorner(), TMercatorPoint(-541, -938)); + UNIT_ASSERT_EQUAL(window.GetUpperRightCorner(), TMercatorPoint(-341, -438)); + window.Inflate(add); + UNIT_ASSERT_EQUAL(window, TMercatorWindow(center, TSize{220, 520})); + UNIT_ASSERT_EQUAL(window.GetLowerLeftCorner(), TMercatorPoint(-551, -948)); + UNIT_ASSERT_EQUAL(window.GetUpperRightCorner(), TMercatorPoint(-331, -428)); + + // size becomes negate + size = {6, 12}; + center = {0, 0}; + window = {center, size}; + UNIT_ASSERT_EQUAL(window.GetLowerLeftCorner(), TMercatorPoint(-3, -6)); + UNIT_ASSERT_EQUAL(window.GetUpperRightCorner(), TMercatorPoint(3, 6)); + + add = -20; + window.Inflate(add); + 
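+        // With add = -20 each half-dimension changes by -20: the half-size (3, 6) becomes (-17, -14),
+        // so the full size is (-34, -28) and the lower-left/upper-right corners swap sides.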
UNIT_ASSERT_EQUAL(window, TMercatorWindow(center, TSize{-34, -28})); + UNIT_ASSERT_EQUAL(window.GetLowerLeftCorner(), TMercatorPoint(17, 14)); + UNIT_ASSERT_EQUAL(window.GetUpperRightCorner(), TMercatorPoint(-17, -14)); + UNIT_ASSERT_EQUAL(window.GetSize(), TSize(-34, -28)); + + // big add param + size = {10, 15}; + center = {5, 10}; + window = {center, size}; + + add = static_cast<int>(1E5); + window.Inflate(add); + UNIT_ASSERT_EQUAL(window, TMercatorWindow(center, TSize{200'010, 200'015})); + UNIT_ASSERT_EQUAL(window.GetLowerLeftCorner(), TMercatorPoint(-100'000, -99'997.5)); + UNIT_ASSERT_EQUAL(window.GetUpperRightCorner(), TMercatorPoint(100'010, 100'017.5)); + } +} diff --git a/library/cpp/geo/ut/ya.make b/library/cpp/geo/ut/ya.make new file mode 100644 index 0000000000..5bd891db1f --- /dev/null +++ b/library/cpp/geo/ut/ya.make @@ -0,0 +1,12 @@ +UNITTEST_FOR(library/cpp/geo) + +SRCS( + load_save_helper_ut.cpp + polygon_ut.cpp + point_ut.cpp + size_ut.cpp + util_ut.cpp + window_ut.cpp +) + +END() diff --git a/library/cpp/geo/util.cpp b/library/cpp/geo/util.cpp new file mode 100644 index 0000000000..e8d0fc378e --- /dev/null +++ b/library/cpp/geo/util.cpp @@ -0,0 +1,34 @@ +#include "util.h" + +#include <math.h> +#include <util/generic/cast.h> +#include <util/generic/string.h> +#include <util/string/cast.h> +#include <utility> + +namespace NGeo { + bool TryPairFromString(std::pair<double, double>& res, TStringBuf inputStr, TStringBuf delimiter) { + TStringBuf lhsStr; + TStringBuf rhsStr; + + double lhs = NAN; + double rhs = NAN; + if ( + !inputStr.TrySplit(delimiter, lhsStr, rhsStr) || + !TryFromString<double>(lhsStr, lhs) || + !TryFromString<double>(rhsStr, rhs)) { + return false; + } + + res = {lhs, rhs}; + return true; + } + + std::pair<double, double> PairFromString(TStringBuf inputStr, TStringBuf delimiter) { + std::pair<double, double> res; + if (!TryPairFromString(res, inputStr, delimiter)) { + ythrow TBadCastException() << "Wrong point string: " << inputStr; + } + return res; + } +} // namespace NGeo diff --git a/library/cpp/geo/util.h b/library/cpp/geo/util.h new file mode 100644 index 0000000000..18b411e6a4 --- /dev/null +++ b/library/cpp/geo/util.h @@ -0,0 +1,107 @@ +#pragma once + +#include "point.h" +#include "size.h" +#include "window.h" + +#include <util/generic/ymath.h> + +namespace NGeo { + constexpr double MIN_LATITUDE = -90.; + constexpr double MAX_LATITUDE = +90.; + constexpr double MIN_LONGITUDE = -180.; + constexpr double MAX_LONGITUDE = +180.; + constexpr double WORLD_WIDTH = MAX_LONGITUDE - MIN_LONGITUDE; + constexpr double WORLD_HEIGHT = MAX_LATITUDE - MIN_LATITUDE; + + // The Mercator projection is truncated at certain latitude so that the visible world forms a square. The poles are not shown. + constexpr double VISIBLE_LATITUDE_BOUND = 85.084059050109785; + + inline double Deg2rad(double d) { + return d * PI / 180; + } + + inline double Rad2deg(double d) { + return d * 180 / PI; + } + + inline double GetLongitudeFromMetersAtEquator(double meters) { + return Rad2deg(meters * (1. / WGS84::R)); + } + + inline double GetMetersFromDeg(double angle) { + return Deg2rad(angle) * NGeo::WGS84::R; + } + + inline double GetLatCos(double latDegree) { + return cos(Deg2rad(latDegree)); + } + + /** + * Get Inversed cosinus of latitude + * It is more precise, than division of two big doubles + * It is safe for lattitue at 90 degrees + */ + inline double GetInversedLatCosSafe(double latDegree) { + return 1. 
/ Max(0.001, cos(Deg2rad(latDegree))); + } + + /** + * Gets Lontitude width for given width at equator and latitude + */ + inline double GetWidthAtLatitude(double widthEquator, double latDegree) { + return widthEquator * GetInversedLatCosSafe(latDegree); + } + + inline double GetWidthAtLatitude(double widthEquator, const TGeoPoint& p) { + return GetWidthAtLatitude(widthEquator, p.Lat()); + } + + /* + * Returns Normalised width at equator for specified width at latitude and latitude + */ + + inline double GetWidthAtEquator(double widthAtLatitude, double latDegree) { + return widthAtLatitude * GetLatCos(latDegree); + } + + inline double GetWidthAtEquator(double widthAtLatitude, const TGeoPoint& p) { + return GetWidthAtEquator(widthAtLatitude, p.Lat()); + } + + /* + * Same for size + */ + + inline TSize GetSizeAtLatitude(const TSize& sizeAtEquator, const TGeoPoint& at) { + return TSize(GetWidthAtLatitude(sizeAtEquator.GetWidth(), at), sizeAtEquator.GetHeight()); + } + + inline TSize GetSizeAtEquator(const TSize& sizeAtLatitude, const TGeoPoint& at) { + return TSize(GetWidthAtEquator(sizeAtLatitude.GetWidth(), at), sizeAtLatitude.GetHeight()); + } + + inline TGeoWindow ConstructWindowFromEquatorSize(const TGeoPoint& center, const TSize& sizeAtEquator) { + return TGeoWindow(center, GetSizeAtLatitude(sizeAtEquator, center)); + } + + inline double SquaredDiagonal(const NGeo::TSize& size, double latitude) { + return Sqr(NGeo::GetWidthAtEquator(size.GetWidth(), latitude)) + Sqr(size.GetHeight()); + } + + inline double Diagonal(const NGeo::TSize& size, double latitude) { + return sqrt(SquaredDiagonal(size, latitude)); + } + + /** + * try to parse two coords from string + * return pair of coords on success, otherwise throw exception + */ + std::pair<double, double> PairFromString(TStringBuf inputStr, TStringBuf delimiter = TStringBuf(",")); + + /** + * try to parse two coords from string + * write result to first param and return true on success, otherwise return false + */ + bool TryPairFromString(std::pair<double, double>& res, TStringBuf inputStr, TStringBuf delimiter = TStringBuf(",")); +} // namespace NGeo diff --git a/library/cpp/geo/window.cpp b/library/cpp/geo/window.cpp new file mode 100644 index 0000000000..2ad2b61b71 --- /dev/null +++ b/library/cpp/geo/window.cpp @@ -0,0 +1,297 @@ +#include "window.h" + +#include "util.h" + +#include <util/generic/ylimits.h> +#include <util/generic/ymath.h> +#include <util/generic/maybe.h> + +#include <cstdlib> +#include <utility> + +namespace NGeo { + namespace { + TMercatorPoint GetMiddlePoint(const TMercatorPoint& p1, const TMercatorPoint& p2) { + return TMercatorPoint{(p1.X() + p2.X()) / 2, (p1.Y() + p2.Y()) / 2}; + } + + struct TLatBounds { + double LatMin; + double LatMax; + }; + } // namespace + + bool TrySpan2LatitudeDegenerateCases(double ll, double lspan, TLatBounds& result) { + // TODO(sobols@): Compare with eps? 
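+        // Degenerate cases: a span that covers the whole latitude range clamps to [-90, +90],
+        // and a window centered exactly at a pole can only extend inward, towards the equator.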
+ if (Y_UNLIKELY(lspan >= 180.)) { + result.LatMin = -90.; + result.LatMax = +90.; + return true; + } + if (Y_UNLIKELY(ll == +90.)) { + result.LatMin = ll - lspan; + result.LatMax = ll; + return true; + } + if (Y_UNLIKELY(ll == -90.)) { + result.LatMin = ll; + result.LatMax = ll + lspan; + return true; + } + return false; + } + + /** + * Finds such latitudes lmin, lmax that: + * 1) lmin <= ll <= lmax, + * 2) lmax - lmin == lspan, + * 3) MercatorY(ll) - MercatorY(lmin) == MercatorY(lmax) - MercatorY(ll) + * (the ll parallel is a center between lmin and lmax parallels in Mercator projection) + * + * \returns a pair (lmin, lmax) + */ + TLatBounds Span2Latitude(double ll, double lspan) { + TLatBounds result{}; + if (TrySpan2LatitudeDegenerateCases(ll, lspan, result)) { + return result; + } + + const double lc = Deg2rad(ll); + const double h = Deg2rad(lspan); + + // Spherical (Pseudo) Mercator: + // MercatorY(lc) = R * ln(tan(lc / 2 + PI / 4)). + // Note that + // ln(a) - ln(b) = ln(a / b) + // That'a why + // MercatorY(lc) - MercatorY(lmin) == MercatorY(lmin + h) - MercatorY(lc) <=> + // <=> tan(lc / 2 + PI / 4) / tan(lmin / 2 + PI / 4) == + // == tan(lmin / 2 + h / 2 + PI / 4) / tan(lc / 2 + PI / 4). + // Also note that + // tan(x + y) == (tan(x) + tan(y)) / (1 - tan(x) * tan(y)), + // so + // tan(lmin / 2 + h / 2 + PI / 4) == + // == (tan(lmin / 2 + PI / 4) + tan(h / 2)) / (1 - tan(lmin / 2 + PI / 4) * tan(h / 2)) + + const double yx = tan(lc / 2 + PI / 4); + + // Let x be tan(lmin / 2 + PI / 4), + // then + // yx / x == (x + tan(h / 2)) / ((1 - x * tan(h / 2)) * yx), + // or + // yx^2 * (1 - x * tan(h / 2)) == (x + tan(h / 2)) * x. + // Now we solve a quadratic equation: + // x^2 + bx + c == 0 + + const double C = yx * yx; + + const double b = (C + 1) * tan(h / 2), c = -C; + const double D = b * b - 4 * c; + const double root = (-b + sqrt(D)) / 2; + + result.LatMin = Rad2deg((atan(root) - PI / 4) * 2); + result.LatMax = result.LatMin + lspan; + return result; + } + + void TGeoWindow::CalcCorners() { + if (!IsValid()) { + return; + } + const TLatBounds latBounds = Span2Latitude(Center_.Lat(), Size_.GetHeight()); + + if (-90. < latBounds.LatMin && latBounds.LatMax < +90.) 
{ + TMercatorPoint lowerLeftCornerM = LLToMercator(TGeoPoint(Center_.Lon() - (Size_.GetWidth() / 2), latBounds.LatMin)); + TMercatorPoint upperRightCornerM = LLToMercator(TGeoPoint(Center_.Lon() + (Size_.GetWidth() / 2), latBounds.LatMax)); + TMercatorPoint centerM = LLToMercator(Center_); + + double w = upperRightCornerM.X() - lowerLeftCornerM.X(); + double h = upperRightCornerM.Y() - lowerLeftCornerM.Y(); + + LowerLeftCorner_ = MercatorToLL(TMercatorPoint(centerM.X() - w / 2, centerM.Y() - h / 2)); + UpperRightCorner_ = MercatorToLL(TMercatorPoint(centerM.X() + w / 2, centerM.Y() + h / 2)); + } else { + LowerLeftCorner_ = TGeoPoint(Center_.Lon() - (Size_.GetWidth() / 2), latBounds.LatMin); + UpperRightCorner_ = TGeoPoint(Center_.Lon() + (Size_.GetWidth() / 2), latBounds.LatMax); + } + } + + void TGeoWindow::CalcCenterAndSpan() { + if (!LowerLeftCorner_ || !UpperRightCorner_) { + return; + } + + TMercatorPoint lower = LLToMercator(LowerLeftCorner_); + TMercatorPoint upper = LLToMercator(UpperRightCorner_); + TMercatorPoint center = GetMiddlePoint(lower, upper); + Center_ = MercatorToLL(center); + + Size_ = TSize(UpperRightCorner_.Lon() - LowerLeftCorner_.Lon(), + UpperRightCorner_.Lat() - LowerLeftCorner_.Lat()); + } + + bool TGeoWindow::Contains(const TGeoPoint& p) const { + return LowerLeftCorner_.Lon() <= p.Lon() && p.Lon() <= UpperRightCorner_.Lon() && + LowerLeftCorner_.Lat() <= p.Lat() && p.Lat() <= UpperRightCorner_.Lat(); + } + + double TGeoWindow::Diameter() const { + return Diagonal(Size_, Center_.Lat()); + } + + double TGeoWindow::Distance(const TGeoWindow& w) const { + const double minX = Max(GetLowerLeftCorner().Lon(), w.GetLowerLeftCorner().Lon()); + const double maxX = Min(GetUpperRightCorner().Lon(), w.GetUpperRightCorner().Lon()); + const double minY = Max(GetLowerLeftCorner().Lat(), w.GetLowerLeftCorner().Lat()); + const double maxY = Min(GetUpperRightCorner().Lat(), w.GetUpperRightCorner().Lat()); + double xGap = minX > maxX ? (minX - maxX) : 0.; + double yGap = minY > maxY ? 
(minY - maxY) : 0.; + return sqrtf(Sqr(xGap * cos((minY + maxY) * 0.5 * PI / 180)) + Sqr(yGap)); + } + + double TWindowLL::GetApproxDistance(const TPointLL& point) const { + const double metresInDegree = WGS84::R * PI / 180; + return Distance(TWindowLL{point, point}) * metresInDegree; + } + + TGeoWindow TGeoWindow::ParseFromCornersPoints(TStringBuf leftCornerStr, TStringBuf rightCornerStr, TStringBuf delimiter) { + auto leftCorner = TGeoPoint::Parse(leftCornerStr, delimiter); + auto rightCorner = TGeoPoint::Parse(rightCornerStr, delimiter); + + return {leftCorner, rightCorner}; + } + + TMaybe<TGeoWindow> TGeoWindow::TryParseFromCornersPoints(TStringBuf leftCornerStr, TStringBuf rightCornerStr, TStringBuf delimiter) { + auto leftCorner = TGeoPoint::TryParse(leftCornerStr, delimiter); + auto rightCorner = TGeoPoint::TryParse(rightCornerStr, delimiter); + if (!leftCorner || !rightCorner) { + return {}; + } + + return TGeoWindow{*leftCorner, *rightCorner}; + } + + TGeoWindow TGeoWindow::ParseFromLlAndSpn(TStringBuf llStr, TStringBuf spnStr, TStringBuf delimiter) { + TGeoPoint ll = TGeoPoint::Parse(llStr, delimiter); + TSize spn = TSize::Parse(spnStr, delimiter); + + return {ll, spn}; + } + + TMaybe<TGeoWindow> TGeoWindow::TryParseFromLlAndSpn(TStringBuf llStr, TStringBuf spnStr, TStringBuf delimiter) { + auto ll = TGeoPoint::TryParse(llStr, delimiter); + auto spn = TSize::TryParse(spnStr, delimiter); + + if (!ll || !spn) { + return {}; + } + + return TGeoWindow{*ll, *spn}; + } + /** + * TMercatorWindow + */ + + TMercatorWindow::TMercatorWindow() noexcept + : HalfWidth_{std::numeric_limits<double>::quiet_NaN()} + , HalfHeight_{std::numeric_limits<double>::quiet_NaN()} + { + } + + TMercatorWindow::TMercatorWindow(const TMercatorPoint& center, const TSize& size) noexcept + : Center_{center} + , HalfWidth_{size.GetWidth() / 2} + , HalfHeight_{size.GetHeight() / 2} + { + } + + TMercatorWindow::TMercatorWindow(const TMercatorPoint& firstPoint, const TMercatorPoint& secondPoint) noexcept + : Center_{GetMiddlePoint(firstPoint, secondPoint)} + , HalfWidth_{Abs(secondPoint.X() - firstPoint.X()) / 2} + , HalfHeight_{Abs(secondPoint.Y() - firstPoint.Y()) / 2} + { + } + + bool TMercatorWindow::Contains(const TMercatorPoint& pt) const noexcept { + return (Center_.X() - HalfWidth_ <= pt.X()) && + (pt.X() <= Center_.X() + HalfWidth_) && + (Center_.Y() - HalfHeight_ <= pt.Y()) && + (pt.Y() <= Center_.Y() + HalfHeight_); + } + + /** + * Conversion + */ + + TMercatorWindow LLToMercator(const TGeoWindow& window) { + return TMercatorWindow{LLToMercator(window.GetLowerLeftCorner()), LLToMercator(window.GetUpperRightCorner())}; + } + + TGeoWindow MercatorToLL(const TMercatorWindow& window) { + return TGeoWindow{MercatorToLL(window.GetLowerLeftCorner()), MercatorToLL(window.GetUpperRightCorner())}; + } + + /** + * Operators + */ + + TMaybe<TGeoWindow> Intersection(const TGeoWindow& lhs, const TGeoWindow& rhs) { + const double minX = Max(lhs.GetLowerLeftCorner().Lon(), rhs.GetLowerLeftCorner().Lon()); + const double maxX = Min(lhs.GetUpperRightCorner().Lon(), rhs.GetUpperRightCorner().Lon()); + const double minY = Max(lhs.GetLowerLeftCorner().Lat(), rhs.GetLowerLeftCorner().Lat()); + const double maxY = Min(lhs.GetUpperRightCorner().Lat(), rhs.GetUpperRightCorner().Lat()); + if (minX > maxX || minY > maxY) { + return {}; + } + return TGeoWindow(TGeoPoint(minX, minY), TGeoPoint(maxX, maxY)); + } + + TMaybe<TGeoWindow> Intersection(const TMaybe<TGeoWindow>& lhs, const TMaybe<TGeoWindow>& rhs) { + if (!lhs || !rhs) { + 
return {}; + } + return Intersection(*lhs, *rhs); + } + + TGeoWindow Union(const TGeoWindow& lhs, const TGeoWindow& rhs) { + const double minX = Min(lhs.GetLowerLeftCorner().Lon(), rhs.GetLowerLeftCorner().Lon()); + const double maxX = Max(lhs.GetUpperRightCorner().Lon(), rhs.GetUpperRightCorner().Lon()); + const double minY = Min(lhs.GetLowerLeftCorner().Lat(), rhs.GetLowerLeftCorner().Lat()); + const double maxY = Max(lhs.GetUpperRightCorner().Lat(), rhs.GetUpperRightCorner().Lat()); + return TGeoWindow{TGeoPoint{minX, minY}, TGeoPoint{maxX, maxY}}; + } + + TMaybe<TGeoWindow> Union(const TMaybe<TGeoWindow>& lhs, const TMaybe<TGeoWindow>& rhs) { + if (!lhs) { + return rhs; + } + if (!rhs) { + return lhs; + } + return Union(*lhs, *rhs); + } + + bool Contains(const TMaybe<TGeoWindow>& window, const TGeoPoint& point) { + if (!window) { + return false; + } + return window.GetRef().Contains(point); + } + + bool Intersects(const TGeoWindow& lhs, const TGeoWindow& rhs) { + bool haveHorizIntersection = + !(lhs.GetUpperRightCorner().Lon() <= rhs.GetLowerLeftCorner().Lon() || + rhs.GetUpperRightCorner().Lon() <= lhs.GetLowerLeftCorner().Lon()); + bool haveVertIntersection = + !(lhs.GetUpperRightCorner().Lat() <= rhs.GetLowerLeftCorner().Lat() || + rhs.GetUpperRightCorner().Lat() <= lhs.GetLowerLeftCorner().Lat()); + return haveHorizIntersection && haveVertIntersection; + } + + bool Intersects(const TMaybe<TGeoWindow>& lhs, const TMaybe<TGeoWindow>& rhs) { + if (!lhs || !rhs) { + return false; + } + return Intersects(*lhs, *rhs); + } +} // namespace NGeo diff --git a/library/cpp/geo/window.h b/library/cpp/geo/window.h new file mode 100644 index 0000000000..1205d8351b --- /dev/null +++ b/library/cpp/geo/window.h @@ -0,0 +1,264 @@ +#pragma once + +#include "point.h" +#include "size.h" +#include <util/generic/string.h> +#include <util/generic/yexception.h> +#include <util/string/cast.h> +#include <util/generic/maybe.h> + +#include <algorithm> + +namespace NGeo { + class TGeoWindow { + public: + TGeoWindow() noexcept + + = default; + + TGeoWindow(const TGeoPoint& center, const TSize& size) noexcept + : Center_(center) + , Size_(size) + { + CalcCorners(); + } + + TGeoWindow(const TGeoPoint& firstPoint, const TGeoPoint& secondPoint) noexcept + : LowerLeftCorner_{std::min(firstPoint.Lon(), secondPoint.Lon()), + std::min(firstPoint.Lat(), secondPoint.Lat())} + , UpperRightCorner_{std::max(firstPoint.Lon(), secondPoint.Lon()), + std::max(firstPoint.Lat(), secondPoint.Lat())} + { + CalcCenterAndSpan(); + } + + const TGeoPoint& GetCenter() const noexcept { + return Center_; + } + + void SetCenter(const TGeoPoint& newCenter) { + Center_ = newCenter; + CalcCorners(); + } + + const TSize& GetSize() const noexcept { + return Size_; + } + + void SetSize(const TSize& newSize) { + Size_ = newSize; + CalcCorners(); + } + + const TGeoPoint& GetLowerLeftCorner() const noexcept { + return LowerLeftCorner_; + } + + const TGeoPoint& GetUpperRightCorner() const noexcept { + return UpperRightCorner_; + } + + void swap(TGeoWindow& o) noexcept { + Center_.swap(o.Center_); + Size_.swap(o.Size_); + LowerLeftCorner_.swap(o.LowerLeftCorner_); + UpperRightCorner_.swap(o.UpperRightCorner_); + } + + bool IsValid() const noexcept { + return Center_.IsValid() && Size_.IsValid(); + } + + bool Contains(const TGeoPoint&) const; + + bool Contains(const TGeoWindow& w) const { + return Contains(w.LowerLeftCorner_) && Contains(w.UpperRightCorner_); + } + + void Stretch(double multiplier) { + Size_.Stretch(multiplier); + CalcCorners(); + } + + 
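+        /**
+         * Grow the window by additionX/additionY on each side:
+         * the stored size changes by twice the addition, and the corners are recomputed.
+         */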
void Inflate(double additionX, double additionY) { + Size_.Inflate(additionX * 2, additionY * 2); + CalcCorners(); + } + + void Inflate(double addition) { + Inflate(addition, addition); + } + + bool operator!() const { + return !IsValid(); + } + + double Diameter() const; + + double Area() const { + return Size_.GetHeight() * Size_.GetWidth(); + } + + double Distance(const TGeoWindow&) const; + + double GetApproxDistance(const TPointLL& point) const; + + /** + * try to parse TGeoWindow from center and span + * return parsed TGeoWindow on success, otherwise throw exception + */ + static TGeoWindow ParseFromLlAndSpn(TStringBuf llStr, TStringBuf spnStr, TStringBuf delimiter = TStringBuf(",")); + + /** + * try to parse TGeoWindow from two corners + * return parsed TGeoWindow on success, otherwise throw exception + */ + static TGeoWindow ParseFromCornersPoints(TStringBuf leftCornerStr, TStringBuf rightCornerStr, TStringBuf delimiter = TStringBuf(",")); + + /** + * try to parse TGeoWindow from center and span + * return TMaybe of parsed TGeoWindow on success, otherwise return empty TMaybe + */ + static TMaybe<TGeoWindow> TryParseFromLlAndSpn(TStringBuf llStr, TStringBuf spnStr, TStringBuf delimiter = TStringBuf(",")); + + /** + * try to parse TGeoWindow from two corners + * return TMaybe of parsed TGeoWindow on success, otherwise return empty TMaybe + */ + static TMaybe<TGeoWindow> TryParseFromCornersPoints(TStringBuf leftCornerStr, TStringBuf rightCornerStr, TStringBuf delimiter = TStringBuf(",")); + + private: + TGeoPoint Center_; + TSize Size_; + TGeoPoint LowerLeftCorner_; + TGeoPoint UpperRightCorner_; + + void CalcCorners(); + void CalcCenterAndSpan(); + }; + + inline bool operator==(const TGeoWindow& lhs, const TGeoWindow& rhs) { + return lhs.GetCenter() == rhs.GetCenter() && lhs.GetSize() == rhs.GetSize(); + } + + inline bool operator!=(const TGeoWindow& p1, const TGeoWindow& p2) { + return !(p1 == p2); + } + + /** + * \class TMercatorWindow + * + * Represents a window in EPSG:3395 projection + * (WGS 84 / World Mercator) + */ + class TMercatorWindow { + public: + TMercatorWindow() noexcept; + TMercatorWindow(const TMercatorPoint& center, const TSize& size) noexcept; + TMercatorWindow(const TMercatorPoint& firstPoint, const TMercatorPoint& secondPoint) noexcept; + + const TMercatorPoint& GetCenter() const noexcept { + return Center_; + } + + TSize GetHalfSize() const noexcept { + return {HalfWidth_, HalfHeight_}; + } + + TSize GetSize() const noexcept { + return {GetWidth(), GetHeight()}; + } + + double GetWidth() const noexcept { + return HalfWidth_ * 2; + } + + double GetHeight() const noexcept { + return HalfHeight_ * 2; + } + + TMercatorPoint GetLowerLeftCorner() const noexcept { + return TMercatorPoint{Center_.X() - HalfWidth_, Center_.Y() - HalfHeight_}; + } + + TMercatorPoint GetUpperRightCorner() const noexcept { + return TMercatorPoint{Center_.X() + HalfWidth_, Center_.Y() + HalfHeight_}; + } + + bool Contains(const TMercatorPoint& pt) const noexcept; + + bool Contains(const TMercatorWindow& w) const { + return Contains(w.GetLowerLeftCorner()) && Contains(w.GetUpperRightCorner()); + } + + void Stretch(double multiplier) { + HalfWidth_ *= multiplier; + HalfHeight_ *= multiplier; + } + + void Inflate(double additionX, double additionY) { + HalfWidth_ += additionX; + HalfHeight_ += additionY; + } + + void Inflate(double addition) { + Inflate(addition, addition); + } + + double Area() const { + return GetHeight() * GetWidth(); + } + + private: + bool IsDefined() const { + return 
Center_.IsDefined() && !std::isnan(HalfWidth_) && !std::isnan(HalfHeight_); + } + + private: + TMercatorPoint Center_; + double HalfWidth_; + double HalfHeight_; + }; + + inline bool operator==(const TMercatorWindow& lhs, const TMercatorWindow& rhs) { + return lhs.GetCenter() == rhs.GetCenter() && lhs.GetHalfSize() == rhs.GetHalfSize(); + } + + inline bool operator!=(const TMercatorWindow& p1, const TMercatorWindow& p2) { + return !(p1 == p2); + } + + /** + * Typedefs + * TODO(sobols@): remove + */ + + using TWindowLL = TGeoWindow; + + /** + * Conversion + */ + + TMercatorWindow LLToMercator(const TGeoWindow&); + TGeoWindow MercatorToLL(const TMercatorWindow&); + + /** + * Utility functions + */ + + bool Contains(const TMaybe<TGeoWindow>& window, const TGeoPoint& point); + + TMaybe<TGeoWindow> Union(const TMaybe<TGeoWindow>& lhs, const TMaybe<TGeoWindow>& rhs); + TGeoWindow Union(const TGeoWindow& lhs, const TGeoWindow& rhs); + + TMaybe<TGeoWindow> Intersection(const TMaybe<TGeoWindow>& lhs, const TMaybe<TGeoWindow>& rhs); + TMaybe<TGeoWindow> Intersection(const TGeoWindow& lhs, const TGeoWindow& rhs); + + bool Intersects(const TGeoWindow& lhs, const TGeoWindow& rhs); + bool Intersects(const TMaybe<TGeoWindow>& lhs, const TMaybe<TGeoWindow>& rhs); +} // namespace NGeo + +template <> +inline void Out<NGeo::TGeoWindow>(IOutputStream& o, const NGeo::TGeoWindow& obj) { + o << '{' << obj.GetCenter() << ", " << obj.GetSize() << ", " << obj.GetLowerLeftCorner() << ", " << obj.GetUpperRightCorner() << "}"; +} diff --git a/library/cpp/geo/ya.make b/library/cpp/geo/ya.make new file mode 100644 index 0000000000..1d36003c5c --- /dev/null +++ b/library/cpp/geo/ya.make @@ -0,0 +1,19 @@ +LIBRARY() + +SRCS( + bbox.cpp + geo.cpp + point.cpp + polygon.cpp + load_save_helper.cpp + size.cpp + util.cpp + window.cpp +) + +END() + +RECURSE_FOR_TESTS( + ut + style + ) diff --git a/library/cpp/geobase/CMakeLists.darwin-x86_64.txt b/library/cpp/geobase/CMakeLists.darwin-x86_64.txt new file mode 100644 index 0000000000..b316e54e8a --- /dev/null +++ b/library/cpp/geobase/CMakeLists.darwin-x86_64.txt @@ -0,0 +1,30 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +get_built_tool_path( + TOOL_enum_parser_bin + TOOL_enum_parser_dependency + tools/enum_parser/enum_parser + enum_parser +) + +add_library(library-cpp-geobase) +target_link_libraries(library-cpp-geobase PUBLIC + contrib-libs-cxxsupp + yutil + geobase-library + tools-enum_parser-enum_serialization_runtime +) +target_sources(library-cpp-geobase PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/geobase/geobase.cpp +) +generate_enum_serilization(library-cpp-geobase + ${CMAKE_SOURCE_DIR}/geobase/include/structs.hpp + INCLUDE_HEADERS + geobase/include/structs.hpp +) diff --git a/library/cpp/geobase/CMakeLists.linux-aarch64.txt b/library/cpp/geobase/CMakeLists.linux-aarch64.txt new file mode 100644 index 0000000000..ab3962970d --- /dev/null +++ b/library/cpp/geobase/CMakeLists.linux-aarch64.txt @@ -0,0 +1,31 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. 
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +get_built_tool_path( + TOOL_enum_parser_bin + TOOL_enum_parser_dependency + tools/enum_parser/enum_parser + enum_parser +) + +add_library(library-cpp-geobase) +target_link_libraries(library-cpp-geobase PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil + geobase-library + tools-enum_parser-enum_serialization_runtime +) +target_sources(library-cpp-geobase PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/geobase/geobase.cpp +) +generate_enum_serilization(library-cpp-geobase + ${CMAKE_SOURCE_DIR}/geobase/include/structs.hpp + INCLUDE_HEADERS + geobase/include/structs.hpp +) diff --git a/library/cpp/geobase/CMakeLists.linux-x86_64.txt b/library/cpp/geobase/CMakeLists.linux-x86_64.txt new file mode 100644 index 0000000000..ab3962970d --- /dev/null +++ b/library/cpp/geobase/CMakeLists.linux-x86_64.txt @@ -0,0 +1,31 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +get_built_tool_path( + TOOL_enum_parser_bin + TOOL_enum_parser_dependency + tools/enum_parser/enum_parser + enum_parser +) + +add_library(library-cpp-geobase) +target_link_libraries(library-cpp-geobase PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil + geobase-library + tools-enum_parser-enum_serialization_runtime +) +target_sources(library-cpp-geobase PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/geobase/geobase.cpp +) +generate_enum_serilization(library-cpp-geobase + ${CMAKE_SOURCE_DIR}/geobase/include/structs.hpp + INCLUDE_HEADERS + geobase/include/structs.hpp +) diff --git a/library/cpp/geobase/CMakeLists.txt b/library/cpp/geobase/CMakeLists.txt new file mode 100644 index 0000000000..f8b31df0c1 --- /dev/null +++ b/library/cpp/geobase/CMakeLists.txt @@ -0,0 +1,17 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. 
+ + +if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-aarch64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + include(CMakeLists.darwin-x86_64.txt) +elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA) + include(CMakeLists.windows-x86_64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-x86_64.txt) +endif() diff --git a/library/cpp/geobase/CMakeLists.windows-x86_64.txt b/library/cpp/geobase/CMakeLists.windows-x86_64.txt new file mode 100644 index 0000000000..b316e54e8a --- /dev/null +++ b/library/cpp/geobase/CMakeLists.windows-x86_64.txt @@ -0,0 +1,30 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +get_built_tool_path( + TOOL_enum_parser_bin + TOOL_enum_parser_dependency + tools/enum_parser/enum_parser + enum_parser +) + +add_library(library-cpp-geobase) +target_link_libraries(library-cpp-geobase PUBLIC + contrib-libs-cxxsupp + yutil + geobase-library + tools-enum_parser-enum_serialization_runtime +) +target_sources(library-cpp-geobase PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/geobase/geobase.cpp +) +generate_enum_serilization(library-cpp-geobase + ${CMAKE_SOURCE_DIR}/geobase/include/structs.hpp + INCLUDE_HEADERS + geobase/include/structs.hpp +) diff --git a/library/cpp/geobase/geobase.cpp b/library/cpp/geobase/geobase.cpp new file mode 100644 index 0000000000..24086c67a9 --- /dev/null +++ b/library/cpp/geobase/geobase.cpp @@ -0,0 +1,3 @@ +#include <library/cpp/geobase/lookup.hpp> +#include <library/cpp/geobase/timezone_getter.hpp> +#include <library/cpp/geobase/service_getter.hpp> diff --git a/library/cpp/geobase/lookup.hpp b/library/cpp/geobase/lookup.hpp new file mode 100644 index 0000000000..f663750ab2 --- /dev/null +++ b/library/cpp/geobase/lookup.hpp @@ -0,0 +1,44 @@ +#pragma once + +#include <geobase/include/lookup.hpp> +#include <geobase/include/lookup_wrapper.hpp> +#include <geobase/include/structs.hpp> + +namespace NGeobase { + using TInitTraits = NImpl::TLookup::TInitTraits; + + class TLookup: public NImpl::TLookup { + public: + using parent = NImpl::TLookup; + + explicit TLookup(const std::string& datafile, const TInitTraits traits = {}) + : parent(datafile, traits) + { + } + explicit TLookup(const TInitTraits traits) + : parent(traits) + { + } + explicit TLookup(const void* pData, size_t len) + : parent(pData, len) + { + } + + ~TLookup() { + } + }; + + using TRegion = NImpl::TRegion; + using TGeolocation = NImpl::TGeolocation; + using TLinguistics = NImpl::TLinguistics; + using TGeoPoint = NImpl::TGeoPoint; + + using TLookupWrapper = NImpl::TLookupWrapper; + + using TId = NImpl::Id; + using TIdsList = NImpl::IdsList; + using TRegionsList = NImpl::TRegionsList; + + using TIpBasicTraits = NImpl::TIpBasicTraits; + using TIpTraits = NImpl::TIpTraits; +} diff --git a/library/cpp/geobase/service_getter.hpp b/library/cpp/geobase/service_getter.hpp new file mode 100644 index 0000000000..e088081706 --- /dev/null +++ b/library/cpp/geobase/service_getter.hpp @@ -0,0 +1,7 @@ +#pragma 
once + +#include <geobase/include/service_getter.hpp> + +namespace NGeobase { + using TServiceGetter = NImpl::TServiceGetter; +} diff --git a/library/cpp/geobase/timezone_getter.hpp b/library/cpp/geobase/timezone_getter.hpp new file mode 100644 index 0000000000..5749f1e3d6 --- /dev/null +++ b/library/cpp/geobase/timezone_getter.hpp @@ -0,0 +1,9 @@ +#pragma once + +#include <geobase/include/timezone_getter.hpp> +#include <geobase/include/structs.hpp> + +namespace NGeobase { + using TTimezone = NImpl::TTimezone; + using TTimezoneGetter = NImpl::TTimezoneGetter; +} diff --git a/library/cpp/geobase/ya.make b/library/cpp/geobase/ya.make new file mode 100644 index 0000000000..4a73974903 --- /dev/null +++ b/library/cpp/geobase/ya.make @@ -0,0 +1,13 @@ +LIBRARY() + +SRCS( + library/cpp/geobase/geobase.cpp +) + +PEERDIR( + geobase/library +) + +GENERATE_ENUM_SERIALIZATION(geobase/include/structs.hpp) + +END() diff --git a/library/cpp/geohash/CMakeLists.darwin-x86_64.txt b/library/cpp/geohash/CMakeLists.darwin-x86_64.txt new file mode 100644 index 0000000000..dfcb278a1f --- /dev/null +++ b/library/cpp/geohash/CMakeLists.darwin-x86_64.txt @@ -0,0 +1,32 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +get_built_tool_path( + TOOL_enum_parser_bin + TOOL_enum_parser_dependency + tools/enum_parser/enum_parser + enum_parser +) + +add_library(library-cpp-geohash) +target_link_libraries(library-cpp-geohash PUBLIC + contrib-libs-cxxsupp + yutil + library-cpp-geo + tools-enum_parser-enum_serialization_runtime +) +target_sources(library-cpp-geohash PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/geohash/geohash.cpp +) +generate_enum_serilization(library-cpp-geohash + ${CMAKE_SOURCE_DIR}/library/cpp/geohash/direction.h + GEN_HEADER + ${CMAKE_BINARY_DIR}/library/cpp/geohash/direction.h_serialized.h + INCLUDE_HEADERS + library/cpp/geohash/direction.h +) diff --git a/library/cpp/geohash/CMakeLists.linux-aarch64.txt b/library/cpp/geohash/CMakeLists.linux-aarch64.txt new file mode 100644 index 0000000000..a907311df0 --- /dev/null +++ b/library/cpp/geohash/CMakeLists.linux-aarch64.txt @@ -0,0 +1,33 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. 
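The NGeobase wrapper above only re-exports the NImpl lookup, timezone and service getters behind stable names. A minimal usage sketch, assuming a geobase data file is available at a placeholder path; only the constructors and aliases re-exported by lookup.hpp are used:

#include <library/cpp/geobase/lookup.hpp>

// Illustrative sketch: the data file path is a placeholder, not something
// shipped by this library.
void InitGeobaseLookup() {
    // Construct the lookup from an on-disk geodata file ...
    NGeobase::TLookup lookup("/path/to/geodata6.bin");

    // ... or from an already mapped memory buffer:
    // NGeobase::TLookup fromBuffer(dataPtr, dataLen);

    // Region/geolocation structures are reachable through the re-exported
    // aliases, e.g. NGeobase::TRegion, NGeobase::TGeolocation, NGeobase::TId.
}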
+ + +get_built_tool_path( + TOOL_enum_parser_bin + TOOL_enum_parser_dependency + tools/enum_parser/enum_parser + enum_parser +) + +add_library(library-cpp-geohash) +target_link_libraries(library-cpp-geohash PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil + library-cpp-geo + tools-enum_parser-enum_serialization_runtime +) +target_sources(library-cpp-geohash PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/geohash/geohash.cpp +) +generate_enum_serilization(library-cpp-geohash + ${CMAKE_SOURCE_DIR}/library/cpp/geohash/direction.h + GEN_HEADER + ${CMAKE_BINARY_DIR}/library/cpp/geohash/direction.h_serialized.h + INCLUDE_HEADERS + library/cpp/geohash/direction.h +) diff --git a/library/cpp/geohash/CMakeLists.linux-x86_64.txt b/library/cpp/geohash/CMakeLists.linux-x86_64.txt new file mode 100644 index 0000000000..a907311df0 --- /dev/null +++ b/library/cpp/geohash/CMakeLists.linux-x86_64.txt @@ -0,0 +1,33 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +get_built_tool_path( + TOOL_enum_parser_bin + TOOL_enum_parser_dependency + tools/enum_parser/enum_parser + enum_parser +) + +add_library(library-cpp-geohash) +target_link_libraries(library-cpp-geohash PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil + library-cpp-geo + tools-enum_parser-enum_serialization_runtime +) +target_sources(library-cpp-geohash PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/geohash/geohash.cpp +) +generate_enum_serilization(library-cpp-geohash + ${CMAKE_SOURCE_DIR}/library/cpp/geohash/direction.h + GEN_HEADER + ${CMAKE_BINARY_DIR}/library/cpp/geohash/direction.h_serialized.h + INCLUDE_HEADERS + library/cpp/geohash/direction.h +) diff --git a/library/cpp/geohash/CMakeLists.txt b/library/cpp/geohash/CMakeLists.txt new file mode 100644 index 0000000000..f8b31df0c1 --- /dev/null +++ b/library/cpp/geohash/CMakeLists.txt @@ -0,0 +1,17 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-aarch64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + include(CMakeLists.darwin-x86_64.txt) +elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA) + include(CMakeLists.windows-x86_64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-x86_64.txt) +endif() diff --git a/library/cpp/geohash/CMakeLists.windows-x86_64.txt b/library/cpp/geohash/CMakeLists.windows-x86_64.txt new file mode 100644 index 0000000000..dfcb278a1f --- /dev/null +++ b/library/cpp/geohash/CMakeLists.windows-x86_64.txt @@ -0,0 +1,32 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. 
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +get_built_tool_path( + TOOL_enum_parser_bin + TOOL_enum_parser_dependency + tools/enum_parser/enum_parser + enum_parser +) + +add_library(library-cpp-geohash) +target_link_libraries(library-cpp-geohash PUBLIC + contrib-libs-cxxsupp + yutil + library-cpp-geo + tools-enum_parser-enum_serialization_runtime +) +target_sources(library-cpp-geohash PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/geohash/geohash.cpp +) +generate_enum_serilization(library-cpp-geohash + ${CMAKE_SOURCE_DIR}/library/cpp/geohash/direction.h + GEN_HEADER + ${CMAKE_BINARY_DIR}/library/cpp/geohash/direction.h_serialized.h + INCLUDE_HEADERS + library/cpp/geohash/direction.h +) diff --git a/library/cpp/geohash/direction.h b/library/cpp/geohash/direction.h new file mode 100644 index 0000000000..88a3e6061d --- /dev/null +++ b/library/cpp/geohash/direction.h @@ -0,0 +1,14 @@ +#pragma once + +namespace NGeoHash { + enum EDirection { + NORTH = 0, + NORTH_EAST, + EAST, + SOUTH_EAST, + SOUTH, + SOUTH_WEST, + WEST, + NORTH_WEST, + }; +} diff --git a/library/cpp/geohash/geohash.cpp b/library/cpp/geohash/geohash.cpp new file mode 100644 index 0000000000..6c6d65acab --- /dev/null +++ b/library/cpp/geohash/geohash.cpp @@ -0,0 +1,413 @@ +#include "geohash.h" + +#include <util/generic/xrange.h> + +namespace { + using TNeighbourDescriptors = NGeoHash::TNeighbours<TMaybe<NGeoHash::TGeoHashDescriptor>>; + const auto directions = GetEnumAllValues<NGeoHash::EDirection>(); + + const auto doubleEps = std::numeric_limits<double>::epsilon(); + + const NGeoHash::TBoundingBoxLL& GetGlobalBBox() { + static const NGeoHash::TBoundingBoxLL globalLimits({-180, -90}, {180, 90}); + return globalLimits; + } + + const TStringBuf base32EncodeTable = "0123456789bcdefghjkmnpqrstuvwxyz"; + + const ui64 base32DecodeMask = 0x1F; + constexpr int base32DecodeTableSize = 128; + + using TBase32DecodeTable = std::array<TMaybe<i8>, base32DecodeTableSize>; + + TBase32DecodeTable MakeBase32DecodeTable() { + TBase32DecodeTable result; + result.fill(Nothing()); + for (auto i : xrange(base32EncodeTable.size())) { + result[base32EncodeTable[i]] = i; + } + return result; + } + + const TBase32DecodeTable base32DecodeTable = MakeBase32DecodeTable(); +} + +namespace NGeoHash { + static const ui8 maxSteps = 62; + static const ui8 maxPrecision = TGeoHashDescriptor::StepsToPrecision(maxSteps); // 12 + + static const TNeighbours<std::pair<i8, i8>> neighborBitMoves = { + {1, 0}, // NORTH + {1, 1}, + {0, 1}, + {-1, 1}, + {-1, 0}, + {-1, -1}, + {0, -1}, + {1, -1}, + }; + + ui8 TGeoHashDescriptor::StepsToPrecision(ui8 steps) { + return steps / StepsPerPrecisionUnit; + } + + ui8 TGeoHashDescriptor::PrecisionToSteps(ui8 precision) { + return precision * StepsPerPrecisionUnit; + } + + /* Steps interleave starting from lon so for 5 steps 3 are lon-steps and 2 are lat-steps. + * Thus there are ceil(step/2) lon-steps and floor(step/2) lat-steps */ + std::pair<ui8, ui8> TGeoHashDescriptor::LatLonSteps() const { + return std::make_pair<ui8, ui8>(Steps / 2, (Steps + 1) / 2); + } + + struct TMagicNumber { + ui64 Mask; + ui8 Shift; + }; + + /* Interleave lower bits of x and y, so the bits of x + * are in the even positions and bits from y in the odd. + * e.g. 
Interleave64(0b101, 0b110) => 0b111001 + * From: https://graphics.stanford.edu/~seander/bithacks.html#InterleaveBMN + */ + ui64 TGeoHashDescriptor::Interleave64(ui32 x, ui32 y) { + // attention: magic numbers + constexpr TMagicNumber mortonMagicNumbers[] = { + {0x0000FFFF0000FFFF, 16}, + {0x00FF00FF00FF00FF, 8}, + {0x0F0F0F0F0F0F0F0F, 4}, + {0x3333333333333333, 2}, + {0x5555555555555555, 1}}; + + ui64 x64 = x; + ui64 y64 = y; + + for (const auto& magicNumber : mortonMagicNumbers) { + x64 = (x64 | (x64 << magicNumber.Shift)) & magicNumber.Mask; + y64 = (y64 | (y64 << magicNumber.Shift)) & magicNumber.Mask; + } + return x64 | (y64 << 1); + } + + /* Reverse the interleave process + * Deinterleave64(0b111001) => 0b101110 + * derived from http://stackoverflow.com/questions/4909263 */ + std::pair<ui32, ui32> TGeoHashDescriptor::Deinterleave64(ui64 z) { + constexpr TMagicNumber demortonMagicNumbers[] = { + {0x5555555555555555ULL, 0}, + {0x3333333333333333ULL, 1}, + {0x0F0F0F0F0F0F0F0FULL, 2}, + {0x00FF00FF00FF00FFULL, 4}, + {0x0000FFFF0000FFFFULL, 8}, + {0x00000000FFFFFFFFULL, 16}}; + + ui64 x = z; + ui64 y = z >> 1; + + for (const auto& magicNumber : demortonMagicNumbers) { + x = (x | (x >> magicNumber.Shift)) & magicNumber.Mask; + y = (y | (y >> magicNumber.Shift)) & magicNumber.Mask; + } + + return std::make_pair(x, y); + } + + std::pair<ui32, ui32> TGeoHashDescriptor::LatLonBits() const { + auto deinterleaved = Deinterleave64(Bits); + + if (Steps % 2) { + DoSwap(deinterleaved.first, deinterleaved.second); + } + return deinterleaved; + } + + void TGeoHashDescriptor::SetLatLonBits(ui32 latBits, ui32 lonBits) { + if (Steps % 2) { + Bits = Interleave64(lonBits, latBits); + } else { + Bits = Interleave64(latBits, lonBits); + } + } + + void TGeoHashDescriptor::InitFromLatLon(double latitude, double longitude, const TBoundingBoxLL& limits, ui8 steps) { + Steps = steps; + if (Steps > maxSteps) { + ythrow yexception() << "Invalid steps: available values: 0.." 
<< ::ToString(maxSteps); + } + + if (limits.Width() < doubleEps || limits.Height() < doubleEps) { + ythrow yexception() << "Invalid limits: min/max for one of coordinates are equal"; + } + + if (latitude < limits.GetMinY() || latitude > limits.GetMaxY() || longitude < limits.GetMinX() || longitude > limits.GetMaxX()) { + ythrow yexception() << "Invalid point (" << latitude << ", " << longitude << "): outside of limits"; + } + + double lat01 = (latitude - limits.GetMinY()) / limits.Height(); + double lon01 = (longitude - limits.GetMinX()) / limits.Width(); + + auto llSteps = LatLonSteps(); + + /* convert to fixed point based on the step size */ + lat01 *= (1 << llSteps.first); + lon01 *= (1 << llSteps.second); + + /* If lon_steps > lat_step, last bit is lon-bit, otherwise last bit is lat-bit*/ + SetLatLonBits(lat01, lon01); + } + + TGeoHashDescriptor::TGeoHashDescriptor(double latitude, double longitude, const TBoundingBoxLL& limits, ui8 steps) { + InitFromLatLon(latitude, longitude, limits, steps); + } + + TGeoHashDescriptor::TGeoHashDescriptor(double latitude, double longitude, ui8 steps) { + InitFromLatLon(latitude, longitude, GetGlobalBBox(), steps); + } + + TGeoHashDescriptor::TGeoHashDescriptor(const NGeo::TPointLL& point, const TBoundingBoxLL& limits, ui8 steps) { + InitFromLatLon(point.Lat(), point.Lon(), limits, steps); + } + + TGeoHashDescriptor::TGeoHashDescriptor(const NGeo::TPointLL& point, ui8 steps) { + InitFromLatLon(point.Lat(), point.Lon(), GetGlobalBBox(), steps); + } + + TGeoHashDescriptor::TGeoHashDescriptor(const TString& hashString) { + if (hashString.size() > maxPrecision) { + ythrow yexception() << "hashString is too long: max length is " << ::ToString(maxPrecision); + } + + Bits = 0; + for (auto c : hashString) { + Bits <<= StepsPerPrecisionUnit; + Y_ENSURE(c >= 0); + const auto decodedChar = base32DecodeTable[c]; + Y_ENSURE(decodedChar.Defined()); + Bits |= decodedChar.GetRef(); + } + + Steps = PrecisionToSteps(hashString.size()); + } + + ui64 TGeoHashDescriptor::GetBits() const { + return Bits; + } + + ui8 TGeoHashDescriptor::GetSteps() const { + return Steps; + } + + TString TGeoHashDescriptor::ToString() const { + auto precision = StepsToPrecision(Steps); + + TStringStream stream; + + auto bits = Bits; + auto activeSteps = PrecisionToSteps(precision); + + bits >>= (Steps - activeSteps); + for (auto i : xrange(precision)) { + auto ix = (bits >> (StepsPerPrecisionUnit * ((precision - i - 1)))) & base32DecodeMask; + stream << base32EncodeTable[ix]; + } + + return stream.Str(); + } + + TBoundingBoxLL TGeoHashDescriptor::ToBoundingBox(const TBoundingBoxLL& limits) const { + auto llBits = LatLonBits(); + auto llSteps = LatLonSteps(); + + double latMultiplier = limits.Height() / (1ull << llSteps.first); + double lonMultiplier = limits.Width() / (1ull << llSteps.second); + + return { + { + limits.GetMinX() + lonMultiplier * llBits.second, + limits.GetMinY() + latMultiplier * llBits.first, + }, + { + limits.GetMinX() + lonMultiplier * (llBits.second + 1), + limits.GetMinY() + latMultiplier * (llBits.first + 1), + }}; + } + + TBoundingBoxLL TGeoHashDescriptor::ToBoundingBox() const { + return ToBoundingBox(GetGlobalBBox()); + } + + NGeo::TPointLL TGeoHashDescriptor::ToPoint(const TBoundingBoxLL& limits) const { + auto boundingBox = ToBoundingBox(limits); + return { + boundingBox.GetMinX() + boundingBox.Width() / 2, + boundingBox.GetMinY() + boundingBox.Height() / 2}; + } + + NGeo::TPointLL TGeoHashDescriptor::ToPoint() const { + return ToPoint(GetGlobalBBox()); + } + + 
TMaybe<TGeoHashDescriptor> TGeoHashDescriptor::GetNeighbour(EDirection direction) const { + TGeoHashDescriptor result(0, Steps); + auto llBits = LatLonBits(); + auto llSteps = LatLonSteps(); + std::pair<i8, i8> bitMove = neighborBitMoves[direction]; + + auto newLatBits = llBits.first + bitMove.first; + auto newLonBits = llBits.second + bitMove.second; + + // Overflow in lat means polar, so return Nothing + if (newLatBits >> llSteps.first != 0) { + return Nothing(); + } + + // Overflow in lon means 180-meridian, so just remove overflowed bits + newLonBits &= ((1 << llSteps.second) - 1); + result.SetLatLonBits(newLatBits, newLonBits); + return result; + } + + TNeighbourDescriptors TGeoHashDescriptor::GetNeighbours() const { + TNeighbourDescriptors result; + auto llBits = LatLonBits(); + auto llSteps = LatLonSteps(); + std::pair<i8, i8> bitMove; + + for (auto direction : directions) { + bitMove = neighborBitMoves[direction]; + + auto newLatBits = llBits.first + bitMove.first; + auto newLonBits = llBits.second + bitMove.second; + + // Overflow in lat means polar, so put Nothing + if (newLatBits >> llSteps.first != 0) { + result[direction] = Nothing(); + } else { + result[direction] = TGeoHashDescriptor(0, Steps); + // Overflow in lon means 180-meridian, so just remove overflowed bits + newLonBits &= ((1 << llSteps.second) - 1); + result[direction]->SetLatLonBits(newLatBits, newLonBits); + } + } + + return result; + } + + TVector<TGeoHashDescriptor> TGeoHashDescriptor::GetChildren(ui8 steps = StepsPerPrecisionUnit) const { + TVector<TGeoHashDescriptor> children(Reserve(1 << steps)); + ui8 childrenSteps = steps + Steps; + auto parentBits = Bits << steps; + if (childrenSteps > maxSteps) { + ythrow yexception() << "Resulting geohash steps are too big, available values: 0.." 
<< ::ToString(maxSteps); + } + for (auto residue : xrange(1 << steps)) { + children.emplace_back(parentBits | residue, childrenSteps); + } + return children; + } + + /* Functions */ + + ui64 Encode(double latitude, double longitude, ui8 precision) { + auto descr = TGeoHashDescriptor( + latitude, longitude, TGeoHashDescriptor::PrecisionToSteps(precision)); + return descr.GetBits(); + } + ui64 Encode(const NGeo::TPointLL& point, ui8 precision) { + return TGeoHashDescriptor( + point, TGeoHashDescriptor::PrecisionToSteps(precision)) + .GetBits(); + } + + TString EncodeToString(double latitude, double longitude, ui8 precision) { + return TGeoHashDescriptor( + latitude, longitude, TGeoHashDescriptor::PrecisionToSteps(precision)) + .ToString(); + } + TString EncodeToString(const NGeo::TPointLL& point, ui8 precision) { + return TGeoHashDescriptor( + point, TGeoHashDescriptor::PrecisionToSteps(precision)) + .ToString(); + } + + NGeo::TPointLL DecodeToPoint(const TString& hashString) { + return TGeoHashDescriptor(hashString).ToPoint(); + } + NGeo::TPointLL DecodeToPoint(ui64 hash, ui8 precision) { + return TGeoHashDescriptor(hash, TGeoHashDescriptor::PrecisionToSteps(precision)).ToPoint(); + } + + TBoundingBoxLL DecodeToBoundingBox(const TString& hashString) { + return TGeoHashDescriptor(hashString).ToBoundingBox(); + } + + TBoundingBoxLL DecodeToBoundingBox(ui64 hash, ui8 precision) { + return TGeoHashDescriptor(hash, TGeoHashDescriptor::PrecisionToSteps(precision)).ToBoundingBox(); + } + + TMaybe<ui64> GetNeighbour(ui64 hash, EDirection direction, ui8 precision) { + auto neighbour = TGeoHashDescriptor( + hash, TGeoHashDescriptor::PrecisionToSteps(precision)) + .GetNeighbour(direction); + + if (neighbour.Defined()) { + return neighbour->GetBits(); + } else { + return Nothing(); + } + } + + TMaybe<TString> GetNeighbour(const TString& hashString, EDirection direction) { + auto neighbour = TGeoHashDescriptor(hashString).GetNeighbour(direction); + if (neighbour.Defined()) { + return neighbour->ToString(); + } else { + return Nothing(); + } + } + + TGeoHashBitsNeighbours GetNeighbours(ui64 hash, ui8 precision) { + TGeoHashBitsNeighbours result; + + auto neighbours = TGeoHashDescriptor( + hash, TGeoHashDescriptor::PrecisionToSteps(precision)) + .GetNeighbours(); + + for (auto direction : directions) { + if (neighbours[direction].Defined()) { + result[direction] = neighbours[direction]->GetBits(); + } else { + result[direction] = Nothing(); + } + } + + return result; + } + + TGeoHashStringNeighbours GetNeighbours(const TString& hashString) { + TGeoHashStringNeighbours result; + + auto neighbours = TGeoHashDescriptor( + hashString) + .GetNeighbours(); + + for (auto direction : directions) { + if (neighbours[direction].Defined()) { + result[direction] = neighbours[direction]->ToString(); + } else { + result[direction] = Nothing(); + } + } + return result; + } + + TVector<TString> GetChildren(const TString& hashString) { + TVector<TString> result(Reserve(base32EncodeTable.size())); + + for (auto ch : base32EncodeTable) { + result.push_back(hashString + ch); + } + return result; + } +} diff --git a/library/cpp/geohash/geohash.h b/library/cpp/geohash/geohash.h new file mode 100644 index 0000000000..7d270612e8 --- /dev/null +++ b/library/cpp/geohash/geohash.h @@ -0,0 +1,123 @@ +#pragma once + +/** + * @file + * @brief Strong (because it works) and independent (of contrib/libs/geohash) GeoHash implementation + * GeoHash algo: https://en.wikipedia.org/wiki/Geohash + * Useful links: + * 1. 
http://geohash.org - Main Site + * 2. https://dou.ua/lenta/articles/geohash - Geohash-based geopoints clusterization + * 3. http://www.movable-type.co.uk/scripts/geohash.html - bidirectional encoding and visualization + */ +#include <library/cpp/geohash/direction.h> +#include <library/cpp/geohash/direction.h_serialized.h> + +#include <library/cpp/geo/geo.h> + +#include <util/generic/maybe.h> +#include <util/generic/string.h> +#include <util/system/types.h> + +#include <array> + +namespace NGeoHash { + using TBoundingBoxLL = NGeo::TGeoBoundingBox; + static constexpr auto directionsCount = GetEnumItemsCount<EDirection>(); + + template <class T> + class TNeighbours: public std::array<T, directionsCount> { + public: + TNeighbours() = default; + + TNeighbours(std::initializer_list<T> list) { + Y_ASSERT(list.size() == directionsCount); + std::copy(list.begin(), list.end(), std::array<T, directionsCount>::begin()); + } + + const T& operator[](EDirection direction) const { + return std::array<T, directionsCount>::operator[](static_cast<size_t>(direction)); + } + + T& operator[](EDirection direction) { + return std::array<T, directionsCount>::operator[](static_cast<size_t>(direction)); + } + }; + + class TGeoHashDescriptor { + public: + TGeoHashDescriptor() noexcept + : Bits(0) + , Steps(0) + { + } + + TGeoHashDescriptor(ui64 bits, ui8 steps) noexcept + : Bits(bits) + , Steps(steps) + { + } + + TGeoHashDescriptor(double latitude, double longitude, ui8 steps); + TGeoHashDescriptor(double latitude, double longitude, const TBoundingBoxLL& limits, ui8 steps); + TGeoHashDescriptor(const NGeo::TPointLL& point, ui8 steps); + TGeoHashDescriptor(const NGeo::TPointLL& point, const TBoundingBoxLL& limits, ui8 steps); + + explicit TGeoHashDescriptor(const TString& hashString); + + ui64 GetBits() const; + ui8 GetSteps() const; + + TString ToString() const; + + NGeo::TPointLL ToPoint(const TBoundingBoxLL& limits) const; + NGeo::TPointLL ToPoint() const; + + TBoundingBoxLL ToBoundingBox(const TBoundingBoxLL& limits) const; + TBoundingBoxLL ToBoundingBox() const; + + TMaybe<TGeoHashDescriptor> GetNeighbour(EDirection direction) const; + TNeighbours<TMaybe<TGeoHashDescriptor>> GetNeighbours() const; + + TVector<TGeoHashDescriptor> GetChildren(ui8 steps) const; + + static ui8 StepsToPrecision(ui8 steps); + static ui8 PrecisionToSteps(ui8 precision); + + private: + void InitFromLatLon(double latitude, double longitude, const TBoundingBoxLL& limits, ui8 steps); + std::pair<ui8, ui8> LatLonSteps() const; + std::pair<ui32, ui32> LatLonBits() const; + void SetLatLonBits(ui32 latBits, ui32 lonBits); + static ui64 Interleave64(ui32 x, ui32 y); + static std::pair<ui32, ui32> Deinterleave64(ui64 interleaved); + + private: + static const ui8 StepsPerPrecisionUnit = 5; + ui64 Bits; + ui8 Steps; + }; + + ui64 Encode(double latitude, double longitude, ui8 precision); + ui64 Encode(const NGeo::TPointLL& point, ui8 precision); + + TString EncodeToString(double latitude, double longitude, ui8 precision); + TString EncodeToString(const NGeo::TPointLL& point, ui8 precision); + + NGeo::TPointLL DecodeToPoint(const TString& hashString); + NGeo::TPointLL DecodeToPoint(ui64 hash, ui8 precision); + + TBoundingBoxLL DecodeToBoundingBox(const TString& hashString); + TBoundingBoxLL DecodeToBoundingBox(ui64 hash, ui8 precision); + + TMaybe<ui64> GetNeighbour(ui64 hash, EDirection direction, ui8 precision); + TMaybe<TString> GetNeighbour(const TString& hashString, EDirection direction); + + using TGeoHashBitsNeighbours = 
TNeighbours<TMaybe<ui64>>; + using TGeoHashStringNeighbours = TNeighbours<TMaybe<TString>>; + + TGeoHashBitsNeighbours GetNeighbours(ui64 hash, ui8 precision); + TGeoHashStringNeighbours GetNeighbours(const TString& hashString); + + TVector<TString> GetChildren(const TString& hashString); + +} /* namespace NGeoHash */ diff --git a/library/cpp/geohash/ya.make b/library/cpp/geohash/ya.make new file mode 100644 index 0000000000..3350ca1cc6 --- /dev/null +++ b/library/cpp/geohash/ya.make @@ -0,0 +1,13 @@ +LIBRARY() + +PEERDIR( + library/cpp/geo +) + +SRCS( + geohash.cpp +) + +GENERATE_ENUM_SERIALIZATION_WITH_HEADER(direction.h) + +END() diff --git a/library/cpp/ipreg/CMakeLists.darwin-x86_64.txt b/library/cpp/ipreg/CMakeLists.darwin-x86_64.txt new file mode 100644 index 0000000000..05b000b7da --- /dev/null +++ b/library/cpp/ipreg/CMakeLists.darwin-x86_64.txt @@ -0,0 +1,53 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +get_built_tool_path( + TOOL_enum_parser_bin + TOOL_enum_parser_dependency + tools/enum_parser/enum_parser + enum_parser +) +get_built_tool_path( + TOOL_enum_parser_bin + TOOL_enum_parser_dependency + tools/enum_parser/enum_parser + enum_parser +) + +add_library(library-cpp-ipreg) +target_link_libraries(library-cpp-ipreg PUBLIC + contrib-libs-cxxsupp + yutil + cpp-getopt-small + library-cpp-json + library-cpp-geobase + library-cpp-int128 + tools-enum_parser-enum_serialization_runtime +) +target_sources(library-cpp-ipreg PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/address.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/checker.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/merge.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/range.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/reader.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/sources.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/split.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/stopwatch.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/writer.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/util_helpers.cpp +) +generate_enum_serilization(library-cpp-ipreg + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/address.h + INCLUDE_HEADERS + library/cpp/ipreg/address.h +) +generate_enum_serilization(library-cpp-ipreg + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/sources.h + INCLUDE_HEADERS + library/cpp/ipreg/sources.h +) diff --git a/library/cpp/ipreg/CMakeLists.linux-aarch64.txt b/library/cpp/ipreg/CMakeLists.linux-aarch64.txt new file mode 100644 index 0000000000..5e76739840 --- /dev/null +++ b/library/cpp/ipreg/CMakeLists.linux-aarch64.txt @@ -0,0 +1,54 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. 
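The geohash.h interface above is self-contained; a short usage sketch of the free functions (illustrative coordinates, cell sizes approximate):

#include <library/cpp/geohash/geohash.h>

#include <util/stream/output.h>

int main() {
    // Encode a lat/lon pair at character precision 6 (30 bits, roughly a
    // 1.2 km x 0.6 km cell at the equator).
    const TString hash = NGeoHash::EncodeToString(55.7558, 37.6173, 6);
    Cout << "hash: " << hash << Endl;

    // Decode back to the cell centre and to its bounding box.
    const NGeo::TPointLL centre = NGeoHash::DecodeToPoint(hash);
    const NGeoHash::TBoundingBoxLL box = NGeoHash::DecodeToBoundingBox(hash);
    Cout << centre.Lat() << ' ' << centre.Lon() << Endl;
    Cout << box.Width() << " x " << box.Height() << Endl;

    // Neighbouring cell to the north; Nothing() is returned past the poles.
    if (const TMaybe<TString> north = NGeoHash::GetNeighbour(hash, NGeoHash::NORTH)) {
        Cout << "north: " << *north << Endl;
    }

    // The 32 child cells one precision level deeper.
    for (const TString& child : NGeoHash::GetChildren(hash)) {
        Cout << child << Endl;
    }
    return 0;
}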
+ + +get_built_tool_path( + TOOL_enum_parser_bin + TOOL_enum_parser_dependency + tools/enum_parser/enum_parser + enum_parser +) +get_built_tool_path( + TOOL_enum_parser_bin + TOOL_enum_parser_dependency + tools/enum_parser/enum_parser + enum_parser +) + +add_library(library-cpp-ipreg) +target_link_libraries(library-cpp-ipreg PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil + cpp-getopt-small + library-cpp-json + library-cpp-geobase + library-cpp-int128 + tools-enum_parser-enum_serialization_runtime +) +target_sources(library-cpp-ipreg PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/address.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/checker.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/merge.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/range.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/reader.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/sources.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/split.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/stopwatch.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/writer.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/util_helpers.cpp +) +generate_enum_serilization(library-cpp-ipreg + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/address.h + INCLUDE_HEADERS + library/cpp/ipreg/address.h +) +generate_enum_serilization(library-cpp-ipreg + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/sources.h + INCLUDE_HEADERS + library/cpp/ipreg/sources.h +) diff --git a/library/cpp/ipreg/CMakeLists.linux-x86_64.txt b/library/cpp/ipreg/CMakeLists.linux-x86_64.txt new file mode 100644 index 0000000000..5e76739840 --- /dev/null +++ b/library/cpp/ipreg/CMakeLists.linux-x86_64.txt @@ -0,0 +1,54 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. 
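The repeated get_built_tool_path / generate_enum_serilization blocks in these per-platform files correspond to GENERATE_ENUM_SERIALIZATION in ya.make: tools/enum_parser is run over the listed header and emits ToString/FromString plus the GetEnumAllValues/GetEnumItemsCount helpers that geohash.cpp and geohash.h above rely on. A small sketch for the EDirection enum, assuming the consuming target links the generated serialization as these files arrange:

#include <library/cpp/geohash/direction.h>
#include <library/cpp/geohash/direction.h_serialized.h>

#include <util/stream/output.h>
#include <util/string/cast.h>

int main() {
    // Iterate the generated list of enumerators and print their names.
    for (const NGeoHash::EDirection dir : GetEnumAllValues<NGeoHash::EDirection>()) {
        Cout << ToString(dir) << Endl; // NORTH, NORTH_EAST, ...
    }
    return 0;
}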
+ + +get_built_tool_path( + TOOL_enum_parser_bin + TOOL_enum_parser_dependency + tools/enum_parser/enum_parser + enum_parser +) +get_built_tool_path( + TOOL_enum_parser_bin + TOOL_enum_parser_dependency + tools/enum_parser/enum_parser + enum_parser +) + +add_library(library-cpp-ipreg) +target_link_libraries(library-cpp-ipreg PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil + cpp-getopt-small + library-cpp-json + library-cpp-geobase + library-cpp-int128 + tools-enum_parser-enum_serialization_runtime +) +target_sources(library-cpp-ipreg PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/address.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/checker.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/merge.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/range.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/reader.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/sources.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/split.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/stopwatch.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/writer.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/util_helpers.cpp +) +generate_enum_serilization(library-cpp-ipreg + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/address.h + INCLUDE_HEADERS + library/cpp/ipreg/address.h +) +generate_enum_serilization(library-cpp-ipreg + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/sources.h + INCLUDE_HEADERS + library/cpp/ipreg/sources.h +) diff --git a/library/cpp/ipreg/CMakeLists.txt b/library/cpp/ipreg/CMakeLists.txt new file mode 100644 index 0000000000..f8b31df0c1 --- /dev/null +++ b/library/cpp/ipreg/CMakeLists.txt @@ -0,0 +1,17 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-aarch64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + include(CMakeLists.darwin-x86_64.txt) +elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA) + include(CMakeLists.windows-x86_64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-x86_64.txt) +endif() diff --git a/library/cpp/ipreg/CMakeLists.windows-x86_64.txt b/library/cpp/ipreg/CMakeLists.windows-x86_64.txt new file mode 100644 index 0000000000..05b000b7da --- /dev/null +++ b/library/cpp/ipreg/CMakeLists.windows-x86_64.txt @@ -0,0 +1,53 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. 
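These CMake lists are generated from the library's ya.make, so a consumer does not interact with them directly; a hypothetical consumer target would only declare a PEERDIR, following the same ya.make conventions used above:

# Hypothetical consumer (illustration only, not part of this commit):
PROGRAM(ipreg_example)

SRCS(
    main.cpp
)

PEERDIR(
    library/cpp/ipreg
)

END()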
+ + +get_built_tool_path( + TOOL_enum_parser_bin + TOOL_enum_parser_dependency + tools/enum_parser/enum_parser + enum_parser +) +get_built_tool_path( + TOOL_enum_parser_bin + TOOL_enum_parser_dependency + tools/enum_parser/enum_parser + enum_parser +) + +add_library(library-cpp-ipreg) +target_link_libraries(library-cpp-ipreg PUBLIC + contrib-libs-cxxsupp + yutil + cpp-getopt-small + library-cpp-json + library-cpp-geobase + library-cpp-int128 + tools-enum_parser-enum_serialization_runtime +) +target_sources(library-cpp-ipreg PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/address.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/checker.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/merge.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/range.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/reader.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/sources.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/split.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/stopwatch.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/writer.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/util_helpers.cpp +) +generate_enum_serilization(library-cpp-ipreg + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/address.h + INCLUDE_HEADERS + library/cpp/ipreg/address.h +) +generate_enum_serilization(library-cpp-ipreg + ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/sources.h + INCLUDE_HEADERS + library/cpp/ipreg/sources.h +) diff --git a/library/cpp/ipreg/address.cpp b/library/cpp/ipreg/address.cpp new file mode 100644 index 0000000000..83880ccbae --- /dev/null +++ b/library/cpp/ipreg/address.cpp @@ -0,0 +1,365 @@ +#include "address.h" + +#include <util/generic/mem_copy.h> +#include <util/stream/format.h> +#include <util/string/cast.h> +#include <util/string/hex.h> +#include <util/string/printf.h> +#include <util/string/split.h> +#include <util/string/type.h> +#include <util/string/vector.h> +#include <util/system/byteorder.h> +#include <util/network/socket.h> + +#include <sstream> + +namespace NIPREG { + +TAddress TAddress::ParseAny(TStringBuf str) { + if (str.find(':') != TStringBuf::npos) { + return ParseIPv6(str); + } else if (str.find('.') != TStringBuf::npos) { + return ParseIPv4(str); + } else if (IsNumber(str)) { + return ParseIPv4Num(str); // TODO(dieash@) IPv6Num + } + + ythrow yexception() << "Unrecognized IPREG address format: " << str; +} + +TAddress TAddress::ParseIPv6(TStringBuf str) { + TAddress addr; + if (inet_pton(AF_INET6, TString(str).c_str(), &addr.Data) != 1) + ythrow yexception() << "Failed to parse IPREG address " << str << " as IPv6"; + + return addr; +} + +TAddress TAddress::ParseIPv4(TStringBuf str) { + struct in_addr ipv4; + if (inet_aton(TString(str).c_str(), &ipv4) != 1) + ythrow yexception() << "Failed to parse IPREG address " << str << " as IPv4"; + + return FromIPv4Num(InetToHost(ipv4.s_addr)); +} + +TAddress TAddress::ParseIPv4Num(TStringBuf str) { + return FromIPv4Num(FromString<ui32>(str)); +} + +TAddress TAddress::ParseIPv6Num(TStringBuf str) { + return FromUint128(FromString<ui128>(str)); +} + +TAddress TAddress::FromBinary(unsigned char const * const data) { + TAddress addr; + MemCopy<unsigned char>(addr.Data, data, sizeof(addr.Data)); + return addr; +} + +TAddress TAddress::FromBinaryIPv4(unsigned char const * const data) { + return TAddress::FromIPv4Num( + (static_cast<ui32>(data[0]) << 24) | + (static_cast<ui32>(data[1]) << 16) | + (static_cast<ui32>(data[2]) << 8) | + (static_cast<ui32>(data[3])) + ); +} + +TAddress TAddress::FromIPv4Num(ui32 num) { + TAddress addr; + memset((void*)&addr.Data, 0x00, 10); + addr.Data[10] = 0xff; + 
addr.Data[11] = 0xff; + addr.Data[12] = (num >> 24) & 0xff; + addr.Data[13] = (num >> 16) & 0xff; + addr.Data[14] = (num >> 8) & 0xff; + addr.Data[15] = (num) & 0xff; + return addr; +} + +TAddress TAddress::FromUint128(ui128 intAddr) { + const auto hiBE = HostToInet(GetHigh(intAddr)); + const auto loBE = HostToInet(GetLow(intAddr)); + + TAddress addr; + ui64* dataPtr = reinterpret_cast<ui64*>(addr.Data); + MemCopy<ui64>(dataPtr, &hiBE, 1); + MemCopy<ui64>(dataPtr + 1, &loBE, 1); + + return addr; +} + +namespace { + void SetHostsBits(TAddress& addr, char value) { + addr.Data[ 8] = value; + addr.Data[ 9] = value; + addr.Data[10] = value; + addr.Data[11] = value; + addr.Data[12] = value; + addr.Data[13] = value; + addr.Data[14] = value; + addr.Data[15] = value; + } +} // anon-ns + +TAddress TAddress::MakeNet64Broadcast(TAddress base) { + SetHostsBits(base, 0xff); + return base; +} + +TAddress TAddress::MakeNet64Prefix(TAddress base) { + SetHostsBits(base, 0x00); + return base; +} + +const TAddress& TAddress::Lowest() { + static const TAddress first{{}}; + return first; +} + +const TAddress& TAddress::Highest() { + static const TAddress last{{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}}; + return last; +} + +TString TAddress::AsIPv4() const { + return ToString(Data[12]) + "." + ToString(Data[13]) + "." + ToString(Data[14]) + "." + ToString(Data[15]); +} + +TString TAddress::AsIPv4Num() const { + ui32 addr = (ui32)Data[12] << 24 | (ui32)Data[13] << 16 | (ui32)Data[14] << 8 | Data[15]; + return ToString(addr); +} + +TString TAddress::AsIPv6() const { + TStringStream ss; + + for (size_t octet = 0; octet < sizeof(Data); octet++) { + ss << Hex(Data[octet], HF_FULL); + if (octet < 15 && octet & 1) + ss << ':'; + } + + TString s = ss.Str(); + s.to_lower(); + + return s; +} + +TString TAddress::AsIPv6Num() const { + return ToString(AsUint128()); +} + +TString TAddress::GetTextFromNetOrder() const { + char buf[INET6_ADDRSTRLEN]; + if (inet_ntop(AF_INET6, (void*)(&Data), buf, sizeof(buf)) == NULL) + ythrow yexception() << "Failed to stringify IPREG address"; + + return buf; +} + +namespace { + TString GetHexStr(ui64 v) { + return HexEncode(reinterpret_cast<const char*>(&v), sizeof(v)); + } + + void HexDumpToStream(std::stringstream& ss, ui64 beData) { + const auto dataHexStr = GetHexStr(beData); + const auto hostData = InetToHost(beData); + const auto hostDataStr = GetHexStr(hostData); + ss << "\t/big-end[" << beData << " / " << dataHexStr << "]\t/host[" << hostData << " / " << hostDataStr << "]\n"; + } +} // anon-ns + +TString TAddress::GetHexString(const bool deepView) const { + std::stringstream ss; + ss << HexEncode(TStringBuf(reinterpret_cast<const char*>(Data), 16)); + if (deepView) { + const ui64* dataPtr = reinterpret_cast<const ui64*>(Data); + + const auto hi = *dataPtr; + ss << "\nhigh-data"; HexDumpToStream(ss, hi); + + const auto lo = *(dataPtr + 1); + ss << "\nlow-data"; HexDumpToStream(ss, lo); + } + return ss.str().c_str(); +} + +TString TAddress::AsShortIP() const { + if (IsIPv4()) + return AsIPv4(); + else + return GetTextFromNetOrder(); +} + +TString TAddress::AsShortIPv6() const { + if (IsIPv4()) + return Sprintf("::ffff:%x:%x", (ui32)Data[12] << 8 | (ui32)Data[13], (ui32)Data[14] << 8 | (ui32)Data[15]); + else + return GetTextFromNetOrder(); +} + +TString TAddress::AsLongIP() const { + if (IsIPv4()) + return AsIPv4(); + else + return AsIPv6(); +} + +ui128 TAddress::AsUint128() const { + const ui64* dataPtr = reinterpret_cast<const 
ui64*>(Data); + return ui128(InetToHost(*dataPtr), InetToHost(*(dataPtr + 1))); +} + +ui64 TAddress::GetHigh64() const { + const ui64* dataPtr = reinterpret_cast<const ui64*>(Data); + return *dataPtr; +} + +ui64 TAddress::GetLow64() const { + const ui64* dataPtr = reinterpret_cast<const ui64*>(Data); + return *(dataPtr + 1); +} + +ui64 TAddress::GetHigh64LE() const { + return InetToHost(GetHigh64()); +} + +ui64 TAddress::GetLow64LE() const { + return InetToHost(GetLow64()); +} + +bool TAddress::IsNet64Broadcast() const { + static const auto NET64_HOSTS_MASK = TAddress::ParseAny("::ffff:ffff:ffff:ffff").GetLow64(); + const auto ownHostsBits = GetLow64(); + return ownHostsBits == NET64_HOSTS_MASK; +} + +bool TAddress::IsNet64Host() const { + const auto isSomeOwnHostsBitsOn = GetLow64() > 0; + return isSomeOwnHostsBitsOn && !IsNet64Broadcast(); +} + +TString TAddress::Format(EAddressFormat format) const { + switch (format) { + case EAddressFormat::IPV6: + return AsIPv6(); + case EAddressFormat::LONG_IP: + return AsLongIP(); + case EAddressFormat::SHORT_IP: + return AsShortIP(); + case EAddressFormat::NUMERIC_IPV4: + return AsIPv4Num(); + case EAddressFormat::NUMERIC_IPV6: + return AsIPv6Num(); + case EAddressFormat::NTOA: + return GetTextFromNetOrder(); + case EAddressFormat::SHORT_IPV6: + return AsShortIPv6(); + } +} + +bool TAddress::IsIPv4() const { + static const unsigned char mask[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff }; + return memcmp(Data, mask, sizeof(mask)) == 0; +} + +TAddress TAddress::Next() const { + if (Highest() == *this) { + return Highest(); + } + + TAddress addr; + bool carry = 1; + for (ssize_t octet = 15; octet >= 0; octet--) { + addr.Data[octet] = Data[octet] + carry; + carry = carry && !addr.Data[octet]; + } + + return addr; +} + +TAddress TAddress::Prev() const { + if (Lowest() == *this) { + return Lowest(); + } + + TAddress addr{}; + bool carry = 1; + for (ssize_t octet = 15; octet >= 0; octet--) { + addr.Data[octet] = Data[octet] - carry; + carry = carry && !Data[octet]; + } + + return addr; +} + +double TAddress::operator-(const TAddress& rhs) const { + double diff = 0.0; + for (ssize_t octet = 0; octet < 16; octet++) { + diff = diff * 256.0 + (static_cast<int>(Data[octet]) - static_cast<int>(rhs.Data[octet])); + } + return diff; +} + +ui128 TAddress::Distance(const TAddress& a, const TAddress& b) { + const auto& intA = a.AsUint128(); + const auto& intB = b.AsUint128(); + return (a > b) ? 
(intA - intB) : (intB - intA); +} + +namespace { + constexpr size_t MAX_IPV6_MASK_LEN = 16 * 8; + constexpr size_t MAX_IPV4_MASK_LEN = 4 * 8; + constexpr size_t IPV4_IN6_MASK_BASE = MAX_IPV6_MASK_LEN - MAX_IPV4_MASK_LEN; + + TAddress SetMaskBits(const TAddress& addr, const size_t wantedMaskLen) { + auto maskLen = wantedMaskLen; + if (addr.IsIPv4() && maskLen && maskLen <= MAX_IPV4_MASK_LEN) { + maskLen += IPV4_IN6_MASK_BASE; + } + + if (maskLen == 0 || maskLen > MAX_IPV6_MASK_LEN || (addr.IsIPv4() && maskLen < IPV4_IN6_MASK_BASE)) { + ythrow yexception() << "strange mask (calc/wanted) " << maskLen << "/" << wantedMaskLen << "; " << addr; + } + + const int octetsForUpdate = (MAX_IPV6_MASK_LEN - maskLen) / 8; + const int bitsForUpdate = (MAX_IPV6_MASK_LEN - maskLen) % 8; + + size_t currOctet = 15; + TAddress addrWithMask = addr; + + for (int octetNum = 0; octetNum != octetsForUpdate; ++octetNum) { + addrWithMask.Data[currOctet--] = 0xff; + } + + for (int bitNum = 0; bitNum != bitsForUpdate; ++bitNum) { + addrWithMask.Data[currOctet] ^= 1 << bitNum; + } + + return addrWithMask; + } +} // anon-ns + +TNetwork::TNetwork(const TString& str) + : TNetwork(static_cast<TVector<TString>>(StringSplitter(str).Split('/').SkipEmpty())) +{} + +TNetwork::TNetwork(const TVector<TString>& data) + : TNetwork(data.size() ? data[0] : "", + data.size() > 1 ? FromStringWithDefault<size_t>(data[1]) : 0) +{} + +TNetwork::TNetwork(const TString& net, size_t maskLen) + : begin(TAddress::ParseAny(net)) + , end(SetMaskBits(begin, maskLen)) +{} + +} + +IOutputStream& operator<<(IOutputStream& output, const NIPREG::TAddress& addr) { + output << addr.AsShortIPv6(); + return output; +} diff --git a/library/cpp/ipreg/address.h b/library/cpp/ipreg/address.h new file mode 100644 index 0000000000..9071418d5b --- /dev/null +++ b/library/cpp/ipreg/address.h @@ -0,0 +1,137 @@ +#pragma once + +#include <library/cpp/int128/int128.h> + +#include <util/generic/string.h> +#include <util/digest/murmur.h> +#include <util/string/cast.h> + +namespace NIPREG { + +struct TAddress { + enum class EAddressFormat { + IPV6 = 0x00 /* "ipv6" */, + LONG_IP = 0x01 /* "long" */, + SHORT_IP = 0x02 /* "short" */, + NUMERIC_IPV4 = 0x03 /* "num4" */, + NTOA = 0x04 /* "n2a" */, + SHORT_IPV6 = 0x05 /* "short-ipv6" */, + NUMERIC_IPV6 = 0x06 /* "num" */, + }; + + unsigned char Data[16] = {0}; // NOTA BENE: network byte order (Big-Endian) + + // Comparison + bool operator==(const TAddress& other) const { + return memcmp(Data, other.Data, sizeof(Data)) == 0; + } + + bool operator<(const TAddress& other) const { + return memcmp(Data, other.Data, sizeof(Data)) < 0; + } + + bool operator>(const TAddress& other) const { + return memcmp(Data, other.Data, sizeof(Data)) > 0; + } + + bool operator!=(const TAddress& other) const { + return !(*this == other); + } + + bool operator<=(const TAddress& other) const { + return !(*this > other); + } + + bool operator>=(const TAddress& other) const { + return !(*this < other); + } + + double operator-(const TAddress& rhs) const; + + // Parsing + static TAddress ParseAny(TStringBuf str); + + static TAddress ParseIPv6(TStringBuf str); + static TAddress ParseIPv4(TStringBuf str); + static TAddress ParseIPv4Num(TStringBuf str); + static TAddress ParseIPv6Num(TStringBuf str); + + static TAddress FromIPv4Num(ui32 num); + static TAddress FromUint128(ui128 addr); + static TAddress FromBinary(unsigned char const * data); + static TAddress FromBinaryIPv4(unsigned char const * const data); + + static TAddress MakeNet64Broadcast(TAddress 
base); + static TAddress MakeNet64Prefix(TAddress base); + + static const TAddress& Lowest(); + static const TAddress& Highest(); + + // Inspecting + TString AsIPv4() const; + TString AsIPv4Num() const; + TString AsIPv6() const; + TString AsIPv6Num() const; + TString GetTextFromNetOrder() const; + TString GetHexString(bool deepView = false) const; + + TString AsShortIP() const; + TString AsShortIPv6() const; + TString AsLongIP() const; + + ui128 AsUint128() const; + ui64 GetHigh64() const; + ui64 GetLow64() const; + ui64 GetHigh64LE() const; + ui64 GetLow64LE() const; + + bool IsNet64Broadcast() const; + bool IsNet64Host() const; + + TAddress GetNet64() const { + return TAddress::FromUint128(ui128{GetHigh64LE()} << 64); + } + + TAddress GetPrevNet64() const { + return TAddress::FromUint128(ui128{GetHigh64LE() - 1} << 64); + } + + TAddress GetNextNet64() const { + return TAddress::FromUint128(ui128{GetHigh64LE() + 1} << 64); + } + + TString Format(EAddressFormat format) const; + + int GetType() const { return IsIPv4() ? 4 : 6; } + bool IsIPv4() const; + + // Mutating + TAddress Next() const; + TAddress Prev() const; + + static ui128 Distance(const TAddress& a, const TAddress& b); +}; + +using EAddressFormat = TAddress::EAddressFormat; + +struct TNetwork { + TAddress begin; + TAddress end; + + TNetwork(const TString& str = "0.0.0.0/32"); + +private: + TNetwork(const TVector<TString>& data); + TNetwork(const TString& net, size_t mask); +}; + +} // NIPREG + +template <> +struct THash<NIPREG::TAddress> { + inline size_t operator()(const NIPREG::TAddress& address) const { + return MurmurHash<size_t>((const void*)address.Data, 16); + } +}; + +IOutputStream& operator<<(IOutputStream& output, const NIPREG::TAddress& addr); diff --git a/library/cpp/ipreg/checker.cpp b/library/cpp/ipreg/checker.cpp new file mode 100644 index 0000000000..9c41d27dc0 --- /dev/null +++ b/library/cpp/ipreg/checker.cpp @@ -0,0 +1,47 @@ +#include "checker.h" + +namespace NIPREG { + +void TChecker::CheckNextFatal(const TAddress& first, const TAddress& last) { + if (!CheckNext(first, last)) + ythrow yexception() << "IPREG format error: " << first.AsIPv6() << " - " << last.AsIPv6(); +} + +TFlatChecker::TFlatChecker() : HasState(false) { +} + +bool TFlatChecker::CheckNext(const TAddress& first, const TAddress& last) { + bool result = true; + + if (first > last) + result = false; + + if (HasState && first <= PrevLast) + result = false; + + PrevLast = last; + HasState = true; + + return result; +} + +TIntersectingChecker::TIntersectingChecker() : HasState(false) { +} + +bool TIntersectingChecker::CheckNext(const TAddress& first, const TAddress& last) { + bool result = true; + + if (first > last) + result = false; + + if (HasState && (first < PrevFirst || (first == PrevFirst && last < PrevLast))) + result = false; + + PrevFirst = first; + PrevLast = last; + HasState = true; + + return result; +} + +} diff --git a/library/cpp/ipreg/checker.h b/library/cpp/ipreg/checker.h new file mode 100644 index 0000000000..1a04e62e77 --- /dev/null +++ b/library/cpp/ipreg/checker.h @@ -0,0 +1,37 @@ +#pragma once + +#include "address.h" + +namespace NIPREG { + +class TChecker { +public: + virtual ~TChecker() {} + + virtual bool CheckNext(const TAddress& first, const TAddress& last) = 0; + + void CheckNextFatal(const TAddress& first, const TAddress& last); +}; + +class TFlatChecker: public TChecker { +private: + TAddress PrevLast; + bool HasState; + +public: + TFlatChecker(); + virtual bool CheckNext(const TAddress& first, const TAddress& last); 
+}; + +class TIntersectingChecker: public TChecker { +private: + TAddress PrevFirst; + TAddress PrevLast; + bool HasState; + +public: + TIntersectingChecker(); + virtual bool CheckNext(const TAddress& first, const TAddress& last); +}; + +} diff --git a/library/cpp/ipreg/merge.cpp b/library/cpp/ipreg/merge.cpp new file mode 100644 index 0000000000..d31e9dce5d --- /dev/null +++ b/library/cpp/ipreg/merge.cpp @@ -0,0 +1,69 @@ +#include "merge.h" + +namespace NIPREG { + +void MergeIPREGS(TReader &a, TReader& b, std::function<void(const TAddress& first, const TAddress& last, const TString *a, const TString *b)>&& proc) { + bool hasA = a.Next(); + bool hasB = b.Next(); + + TAddress top = TAddress::Lowest(); + TAddress bottom; + + do { + // tweak ranges we've passed + if (hasA && top > a.Get().Last) + hasA = a.Next(); + if (hasB && top > b.Get().Last) + hasB = b.Next(); + + if (!hasA && !hasB) { + // both rangesets have ended + bottom = TAddress::Highest(); + proc(top, bottom, nullptr, nullptr); + break; + } + + const bool inA = hasA && a.Get().First <= top; + const bool inB = hasB && b.Get().First <= top; + + if (!hasA) { + // rangeset a has ended + if (inB) { + bottom = b.Get().Last; + proc(top, bottom, nullptr, &b.Get().Data); + } else { + bottom = b.Get().First.Prev(); + proc(top, bottom, nullptr, nullptr); + } + } else if (!hasB) { + // rangeset b has ended + if (inA) { + bottom = a.Get().Last; + proc(top, bottom, &a.Get().Data, nullptr); + } else { + bottom = a.Get().First.Prev(); + proc(top, bottom, nullptr, nullptr); + } + } else if (inA && inB) { + // inside both ranges + bottom = Min(a.Get().Last, b.Get().Last); + proc(top, bottom, &a.Get().Data, &b.Get().Data); + } else if (inA) { + // only in range a + bottom = Min(a.Get().Last, b.Get().First.Prev()); + proc(top, bottom, &a.Get().Data, nullptr); + } else if (inB) { + // only in range b + bottom = Min(b.Get().Last, a.Get().First.Prev()); + proc(top, bottom, nullptr, &b.Get().Data); + } else { + // outside both ranges + bottom = Min(a.Get().First.Prev(), a.Get().First.Prev()); + proc(top, bottom, nullptr, nullptr); + } + + top = bottom.Next(); + } while (bottom != TAddress::Highest()); +} + +} diff --git a/library/cpp/ipreg/merge.h b/library/cpp/ipreg/merge.h new file mode 100644 index 0000000000..123b88276c --- /dev/null +++ b/library/cpp/ipreg/merge.h @@ -0,0 +1,11 @@ +#pragma once + +#include "reader.h" + +#include <functional> + +namespace NIPREG { + +void MergeIPREGS(TReader &a, TReader& b, std::function<void(const TAddress& first, const TAddress& last, const TString *a, const TString *b)>&& proc); + +} diff --git a/library/cpp/ipreg/range.cpp b/library/cpp/ipreg/range.cpp new file mode 100644 index 0000000000..1b90022482 --- /dev/null +++ b/library/cpp/ipreg/range.cpp @@ -0,0 +1,198 @@ +#include "range.h" + +#include "util_helpers.h" + +#include <library/cpp/int128/int128.h> +#include <util/generic/maybe.h> +#include <util/string/split.h> +#include <util/string/vector.h> + +#include <stdexcept> + +namespace NIPREG { + +namespace { + EAddressFormat CurrentFormat = EAddressFormat::SHORT_IPV6; + + void throwExceptionWithFormat(const TString& line) { + throw yexception() << "wanted format: ${ip-begin}-${ip-end}[\t${data}]; $input := '" << line << "'"; + } + + void throwIfReverseOrder(TAddress first, TAddress last) { + if (first > last) { + const TString err_msg = "reverse order of addresses (first / last) => " + first.AsIPv6() + " / " + last.AsIPv6(); + throw std::runtime_error(err_msg.data()); + } + } +} // anon-ns + 
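Putting the NIPREG pieces above together, a minimal sketch with illustrative addresses and payload; only calls declared in address.h, range.h and checker.h are used:

#include <library/cpp/ipreg/address.h>
#include <library/cpp/ipreg/checker.h>
#include <library/cpp/ipreg/range.h>

#include <util/stream/output.h>

int main() {
    using namespace NIPREG;

    // Addresses are 16 bytes in network byte order; IPv4 input is mapped
    // into the ::ffff:0:0/96 prefix by ParseAny().
    const TAddress a = TAddress::ParseAny("192.168.0.1");
    const TAddress b = TAddress::ParseAny("2a02:6b8::1");
    Cout << a << " -> " << a.Next() << Endl;          // printed as short IPv6
    Cout << a.IsIPv4() << ' ' << b.IsIPv4() << Endl;  // 1 0

    // Ranges are parsed from "first-last<TAB>data" lines.
    const TRange range = TRange::BuildRange("192.168.0.0-192.168.0.255\tsome-payload");
    Cout << range.Contains(a) << Endl;                // 1
    Cout << ToString(range.GetAddrsQty()) << Endl;    // 256

    // TFlatChecker enforces that consecutive ranges are sorted and disjoint;
    // CheckNextFatal() throws on a violation.
    TFlatChecker checker;
    checker.CheckNextFatal(range.First, range.Last);
    return 0;
}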
+TRange::TRange(TAddress first, TAddress last, const TString& data) + : First(first) + , Last(last) + , Data(data) +{ + throwIfReverseOrder(First, Last); +} + +TRange::TRange(const TNetwork& net, const TString& data) + : TRange(net.begin, net.end, data) +{ +} + +ui128 TRange::GetAddrsQty() const { + return TAddress::Distance(First, Last) + 1; +} + +TRange TRange::BuildRange(const TString& line, bool isEmptyData, const TString& dataDelim) { + const TVector<TString> parts = StringSplitter(line).SplitBySet(dataDelim.data()).SkipEmpty(); + if (parts.empty()) { + throwExceptionWithFormat(line); + } + + if (TString::npos != parts[0].find('/')) { + const auto data = (2 == parts.size()) ? parts[1] : ""; + return TRange(TNetwork(parts[0]), data); + } + + const TVector<TString> range_parts = StringSplitter(parts[0]).SplitBySet(" -\t").SkipEmpty(); + if (2 != range_parts.size() || range_parts[0].empty() || range_parts[1].empty()) { + throwExceptionWithFormat(line); + } + + if (!isEmptyData && (2 != parts.size() || parts[1].empty())) { + throwExceptionWithFormat(line); + } + + const auto& data = (2 == parts.size()) ? parts[1] : ""; + return TRange(TAddress::ParseAny(range_parts[0]), TAddress::ParseAny(range_parts[1]), data); +} + +bool TRange::Contains(const TRange& range) const { + return First <= range.First && range.Last <= Last; +} + +bool TRange::Contains(const TAddress& ip) const { + return First <= ip && ip <= Last; +} + +void SetIpFullOutFormat() { + CurrentFormat = EAddressFormat::IPV6; +} + +void SetIpShortOutFormat() { + CurrentFormat = EAddressFormat::SHORT_IPV6; +} + +void TRange::DumpTo(IOutputStream& output, bool withData, EAddressFormat format) const { + output << First.Format(format) << '-' << Last.Format(format); + if (withData) { + output << '\t' << Data; + } +} + +bool TRange::IsIpv6Only() const { + return 6 == First.GetType() && 6 == Last.GetType(); +} + +bool TRange::IsIpv4Only() const { + return 4 == First.GetType() && 4 == Last.GetType(); +} + +bool TRange::IsRangeInSingleNet64() const { + return First.GetHigh64() == Last.GetHigh64(); +} + +TRange TRange::BuildRangeByFirst(const TRange& range, int prefix) { + Y_UNUSED(prefix); + return TRange(TAddress::MakeNet64Prefix(range.First), + TAddress::MakeNet64Broadcast(range.IsRangeInSingleNet64() ? range.Last : range.Last.GetPrevNet64()) , + range.Data + ); +} + +TRange TRange::BuildRangeByLast(const TRange& range, int prefix) { + Y_UNUSED(prefix); + const auto prevLast = TAddress::MakeNet64Broadcast(range.Last.GetPrevNet64()); + return TRange(range.First, prevLast, range.Data); +// const auto prevLast = TAddress::MakeNet64Broadcast(range.Last); +// return TRange(TAddress::MakeNet64Prefix(range.First), prevLast, range.Data); +} + +TVector<TRange> SplitRangeNets(const TRange& origRange, bool addOrigSize, int maskLen) { + Y_UNUSED(maskLen); + + static const auto firstCheckedIpv6Prefix = TAddress::ParseAny("2000::"); + + const auto& CalcNetSize = [&](const TRange& range) { + static const auto MAX_FOR_DIGITS_ANSWER = ui128{1 << 30}; + const auto netSize = range.GetAddrsQty(); + return (netSize < MAX_FOR_DIGITS_ANSWER) ? 
ToString(netSize) : "huge"; + }; + + const auto& AddSizeField = [&](TRange& changedRange, const TRange& origAddrRange) { + if (addOrigSize) { + changedRange.Data = AddJsonAttrs({"orig_net_size"}, changedRange.Data, TMaybe<TString>(CalcNetSize(origAddrRange))); + } + }; + + if (origRange.Last <= firstCheckedIpv6Prefix) { + return {origRange}; + } + + if (origRange.IsRangeInSingleNet64()) { + TRange theOne{ + TAddress::MakeNet64Prefix(origRange.First), + TAddress::MakeNet64Broadcast(origRange.Last), + origRange.Data + }; + AddSizeField(theOne, origRange); + return {theOne}; + } + + TRange range{origRange}; + TVector<TRange> result; { + // 1st + TRange byFirst{TAddress::MakeNet64Prefix(range.First),TAddress::MakeNet64Broadcast(range.First), range.Data}; + AddSizeField(byFirst, {range.First, byFirst.Last, ""}); + result.push_back(byFirst); + + // maybe 2nd + range.First = byFirst.Last.Next(); + if (!range.IsRangeInSingleNet64()) { + const TAddress lastPrefix = TAddress::MakeNet64Prefix(range.Last); + + TRange inTheMiddle{TAddress::MakeNet64Prefix(range.First), lastPrefix.Prev(), range.Data}; + AddSizeField(inTheMiddle, inTheMiddle); + result.push_back(inTheMiddle); + + range.First = lastPrefix; + } + + // the last + TRange byLast{range.First, TAddress::MakeNet64Broadcast(range.Last), range.Data}; + AddSizeField(byLast, {byLast.First, range.Last, ""}); + result.push_back(byLast); + } + return result; +} + +bool operator==(const TRange& lhs, const TRange& rhs) { + return lhs.First == rhs.First && lhs.Last == rhs.Last; +} + +} // ns IPREG + +IInputStream& operator>>(IInputStream& input, NIPREG::TRange& range) { + TString line; + if (!input.ReadLine(line)) { + throw std::runtime_error("unable to load data from stream"); + } + range = NIPREG::TRange::BuildRange(line); + return input; +} + +IOutputStream& operator<<(IOutputStream& output, const NIPREG::TRange& range) { + range.DumpTo(output, true, NIPREG::CurrentFormat); + output << "\n"; + return output; +} diff --git a/library/cpp/ipreg/range.h b/library/cpp/ipreg/range.h new file mode 100644 index 0000000000..15b2c693b0 --- /dev/null +++ b/library/cpp/ipreg/range.h @@ -0,0 +1,50 @@ +#pragma once + +#include "address.h" + +#include <util/generic/string.h> +#include <util/generic/vector.h> +#include <util/stream/input.h> +#include <util/stream/output.h> + +#include <stdexcept> + +namespace NIPREG { + +struct TRange { + TAddress First; + TAddress Last; + TString Data; + + TRange() = default; + TRange(TAddress first, TAddress last, const TString& data); + TRange(const TNetwork& net, const TString& data); + + ui128 GetAddrsQty() const; + void DumpTo(IOutputStream& output, bool withData = true, EAddressFormat format = EAddressFormat::SHORT_IP) const; + + static TRange BuildRange(const TString& line, bool isEmptyData = false, const TString& dataDelim = "\t"); + bool Contains(const TRange& range) const; + bool Contains(const TAddress& ip) const; + + static TRange BuildRangeByFirst(const TRange& range, int prefix = 64); + static TRange BuildRangeByLast(const TRange& range, int prefix = 64); + + bool IsIpv6Only() const; + bool IsIpv4Only() const; + + bool IsRangeInSingleNet64() const; +}; +using TGenericEntry = TRange; + +void SetIpFullOutFormat(); +void SetIpShortOutFormat(); + +TVector<TRange> SplitRangeNets(const TRange& range, bool addOrigSize = false, int maskLen = 64); + +bool operator==(const TRange& lhs, const TRange& rhs); +inline bool operator!=(const TRange& lhs, const TRange& rhs) { return !(lhs == rhs); } +} // ns NIPREG + +IInputStream& 
operator>>(IInputStream& input, NIPREG::TRange& range); +IOutputStream& operator<<(IOutputStream& output, const NIPREG::TRange& range); diff --git a/library/cpp/ipreg/reader.cpp b/library/cpp/ipreg/reader.cpp new file mode 100644 index 0000000000..2e4ae1b178 --- /dev/null +++ b/library/cpp/ipreg/reader.cpp @@ -0,0 +1,82 @@ +#include "reader.h" + +#include <util/stream/file.h> + +namespace NIPREG { + +namespace { + const TString DASH_FNAME = "-"; +} + +TReader::TReader(const TString& filename, bool isEmptyData, const TString& dataDelim) + : OwnedStreamPtr((filename.empty() || filename == DASH_FNAME) ? nullptr : new TFileInput(filename)) + , Stream(OwnedStreamPtr ? *OwnedStreamPtr.Get() : Cin) + , IsEmptyData(isEmptyData) + , DataDelim(dataDelim) +{ +} + +TReader::TReader(IInputStream& stream, bool isEmptyData, const TString& dataDelim) + : Stream(stream) + , IsEmptyData(isEmptyData) + , DataDelim(dataDelim) +{ +} + +bool TReader::Next() { + TString line; + if (!Stream.ReadLine(line)) + return false; + + CurrentEntry = TRange::BuildRange(line, IsEmptyData, DataDelim); + if (CurrentEntry.Data.empty()) { + if (!IsEmptyData) { + throw yexception() << "empty data part detected for [" << line << "]"; + } + CurrentEntry.Data = ""; + } + return true; +} + +TReverseByLastIpReader::TReverseByLastIpReader(const TString& filename, bool isEmptyData, const TString& dataDelim) + : TParent(filename, isEmptyData, dataDelim) +{ + Valid = TParent::Next(); +} + +TReverseByLastIpReader::TReverseByLastIpReader(IInputStream& stream, bool isEmptyData, const TString& dataDelim) + : TParent(stream, isEmptyData, dataDelim) +{ + Valid = TParent::Next(); +} + +bool TReverseByLastIpReader::Next() { + if (!CurrentEntries.empty()) { + CurrentEntries.pop_back(); + } + + if (CurrentEntries.empty()) { + return PrepareNextEntries(); + } else { + return true; + } +} + +const TGenericEntry& TReverseByLastIpReader::Get() const { + return CurrentEntries.back(); +} + +bool TReverseByLastIpReader::PrepareNextEntries() { + if (!Valid) { + return false; + } + + do { + CurrentEntries.push_back(TParent::Get()); + Valid = TParent::Next(); + } while (Valid && TParent::Get().First == CurrentEntries.back().First); + + return true; +} + +} // NIPREG diff --git a/library/cpp/ipreg/reader.h b/library/cpp/ipreg/reader.h new file mode 100644 index 0000000000..b68faedcf9 --- /dev/null +++ b/library/cpp/ipreg/reader.h @@ -0,0 +1,57 @@ +#pragma once + +#include "range.h" + +#include <util/generic/ptr.h> +#include <util/generic/string.h> +#include <util/stream/input.h> + +namespace NIPREG { + +class TReader { +public: + TReader(const TString& filename = "", bool isEmptyData = false, const TString& dataDelim = "\t"); + TReader(IInputStream& stream, bool isEmptyData = false, const TString& dataDelim = "\t"); + + virtual bool Next(); + + virtual const TGenericEntry& Get() const { + return CurrentEntry; + } + + operator IInputStream&() { + return Stream; + } + + virtual ~TReader() = default; + +private: + TAutoPtr<IInputStream> OwnedStreamPtr; + IInputStream& Stream; + + bool IsEmptyData = false; + const TString DataDelim; + + TGenericEntry CurrentEntry; +}; + +class TReverseByLastIpReader : public TReader { +public: + using TParent = TReader; + + explicit TReverseByLastIpReader(const TString& filename = "", bool isEmptyData = false, const TString& dataDelim = "\t"); + explicit TReverseByLastIpReader(IInputStream& stream, bool isEmptyData = false, const TString& dataDelim = "\t"); + + bool Next() override; + + const TGenericEntry& Get() const override; 
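+ // Note: entries whose First address equals that of the previously read entry are buffered together; + // Get() returns the most recently buffered entry and Next() pops it, so a run of equal-First ranges comes back in reverse input order. + // Minimal usage sketch (illustrative only; "ranges.tsv" is a hypothetical file name): + //   NIPREG::TReverseByLastIpReader reader("ranges.tsv"); + //   while (reader.Next()) { + //       const NIPREG::TGenericEntry& entry = reader.Get(); // TRange with First/Last/Data + //   }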
+ +private: + bool PrepareNextEntries(); + +private: + bool Valid = false; + TVector<TGenericEntry> CurrentEntries; +}; + +} // NIPREG diff --git a/library/cpp/ipreg/sources.cpp b/library/cpp/ipreg/sources.cpp new file mode 100644 index 0000000000..70e4b2a6da --- /dev/null +++ b/library/cpp/ipreg/sources.cpp @@ -0,0 +1,100 @@ +#include "sources.h" + +#include <cstdint> +#include <stdexcept> + +namespace NIPREG { + +const ui32 ML_COEFF_DEFAULT = 50000; +ui32 ML_COEFFICIENT = ML_COEFF_DEFAULT; + +void SetCoefficient(ui32 type, ui32 value) { + switch (type) { + case SOURCE_ML: + ML_COEFFICIENT = value; + break; + default: + throw std::runtime_error("unsupported setcoeff-type"); + } +} + +double GetSourceCoefficient(ui32 type) { + switch (type) { + case SOURCE_MAIL: return 1; + case SOURCE_PHONE: return 3; + case SOURCE_GEO: return 4; + case SOURCE_COUNTRY: return 100; + case SOURCE_DOMAIN_NAME: return 1; + case SOURCE_MANUAL: return 1; + case SOURCE_YANDEX_NETWORK: return 1000; // NB: in yandex_noc source weight := 10K + case SOURCE_SPECIAL_NETWORK: return 1000000; + case SOURCE_PROVIDERS: return 50; + case SOURCE_MAXMIND: return 4; + case SOURCE_UNITED_UID_YANDEX_MAPS: return 0.7; + case SOURCE_RELIABILITY_AROUND: return 1; + case SOURCE_UNITED_UID_WEATHER: return 0.9; + case SOURCE_UNITED_UID_YANDEX_GID: return 1; + case SOURCE_UNITED_UID_SEARCH_QUERY: return 1.5; + case SOURCE_UNITED_UID_SEARCH_IN_REG: return 2; + case SOURCE_BGP_ASPATH_COMMUNITY: return 10; + case SOURCE_ML: return ML_COEFFICIENT; + } + return 0; +} + +bool SourceWantApplyDepthCoeff(ui32 source_type) { + switch (source_type) { + case SOURCE_MAIL: + case SOURCE_PHONE: + case SOURCE_GEO: + case SOURCE_COUNTRY: + case SOURCE_DOMAIN_NAME: + return true; + default: + return false; + } +} + +bool SourceWantApplyNetsizeCoeff(ui32 source_type) { + return SourceWantApplyDepthCoeff(source_type); +} + +bool SourceIsHuman(ui32 source_type) { + switch (source_type) { + case SOURCE_UNITED_UID_SEARCH_QUERY: + case SOURCE_UNITED_UID_SEARCH_IN_REG: + case SOURCE_UNITED_UID_WEATHER: + case SOURCE_UNITED_UID_YANDEX_GID: + case SOURCE_UNITED_UID_YANDEX_MAPS: + return true; + default: + return false; + } +} + +bool SourceIsForRegionNormalize(ui32 source_type) { + return SourceIsHuman(source_type); +} + +bool SourceIsForEnoughHumanData(ui32 source_type) { + switch (source_type) { + case SOURCE_COUNTRY: + case SOURCE_MANUAL: + case SOURCE_PROVIDERS: + case SOURCE_YANDEX_NETWORK: + case SOURCE_SPECIAL_NETWORK: + return true; + default: + return SourceIsHuman(source_type); + } +} + +bool SourceIsForFewHumanData(ui32 source_type) { + return !SourceIsHuman(source_type); +} + +bool SourceIsForReliability(ui32 source_type) { + return SourceIsHuman(source_type) || SOURCE_YANDEX_NETWORK == source_type; +} + +} // NIPREG diff --git a/library/cpp/ipreg/sources.h b/library/cpp/ipreg/sources.h new file mode 100644 index 0000000000..a517e57cb8 --- /dev/null +++ b/library/cpp/ipreg/sources.h @@ -0,0 +1,53 @@ +#pragma once + +#include <util/system/types.h> + +namespace NIPREG { + +// TODO(dieash@) make some automation/spicification via enabled sources (with full list) +enum ESourceType { + // TODO(dieash@) full list of known src-types in choice-region-data: + // https://yql.yandex-team.ru/Operations/XEo-amim9Z2_PCkcZgQ0Wu-sqXAm1K8NMPesswuPzbk= + SOURCE_UNKNOWN = 0, // stub + SOURCE_MAIL = 1 /* "MAIL" */, // ripe src + SOURCE_PHONE = 2 /* "PHONE" */, // ripe src + SOURCE_GEO = 3 /* "GEO" */, // ripe src + SOURCE_COUNTRY = 4 /* "COUNTRY" */, // ripe, delegated, 
maxmind src + SOURCE_DOMAIN_NAME = 5 /* "DOMAIN_NAME" */, // ripe src + SOURCE_MANUAL = 6 /* "MANUAL" */, // manual src + SOURCE_YANDEX_NETWORK = 9 /* "YANDEX_NETWORK" */, // yandex-noc src + SOURCE_SPECIAL_NETWORK = 10 /* "SPECIAL_NETWORK" */, // spec-net src + SOURCE_PROVIDERS = 15 /* "PROVIDERS" */, // ripe src + SOURCE_MAXMIND = 17 /* "MAXMIND" */, // maxmind src + SOURCE_UNITED_UID_YANDEX_MAPS = 19 /* "UNITED_UID_YANDEX_MAPS" */, // uuid src + SOURCE_RELIABILITY_AROUND = 20 /* "RELIABILITY_AROUND" */, // rel-around src + SOURCE_UNITED_UID_WEATHER = 21 /* "UNITED_UID_WEATHER" */, // uuid src + SOURCE_UNITED_UID_YANDEX_GID = 22 /* "UNITED_UID_YANDEX_GID" */, // uuid src + SOURCE_UNITED_UID_SEARCH_QUERY = 23 /* "UNITED_UID_SEARCH_QUERY" */, // uuid src + SOURCE_UNITED_UID_SEARCH_IN_REG = 24 /* "UNITED_UID_SEARCH_IN_REG" */, // uuid src + SOURCE_BGP_ASPATH_COMMUNITY = 25 /* "BGP_ASPATH_COMMUNITY" */, // bgp src // NOTA BENE: clash with https://st.yandex-team.ru/IPREG-3722#5b367ec214778c001a5a3f7c + SOURCE_ML_INT_26 = 26 /* "ML_INT_26" */, + SOURCE_ML_INT_27 = 27 /* "ML_INT_27" */, + SOURCE_ML_INT_28 = 28 /* "ML_INT_28" */, + SOURCE_ML_INT_29 = 29 /* "ML_INT_29" */, + SOURCE_ML_INT_30 = 30 /* "ML_INT_30" */, + SOURCE_ML_INT_31 = 31 /* "ML_INT_31" */, + SOURCE_ML_INT_32 = 32 /* "ML_INT_32" */, + SOURCE_ML_INT_33 = 33 /* "ML_INT_33" */, + SOURCE_ML_INT_34 = 34 /* "ML_INT_34" */, + SOURCE_PRECISE_GEO_ML = 35 /* "ML_INT_35" */, + SOURCE_ML = 36 /* "ML" */, // ml src +}; + +double GetSourceCoefficient(ui32 type); +bool SourceWantApplyDepthCoeff(ui32 source_type); +bool SourceWantApplyNetsizeCoeff(ui32 source_type); +bool SourceIsHuman(ui32 source_type); +bool SourceExcludeFromReliability(ui32 source_type); +bool SourceIsForRegionNormalize(ui32 source_type); +bool SourceIsForEnoughHumanData(ui32 source_type); +bool SourceIsForFewHumanData(ui32 source_type); +bool SourceIsForReliability(ui32 source_type); + +void SetCoefficient(ui32 type, ui32 value); +} // namespace NIPREG diff --git a/library/cpp/ipreg/split.cpp b/library/cpp/ipreg/split.cpp new file mode 100644 index 0000000000..19b7b85d51 --- /dev/null +++ b/library/cpp/ipreg/split.cpp @@ -0,0 +1,54 @@ +#include "split.h" + +#include <util/generic/list.h> +#include <util/generic/vector.h> + +namespace NIPREG { + +void SplitIPREG(TReader &reader, std::function<void(const TAddress& first, const TAddress& last, const TVector<TString>& data)>&& proc) { + TList<TGenericEntry> prevEntries; + + bool end; + do { + end = !reader.Next(); + + while (!prevEntries.empty() && (end || prevEntries.front().First < reader.Get().First)) { + // find smallest common range to process + TAddress first = prevEntries.front().First; + TAddress last = end ? 
TAddress::Highest() : reader.Get().First.Prev(); + + for (const auto& entry: prevEntries) + last = Min(last, entry.Last); + + // extract data for the range + TVector<TString> strings; + auto item = prevEntries.begin(); + while (item != prevEntries.end()) { + Y_ASSERT(item->First == first); + strings.push_back(item->Data); + + if (item->Last == last) { + // item completely processed, remove + auto victim = item; + item++; + prevEntries.erase(victim); + } else { + // item still have part of range left, update it + item->First = last.Next(); + item++; + } + } + + proc(first, last, strings); + } + + if (!end) { + if (!prevEntries.empty()) { + Y_ASSERT(prevEntries.front().First == reader.Get().First); + } + prevEntries.push_back(reader.Get()); + } + } while (!end); +} + +} diff --git a/library/cpp/ipreg/split.h b/library/cpp/ipreg/split.h new file mode 100644 index 0000000000..9710ff5f6d --- /dev/null +++ b/library/cpp/ipreg/split.h @@ -0,0 +1,13 @@ +#pragma once + +#include "reader.h" + +#include <util/generic/vector.h> + +#include <functional> + +namespace NIPREG { + +void SplitIPREG(TReader &reader, std::function<void(const TAddress& first, const TAddress& last, const TVector<TString>& data)>&& proc); + +} diff --git a/library/cpp/ipreg/stopwatch.cpp b/library/cpp/ipreg/stopwatch.cpp new file mode 100644 index 0000000000..31d99d2758 --- /dev/null +++ b/library/cpp/ipreg/stopwatch.cpp @@ -0,0 +1,53 @@ +#include "stopwatch.h" + +#include <util/stream/str.h> + +namespace NIPREG { + +TStopWatch::TStopWatch() { + Start = TInstant::Now(); +} + +TStopWatch::~TStopWatch() { + try { + if (TaskRunning) + StopTask(); + + Cerr << "Everything done in " << FormatTime(TInstant::Now() - Start) << Endl; + } catch (...) { + // not much problem if we can't write the summary + } +} + +void TStopWatch::StartTask(const TString& message) { + StopTask(); + + ++TaskOrdNum; + TaskStart = TInstant::Now(); + TaskRunning = true; + Cerr << TaskOrdNum << ". 
" << message << "...\n"; +} + +void TStopWatch::StopTask() { + if (TaskRunning) { + Cerr << "Done in " << FormatTime(TInstant::Now() - TaskStart) << Endl; + TaskRunning = false; + } +} + +TString TStopWatch::FormatTime(const TDuration& dur) { + auto sec = dur.Seconds(); + + TStringStream ss; + + if (sec < 60) + ss << sec << "s"; + else if (sec < 3600) + ss << sec / 60 << "m " << sec % 60 << "s"; + else + ss << sec / 3600 << "h " << (sec / 60) % 60 << "m"; + + return ss.Str(); +} + +} diff --git a/library/cpp/ipreg/stopwatch.h b/library/cpp/ipreg/stopwatch.h new file mode 100644 index 0000000000..0873a638f6 --- /dev/null +++ b/library/cpp/ipreg/stopwatch.h @@ -0,0 +1,25 @@ +#pragma once + +#include <util/datetime/base.h> + +namespace NIPREG { + +class TStopWatch { +private: + TInstant Start; + TInstant TaskStart; + bool TaskRunning = false; + ui32 TaskOrdNum = 0; + +private: + TString FormatTime(const TDuration& dur); + +public: + TStopWatch(); + ~TStopWatch(); + + void StartTask(const TString& message); + void StopTask(); +}; + +} diff --git a/library/cpp/ipreg/util_helpers.cpp b/library/cpp/ipreg/util_helpers.cpp new file mode 100644 index 0000000000..1b64baef55 --- /dev/null +++ b/library/cpp/ipreg/util_helpers.cpp @@ -0,0 +1,705 @@ +#include "util_helpers.h" + +#include <library/cpp/ipreg/reader.h> + +#include <library/cpp/json/json_reader.h> +#include <library/cpp/json/json_value.h> +#include <library/cpp/json/json_writer.h> + +#include <library/cpp/geobase/lookup.hpp> + +#include <util/generic/ptr.h> +#include <util/generic/vector.h> +#include <util/stream/file.h> +#include <util/stream/format.h> +#include <util/string/split.h> +#include <util/string/vector.h> +#include <util/stream/str.h> + +namespace NIPREG { + namespace { + double FindNearestCoarsedCoeff(double baseValue) { + using ValueStepPair = std::pair<double, double>; + static const double fix = 0.01; + static const TVector<ValueStepPair> limits = { + { 100., 20. + fix }, + { 500., 50. + fix }, + { 2500., 100. + fix }, + { 10000., 1000. + fix }, + { 50000., 10000. + fix } + }; + + double last_step{}; + for (const auto& pair : limits) { + last_step = pair.second; + if (baseValue <= pair.first) { + break; + } + } + return last_step; + } + + double CalcCoarsedValue(double baseValue) { + if (baseValue < 0.) 
{ + ythrow yexception() << "negative value detected: " << baseValue; + } + + // TODO(dieash) some "strange" calculation below + const auto coarsedCoeff = FindNearestCoarsedCoeff(baseValue); + const double fixedValue = coarsedCoeff * static_cast<int>((baseValue + coarsedCoeff / 2) / coarsedCoeff); + return fixedValue; + } + + const char * const REL_FIELD = "reliability"; + const char * const REG_FIELD = "region_id"; + + void CorrectReliability(NJson::TJsonValue& jsonData, const TString& data) { + jsonData = ParseJsonString(data); + auto& jsonMap = jsonData.GetMapSafe(); + + auto& reliabilityField = jsonMap[REL_FIELD]; + reliabilityField = CalcCoarsedValue(reliabilityField.GetDouble()); + } + + TString SortJson(const TString& data) { + NJson::TJsonValue json = ParseJsonString(data); + return SortJsonData(json); + } + + static TString MergeJsonsData(const TString& data1, const TString& data2, bool sortKeys = false, bool countMerge = false) { + static const char* MERGE_QTY = "_mrg_qty_"; + + auto json1 = ParseJsonString(data1); + const auto& json2 = ParseJsonString(data2); + + if (countMerge && !json1.Has(MERGE_QTY)) { + json1.InsertValue(MERGE_QTY, 1); + } + + for (const auto& item : json2.GetMapSafe()) { + json1.InsertValue(item.first, item.second); + } + + if (countMerge) { + json1.InsertValue(MERGE_QTY, (json1[MERGE_QTY].GetInteger() + 1)); + } + + const auto NoFormat = false; + return NJson::WriteJson(json1, NoFormat, sortKeys); + } + + bool IsJsonEquals(const TVector<TString>& excludeFieldsList, const TString& data1, const TString& data2) { + if (excludeFieldsList.empty()) { + return data1 == data2; + } + + auto json1 = ParseJsonString(data1); + auto json2 = ParseJsonString(data2); + + for (const auto& excludeField : excludeFieldsList) { + json1.EraseValue(excludeField); + json2.EraseValue(excludeField); + } + + return json1 == json2; + } + + class Patcher { + public: + Patcher(TReader& base, TReader& patch, IOutputStream& output, bool sortData) + : BaseStream(base) + , PatchStream(patch) + , Output(output) + , SortData(sortData) + { + GetNext(BaseStream, BaseRangePtr); + GetNext(PatchStream, PatchRangePtr); + } + + void Process() { + while (BaseRangePtr || PatchRangePtr) { + if ( CheckPatch() + || OnlySecond(BaseRangePtr, PatchRangePtr, PatchStream) + || OnlySecond(PatchRangePtr, BaseRangePtr, BaseStream) + || Range1BeforeRange2(BaseRangePtr, PatchRangePtr, BaseStream) + || Range1BeforeRange2(PatchRangePtr, BaseRangePtr, PatchStream) + || FirstEndInSecond(BaseRangePtr, PatchRangePtr) + || FirstEndInSecond(PatchRangePtr, BaseRangePtr) + || FirstStartInSecond(BaseRangePtr, PatchRangePtr, BaseStream, PatchStream)) + { + continue; + } + } + } + + private: + void GetNext(TReader& stream, TAutoPtr<TRange>& rangePtr) { + if (stream.Next()) { + if (rangePtr) { + *rangePtr = stream.Get(); + } else { + rangePtr.Reset(new TRange(stream.Get())); + } + } + else { + rangePtr.Reset(); + } + } + + void Print(const TRange& range) const { + Output << range; + } + + void PrintSorted(const TRange& range) const { + const TRange sortedCopy{range.First, range.Last, SortJson(range.Data)}; + Output << sortedCopy; + } + + bool CheckPatch() { + if (PatchRangePtr && PatchRangePtr->First > PatchRangePtr->Last) { + GetNext(PatchStream, PatchRangePtr); + return true; + } + return false; + } + + bool OnlySecond(TAutoPtr<TRange>& first, TAutoPtr<TRange>& second, TReader& stream) { + if (!first && second) { + Print(*second); + GetNext(stream, second); + return true; + } + return false; + } + + bool 
Range1BeforeRange2(TAutoPtr<TRange>& first, TAutoPtr<TRange>& second, TReader& stream) { + if (first->Last < second->First) { + Print(*first); + GetNext(stream, first); + return true; + } + return false; + } + + bool FirstEndInSecond(TAutoPtr<TRange>& first, TAutoPtr<TRange>& second) { + if (first->First < second->First) { + auto leftBaseRange = *first; + leftBaseRange.Last = second->First.Prev(); + Print(leftBaseRange); + + first->First = second->First; + return true; + } + return false; + } + + bool FirstStartInSecond(TAutoPtr<TRange>& first, TAutoPtr<TRange>& second, TReader& stream1, TReader& stream2) { + if (first->First >= second->First) { + auto leftBaseRange = *first; + leftBaseRange.Data = MergeJsonsData(first->Data, second->Data); + + if (first->Last <= second->Last) { + second->First = first->Last.Next(); + GetNext(stream1, first); + if (second->First == TAddress::Highest()) { + GetNext(stream2, second); + } + } else { + leftBaseRange.Last = second->Last; + first->First = second->Last.Next(); + GetNext(stream2, second); + } + + SortData ? PrintSorted(leftBaseRange) : Print(leftBaseRange); + return true; + } + return false; + } + + private: + TAutoPtr<TRange> BaseRangePtr; + TAutoPtr<TRange> PatchRangePtr; + + TReader& BaseStream; + TReader& PatchStream; + IOutputStream& Output; + const bool SortData = false; + }; + + struct IpChecker { + static void LessOrEqual(const size_t row, const TAddress& lastIp, const TAddress& checkedIp) { + if (lastIp <= checkedIp) { + return; + } + GenErr(row, " <= ", lastIp, checkedIp); + } + + static void Less(const size_t row, const TAddress& lastIp, const TAddress& checkedIp) { + if (lastIp < checkedIp) { + return; + } + GenErr(row, " < ", lastIp, checkedIp); + } + + static void GenErr(const size_t row, const char* msg, const TAddress& lastIp, const TAddress& checkedIp) { + const TString& errMsg = ">>> row#" + ToString(row) + "; " + lastIp.AsIPv6() + msg + checkedIp.AsIPv6(); + throw std::runtime_error(errMsg.data()); + } + }; + + class MergerBy3 { + public: + MergerBy3(const TString& geodataPath, IOutputStream& output) + : Geobase(geodataPath) + , Out(output) + {} + + void Process(TReader& input, bool ByRegsOnly, bool silentMode) { + while (input.Next()) { + Trio.push_back(input.Get()); + if (3 > Trio.size()) { + continue; + } + + auto& range2Data = (++Trio.begin())->Data; + if (range2Data.npos != range2Data.find("\"is_placeholder\":1")) { + PrintAndDrop1stRange(); + PrintAndDrop1stRange(); + continue; + } + + const auto range1RegId = GetRegionId(Trio.begin()->Data); + const auto range3RegId = GetRegionId(Trio.rbegin()->Data); + if (range1RegId != range3RegId) { + PrintAndDrop1stRange(); + continue; + } + + const auto range2RegId = GetRegionId(range2Data); + const auto& parentsIds = Geobase.GetParentsIds(range1RegId); + if (parentsIds.end() == std::find(parentsIds.begin() + 1, parentsIds.end(), range2RegId)) { + PrintAndDrop1stRange(); + continue; + } + + if (!ByRegsOnly) { + const auto range1Size = Trio.begin()->GetAddrsQty(); + const auto range2Size = (++Trio.begin())->GetAddrsQty(); + const auto range3Size = Trio.rbegin()->GetAddrsQty(); + + if (range2Size > (range1Size + range3Size)) { + PrintAndDrop1stRange(); + continue; + } + } + + range2Data = SubstRegionId(range2Data, range1RegId); + if (!silentMode) { + PrintSubstNote(range2RegId, range1RegId); + } + + PrintAndDrop1stRange(); // 1st + PrintAndDrop1stRange(); // 2nd + } + + while (Trio.end() != Trio.begin()) { + PrintAndDrop1stRange(); + } + } + private: + void PrintAndDrop1stRange() { + 
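+ // Write the oldest of the three buffered ranges to Out, then drop it so the sliding window advances by one.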
Out << *Trio.begin(); + Trio.erase(Trio.begin()); + } + + void PrintSubstNote(const int oldId, const int newId) { + const bool NoData = false; + Cerr << "s/" << oldId << "/" << newId << "/: ["; + + Trio.begin()->DumpTo(Cerr, NoData); + Cerr << "/" << Trio.begin()->GetAddrsQty() << " | "; + + const auto& range2nd = *(++Trio.begin()); + range2nd.DumpTo(Cerr, NoData); + Cerr << "/" << range2nd.GetAddrsQty() << " | "; + + Trio.rbegin()->DumpTo(Cerr, NoData); + Cerr << "/" << Trio.rbegin()->GetAddrsQty() << "]\n"; + } + + + static int GetRegionId(const TString& data) { + const auto& json = ParseJsonString(data); + auto reg_id = json["region_id"].GetIntegerSafe(0); + return 99999 == reg_id ? 10000 : reg_id; + } + + static TString SubstRegionId(const TString& data, const int newId) { + auto json = ParseJsonString(data); + json.InsertValue("region_id", newId); + return SortJsonData(json); + } + + const NGeobase::TLookup Geobase; + IOutputStream& Out; + TList<TRange> Trio; + }; + } // anon-ns + + void DoCoarsening(IInputStream& input, IOutputStream& output) { + TString line; + while (input.ReadLine(line)) { + TVector<TString> parts; + StringSplitter(line).Split('\t').AddTo(&parts); + + NJson::TJsonValue jsonData; + CorrectReliability(jsonData, parts[1]); + output << parts[0] << "\t" << "{\"" + << REG_FIELD << "\":" << jsonData[REG_FIELD] << ",\"" + << REL_FIELD << "\":" << Prec(jsonData[REL_FIELD].GetDouble(), PREC_POINT_DIGITS_STRIP_ZEROES, 2) + << "}\n"; + } + } + + void DoMergeEqualsRange(TReader& input, IOutputStream& output) { + // TODO(dieash@) may be check region for parent/child relation + // , const TString& geodataPath + // NGeobase::TLookup geoLookup(geodataPath); + + TVector<TString> rangeDataList; + TRange lastRange{}; + + const char* REG_ID_ATTR = "region_id"; + const char* ORG_NET_ATTR = "orig_net_size"; + const char* HUGE_SIZE_VALUE = "huge"; + + const int HUGE_SIZE_COEFF = 100; + + const auto CalcRegionBinding = [&]() { + if (rangeDataList.empty()) { + throw std::runtime_error("empty data list"); + } + + if (1 == rangeDataList.size()) { + return rangeDataList[0]; + } + + size_t maxAmount{}; + NJson::TJsonValue maxData; + + THashMap<NGeobase::TId, size_t> reg2amount; + for (const auto& data : rangeDataList) { + const auto& json = ParseJsonString(data); + + const auto id = json[REG_ID_ATTR].GetInteger(); + const auto amount = (json.Has(ORG_NET_ATTR) && HUGE_SIZE_VALUE == json[ORG_NET_ATTR].GetString()) ? 
HUGE_SIZE_COEFF : FromString<int>(json[ORG_NET_ATTR].GetString()); + reg2amount[id] += amount; + + if (reg2amount[id] > maxAmount) { + maxData = json; + } + } + + maxData.EraseValue(ORG_NET_ATTR); + return SortJsonData(maxData); + }; + + const auto PrintRow = [&]() { + if (rangeDataList.empty()) { + return; + } + lastRange.Data = CalcRegionBinding(); + output << lastRange; + }; + + while (input.Next()) { + auto currRange = input.Get(); + if (currRange != lastRange) { + PrintRow(); + + lastRange = currRange; + rangeDataList = {}; + } + + rangeDataList.push_back(currRange.Data); + } + PrintRow(); + } + + void DoMerging(TReader& input, IOutputStream& output, const MergeTraits& traits) { + if (!input.Next()) { + return; // empty file here + } + + const bool IsJsonData = traits.ConcatSep.empty(); + + TRange joinedRange = input.Get(); + if (traits.SortData) { + joinedRange.Data = SortJson(joinedRange.Data); + } + + while (input.Next()) { + auto currRange = input.Get(); + if (traits.SortData) { + currRange.Data = SortJson(currRange.Data); + } + + if (currRange.Contains(joinedRange) && joinedRange.Data == currRange.Data) { + joinedRange = currRange; + continue; + } + + if (traits.JoinNestedRanges && joinedRange.Contains(currRange) && joinedRange.Data == currRange.Data) { + continue; + } + + if ( currRange.First != joinedRange.Last.Next() + || ( IsJsonData && !IsJsonEquals(traits.ExcludeFieldsList, currRange.Data, joinedRange.Data)) + || (!IsJsonData && currRange.Data != joinedRange.Data)) + { + output << joinedRange; + joinedRange = currRange; + } else { + if (IsJsonData) { + joinedRange.Data = MergeJsonsData(currRange.Data, joinedRange.Data, traits.SortData, traits.CountMerges); + } else { + joinedRange.Data = (joinedRange.Data == currRange.Data) ? joinedRange.Data : (joinedRange.Data + traits.ConcatSep + currRange.Data); + } + joinedRange.Last = currRange.Last; + } + } + + output << joinedRange; + } + + void DoMerging3(TReader& input, IOutputStream& output, const TString& geodata, bool ByRegsOnly, bool silentMode) { + MergerBy3 merger(geodata, output); + merger.Process(input, ByRegsOnly, silentMode); + } + + void DoPatching(TReader& base, TReader& patch, IOutputStream& output, bool sortData) { + Patcher(base, patch, output, sortData).Process(); + } + + const TString STUB_DATA{"{\"is_placeholder\":1,\"region_id\":10000,\"reliability\":0}"}; + + void AddStubRanges(TReader& input, IOutputStream& output) { + TRange stub{ + TAddress::Lowest(), + TAddress::Lowest(), + STUB_DATA + }; + + while (input.Next()) { + const auto& currRange = input.Get(); + + if (stub.First > currRange.First) { + const TString& errMsg = ">>> bad ranges ($stub.begin > $next.begin) // " + stub.First.AsShortIPv6() + " | " + currRange.First.AsShortIPv6(); + throw std::runtime_error(errMsg.data()); + } + + if (stub.First < currRange.First) { + stub.Last = currRange.First.Prev(); + output << stub; + } + + output << currRange; + stub.First = currRange.Last.Next(); + } + + if (stub.First != TAddress::Highest()) { + stub.Last = TAddress::Highest(); + output << stub; + } + } + + void CheckAddressSpaceForCompleteness(IInputStream& input, IOutputStream& output) { + TAddress lastIp = TAddress::Lowest(); + size_t row_number = 0; + + TString line; + while (input.ReadLine(line)) { + ++row_number; + output << line << "\n"; + + const auto& currRange = TRange::BuildRange(line); + if (row_number == 1) { + if (currRange.First != TAddress::Lowest()) { + const TString err_msg = "bad first addr (ip / wanted_ip) => " + currRange.First.AsIPv6() + " / 
" + TAddress::Lowest().AsIPv6(); + throw std::runtime_error(err_msg); + } + lastIp = currRange.Last; + continue; + } + + if (lastIp == currRange.First || lastIp.Next() != currRange.First) { + const TString err_msg = ">>> row#" + ToString(row_number) + " bad pair (last_ip / next_ip) => " + lastIp.AsIPv6() + " / " + currRange.First.AsIPv6(); + throw std::runtime_error(err_msg); + } + + lastIp = currRange.Last; + } + + if (lastIp != TAddress::Highest()) { + const TString err_msg = "bad last addr (last_ip / wanted_ip) => " + lastIp.AsIPv6() + " / " + TAddress::Highest().AsIPv6(); + throw std::runtime_error(err_msg); + } + } + + void CheckRangesForMonotonicSequence(IInputStream& input, IOutputStream& output, bool IsStrict) { + TAddress lastIp = TAddress::Lowest(); + + size_t row = 0; + TString line; + while (input.ReadLine(line)) { + ++row; + output << line << "\n"; + + const auto& currRange = TRange::BuildRange(line); + if (row == 1) { + lastIp = currRange.Last; + continue; + } + + if (IsStrict) { + IpChecker::Less(row, lastIp, currRange.First); + } else { + IpChecker::LessOrEqual(row, lastIp, currRange.First); + } + lastIp = currRange.Last; + } + } + + NJson::TJsonValue ParseJsonString(const TString& data) { + const auto throwIfError = true; + + NJson::TJsonValue json; + NJson::ReadJsonFastTree(data, &json, throwIfError); + return json; + } + + TString SortJsonData(const NJson::TJsonValue& json) { + const auto NoFormat = false; + const auto SortKeys = true; + + return NJson::WriteJson(json, NoFormat, SortKeys); + } + + TString SortJsonData(const TString& jsonStr) { + return SortJsonData(ParseJsonString(jsonStr)); + } + + TString AddJsonAttrs(const TVector<TString>& addFieldsList, const TString& jsonStr, const TMaybe<TString>& attrValue) { + if (addFieldsList.empty()) { + return jsonStr; + } + + auto json = ParseJsonString(jsonStr); + for (const auto& newField : addFieldsList) { + if (!newField.empty()) { + if (attrValue) { + json.InsertValue(newField, *attrValue); + } else { + json.InsertValue(newField, 1); + } + } + } + return json.GetStringRobust(); + } + + TString ExcludeJsonAttrs(const TVector<TString>& excludeFieldsList, const TString& jsonStr) { + if (excludeFieldsList.empty()) { + return jsonStr; + } + + auto json = ParseJsonString(jsonStr); + for (const auto& excludeField : excludeFieldsList) { + if (!excludeField.empty()) { + json.EraseValue(excludeField); + } + } + return json.GetStringRobust(); + } + + TString ExtractJsonAttrs(const TVector<TString>& extractFieldsList, const TString& jsonStr) { + if (extractFieldsList.empty()) { + return jsonStr; + } + + auto json = ParseJsonString(jsonStr); + NJson::TJsonValue newJson; + for (const auto& field : extractFieldsList) { + if (json.Has(field)) { + newJson.InsertValue(field, json[field]); + } + } + if (!newJson.IsDefined()) { + return {}; + } + return newJson.GetStringRobust(); + } + + namespace CliParamsDesc { + const TString InputFnameParam = "input-data"; + const TString OutputFnameParam = "output-data"; + const TString OutputFullIpParam = "show-full-ip"; + const TString PrintStatsParam = "print-stats"; + const TString PrintYtStatsParam = "yt-stats"; + + const TString InputFnameParamDesc = "path to input IPREG-data; leave empty or use '-' for stdin"; + const TString OutputFnameParamDesc = "path to file for output results; leave empty for stdout"; + const TString OutputFullIpParamDesc = "print full ipv6 (by default - short)"; + const TString PrintStatsParamDesc = "print internal statistics; @stderr"; + const TString 
PrintYtStatsParamDesc = "print YT-stats (by default, file-descriptor 5)"; + } // ns CliParamsDesc + + DefaultCliParams::DefaultCliParams() { + using namespace CliParamsDesc; + + Opts.SetFreeArgsMax(0); + Opts.AddHelpOption('h'); + + Opts.AddLongOption('i', InputFnameParam) + .RequiredArgument("filename") + .DefaultValue(InputFname) + .StoreResult(&InputFname).Help(InputFnameParamDesc); + + Opts.AddLongOption('o', OutputFnameParam) + .RequiredArgument("filename") + .DefaultValue(OutputFname) + .StoreResult(&OutputFname).Help(OutputFnameParamDesc); + + Opts.AddLongOption('f', OutputFullIpParam) + .Optional() + .NoArgument() + .DefaultValue("0") + .OptionalValue("1") + .StoreResult(&OutputFullIp).Help(OutputFullIpParamDesc); + + Opts.AddLongOption(PrintStatsParam) + .Optional() + .NoArgument() + .DefaultValue("0") + .OptionalValue("1") + .StoreResult(&PrintStats).Help(PrintStatsParamDesc); + + Opts.AddLongOption(PrintYtStatsParam) + .Optional() + .NoArgument() + .DefaultValue("0") + .OptionalValue("1") + .StoreResult(&PrintYtStats).Help(PrintYtStatsParamDesc); + } + + void DefaultCliParams::ApplyFlags() const { + if (OutputFullIp) { + SetIpFullOutFormat(); + } + } + + void DefaultCliParams::Parse(int argc, const char **argv) { + NLastGetopt::TOptsParseResult optRes(&GetOpts(), argc, argv); + ApplyFlags(); + } + +} // NIPREG diff --git a/library/cpp/ipreg/util_helpers.h b/library/cpp/ipreg/util_helpers.h new file mode 100644 index 0000000000..eab2dfb320 --- /dev/null +++ b/library/cpp/ipreg/util_helpers.h @@ -0,0 +1,65 @@ +#pragma once + +#include <library/cpp/getopt/opt.h> +#include <util/generic/string.h> +#include <util/generic/maybe.h> + +class IInputStream; +class IOutputStream; + +namespace NJson { + class TJsonValue; +} + +namespace NIPREG { + class TReader; + + // @input any form of range+payload + // @output $ip.begin-$ip.end \t {"region_id":$reg,"reliability":$rel} + void DoCoarsening(IInputStream& input, IOutputStream& output); + + struct MergeTraits { + const TVector<TString> ExcludeFieldsList; + TString ConcatSep; + bool SortData{}; + bool CountMerges{}; + bool JoinNestedRanges{}; + }; + + void DoMerging(TReader& input, IOutputStream& output, const MergeTraits& traits); + void DoMerging3(TReader& input, IOutputStream& output, const TString& geodata, bool ByRegsOnly = false, bool silentMode = false); + void DoMergeEqualsRange(TReader& input, IOutputStream& output); + + void DoPatching(TReader& base, TReader& patch, IOutputStream& output, bool sortData = false); + + void AddStubRanges(TReader& input, IOutputStream& output); + + void CheckAddressSpaceForCompleteness(IInputStream& input, IOutputStream& output); + void CheckRangesForMonotonicSequence(IInputStream& input, IOutputStream& output, bool IsStrict = false); + + NJson::TJsonValue ParseJsonString(const TString& data); + TString SortJsonData(const NJson::TJsonValue& json); + TString SortJsonData(const TString& json); + + TString AddJsonAttrs(const TVector<TString>& addFieldsList, const TString& jsonStr, const TMaybe<TString>& attrValue); + TString ExcludeJsonAttrs(const TVector<TString>& excludeFieldsList, const TString& jsonStr); + TString ExtractJsonAttrs(const TVector<TString>& excludeFieldsList, const TString& jsonStr); + + extern const TString STUB_DATA; + + struct DefaultCliParams { + DefaultCliParams(); + + NLastGetopt::TOpts& GetOpts() { return Opts; } + void Parse(int argc, const char **argv); + void ApplyFlags() const; + + TString InputFname = "-"; + TString OutputFname = ""; + bool OutputFullIp = false; + bool 
PrintStats = false; + bool PrintYtStats = false; + + NLastGetopt::TOpts Opts; + }; +} // NIPREG diff --git a/library/cpp/ipreg/writer.cpp b/library/cpp/ipreg/writer.cpp new file mode 100644 index 0000000000..89f8c8b629 --- /dev/null +++ b/library/cpp/ipreg/writer.cpp @@ -0,0 +1,91 @@ +#include "writer.h" + +#include <util/stream/file.h> + +namespace NIPREG { + +TWriter::TWriter(const TString& fname) + : OwnedStreamPtr(fname.empty() ? nullptr : new TFileOutput(fname)) + , Stream(OwnedStreamPtr ? *OwnedStreamPtr.Get() : Cout) + , AddrSeparator(ADDR_SEP) + , DataSeparator(DATA_SEP) + , SplitMixed(false) +{ +} + +TWriter::TWriter(IOutputStream& stream, EAddressFormat addressFormat, const TString& addrSep, const TString& dataSep, const bool splitMixed) + : Stream(stream) + , AddressFormat(addressFormat) + , AddrSeparator(addrSep) + , DataSeparator(dataSep) + , SplitMixed(splitMixed) +{ +} + +namespace { + const TAddress IPv4Start = TAddress::ParseIPv4("0.0.0.0"); + const TAddress IPv4End = TAddress::ParseIPv4("255.255.255.255"); + + const TAddress IPv6BeforeV4 = IPv4Start.Prev(); + const TAddress IPv6AfterV4 = IPv4End.Next(); +} + +void TWriter::Write(const TAddress& first, const TAddress& last, const TString& data, bool printRange) { + if (SplitMixed) { + if (first < IPv4Start && IPv4Start < last) { + Write(first, IPv6BeforeV4, data, printRange); + Write(IPv4Start, last, data, printRange); + return; + } + + if (first < IPv4End && IPv4End < last) { + Write(first, IPv4End, data, printRange); + Write(IPv6AfterV4, last, data, printRange); + return; + } + } + WriteImpl(first, last, data, printRange); +} + +void TWriter::WriteImpl(const TAddress& first, const TAddress& last, const TString& data, bool printRange) { + if (printRange) { + Stream << first.Format(AddressFormat) << AddrSeparator << last.Format(AddressFormat); + } + if (!data.empty()) { + if (printRange) { + Stream << DataSeparator; + } + Stream << data; + } + if (!data.empty() || printRange) { + Stream << "\n"; + } +} + +void TWriter::Finalize() { +} + +TMergingWriter::TMergingWriter(IOutputStream& stream, EAddressFormat addressFormat, const TString& addrSep, const TString& dataSep, const bool splitMixed) + : TWriter(stream, addressFormat, addrSep, dataSep, splitMixed) { +} + +void TMergingWriter::Write(const TAddress& first, const TAddress& last, const TString& data, bool) { + if (Initialized && data == StoredData && first == StoredLast.Next()) { + StoredLast = last; + } else { + if (Initialized) + TWriter::Write(StoredFirst, StoredLast, StoredData); + StoredFirst = first; + StoredLast = last; + StoredData = data; + Initialized = true; + } +} + +void TMergingWriter::Finalize() { + if (Initialized) + TWriter::Write(StoredFirst, StoredLast, StoredData); + Initialized = false; +} + +} // NIPREG diff --git a/library/cpp/ipreg/writer.h b/library/cpp/ipreg/writer.h new file mode 100644 index 0000000000..a4232a89a6 --- /dev/null +++ b/library/cpp/ipreg/writer.h @@ -0,0 +1,62 @@ +#pragma once + +#include "range.h" + +#include <util/generic/ptr.h> +#include <util/generic/string.h> +#include <util/stream/output.h> + +namespace NIPREG { + +class TWriter { +public: + static constexpr char const * const ADDR_SEP = "-"; + static constexpr char const * const DATA_SEP = "\t"; + +public: + TWriter(const TString& filename = ""); + TWriter(IOutputStream& stream, EAddressFormat addressFormat = EAddressFormat::IPV6, const TString& addrSep = ADDR_SEP, const TString& dataSep = DATA_SEP, const bool splitMixed = false); + TWriter(IOutputStream& stream, const 
TString& addrSep, EAddressFormat addressFormat) + : TWriter(stream, addressFormat, addrSep, addrSep) + {} + virtual ~TWriter() {} + + void Write(const TGenericEntry& entry, bool printRange = true) { + Write(entry.First, entry.Last, entry.Data, printRange); + } + virtual void Write(const TAddress& first, const TAddress& last, const TString& data, bool printRange = true); + virtual void Finalize(); + + operator IOutputStream&() { + return Stream; + } + +private: + void WriteImpl(const TAddress& first, const TAddress& last, const TString& data, bool printRange); + + TAutoPtr<IOutputStream> OwnedStreamPtr; + IOutputStream& Stream; + + EAddressFormat AddressFormat = EAddressFormat::IPV6; + const TString AddrSeparator = ADDR_SEP; + const TString DataSeparator = DATA_SEP; + const bool SplitMixed; +}; + +class TMergingWriter : public TWriter { +public: + TMergingWriter(IOutputStream& stream, EAddressFormat addressFormat = EAddressFormat::IPV6, const TString& addrSep = ADDR_SEP, const TString& dataSep = DATA_SEP, const bool splitMixed = false); + TMergingWriter(IOutputStream& stream, const TString& addrSep, EAddressFormat addressFormat) + : TWriter(stream, addressFormat, addrSep, addrSep) + {} + void Write(const TAddress& first, const TAddress& last, const TString& data, bool printRange = true) final override; + void Finalize() final; + +private: + TAddress StoredFirst; + TAddress StoredLast; + TString StoredData; + bool Initialized = false; +}; + +} // NIPREG diff --git a/library/cpp/ipreg/ya.make b/library/cpp/ipreg/ya.make new file mode 100644 index 0000000000..b03720f761 --- /dev/null +++ b/library/cpp/ipreg/ya.make @@ -0,0 +1,26 @@ +LIBRARY() + +SRCS( + address.cpp + checker.cpp + merge.cpp + range.cpp + reader.cpp + sources.cpp + split.cpp + stopwatch.cpp + writer.cpp + util_helpers.cpp +) + +PEERDIR( + library/cpp/getopt/small + library/cpp/json + library/cpp/geobase + library/cpp/int128 +) + +GENERATE_ENUM_SERIALIZATION(address.h) +GENERATE_ENUM_SERIALIZATION(sources.h) + +END() diff --git a/library/cpp/langmask/CMakeLists.txt b/library/cpp/langmask/CMakeLists.txt new file mode 100644 index 0000000000..499930c4b0 --- /dev/null +++ b/library/cpp/langmask/CMakeLists.txt @@ -0,0 +1,9 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +add_subdirectory(proto) diff --git a/library/cpp/langmask/proto/CMakeLists.darwin-x86_64.txt b/library/cpp/langmask/proto/CMakeLists.darwin-x86_64.txt new file mode 100644 index 0000000000..e9f692d0f2 --- /dev/null +++ b/library/cpp/langmask/proto/CMakeLists.darwin-x86_64.txt @@ -0,0 +1,43 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. 
+ + +get_built_tool_path( + TOOL_protoc_bin + TOOL_protoc_dependency + contrib/tools/protoc/bin + protoc +) +get_built_tool_path( + TOOL_cpp_styleguide_bin + TOOL_cpp_styleguide_dependency + contrib/tools/protoc/plugins/cpp_styleguide + cpp_styleguide +) + +add_library(cpp-langmask-proto) +target_link_libraries(cpp-langmask-proto PUBLIC + contrib-libs-cxxsupp + yutil + contrib-libs-protobuf +) +target_proto_messages(cpp-langmask-proto PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/langmask/proto/langmask.proto +) +target_proto_addincls(cpp-langmask-proto + ./ + ${CMAKE_SOURCE_DIR}/ + ${CMAKE_BINARY_DIR} + ${CMAKE_SOURCE_DIR} + ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src + ${CMAKE_BINARY_DIR} + ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src +) +target_proto_outs(cpp-langmask-proto + --cpp_out=${CMAKE_BINARY_DIR}/ + --cpp_styleguide_out=${CMAKE_BINARY_DIR}/ +) diff --git a/library/cpp/langmask/proto/CMakeLists.linux-aarch64.txt b/library/cpp/langmask/proto/CMakeLists.linux-aarch64.txt new file mode 100644 index 0000000000..61f975983e --- /dev/null +++ b/library/cpp/langmask/proto/CMakeLists.linux-aarch64.txt @@ -0,0 +1,44 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +get_built_tool_path( + TOOL_protoc_bin + TOOL_protoc_dependency + contrib/tools/protoc/bin + protoc +) +get_built_tool_path( + TOOL_cpp_styleguide_bin + TOOL_cpp_styleguide_dependency + contrib/tools/protoc/plugins/cpp_styleguide + cpp_styleguide +) + +add_library(cpp-langmask-proto) +target_link_libraries(cpp-langmask-proto PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil + contrib-libs-protobuf +) +target_proto_messages(cpp-langmask-proto PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/langmask/proto/langmask.proto +) +target_proto_addincls(cpp-langmask-proto + ./ + ${CMAKE_SOURCE_DIR}/ + ${CMAKE_BINARY_DIR} + ${CMAKE_SOURCE_DIR} + ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src + ${CMAKE_BINARY_DIR} + ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src +) +target_proto_outs(cpp-langmask-proto + --cpp_out=${CMAKE_BINARY_DIR}/ + --cpp_styleguide_out=${CMAKE_BINARY_DIR}/ +) diff --git a/library/cpp/langmask/proto/CMakeLists.linux-x86_64.txt b/library/cpp/langmask/proto/CMakeLists.linux-x86_64.txt new file mode 100644 index 0000000000..61f975983e --- /dev/null +++ b/library/cpp/langmask/proto/CMakeLists.linux-x86_64.txt @@ -0,0 +1,44 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. 
+ + +get_built_tool_path( + TOOL_protoc_bin + TOOL_protoc_dependency + contrib/tools/protoc/bin + protoc +) +get_built_tool_path( + TOOL_cpp_styleguide_bin + TOOL_cpp_styleguide_dependency + contrib/tools/protoc/plugins/cpp_styleguide + cpp_styleguide +) + +add_library(cpp-langmask-proto) +target_link_libraries(cpp-langmask-proto PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil + contrib-libs-protobuf +) +target_proto_messages(cpp-langmask-proto PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/langmask/proto/langmask.proto +) +target_proto_addincls(cpp-langmask-proto + ./ + ${CMAKE_SOURCE_DIR}/ + ${CMAKE_BINARY_DIR} + ${CMAKE_SOURCE_DIR} + ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src + ${CMAKE_BINARY_DIR} + ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src +) +target_proto_outs(cpp-langmask-proto + --cpp_out=${CMAKE_BINARY_DIR}/ + --cpp_styleguide_out=${CMAKE_BINARY_DIR}/ +) diff --git a/library/cpp/langmask/proto/CMakeLists.txt b/library/cpp/langmask/proto/CMakeLists.txt new file mode 100644 index 0000000000..f8b31df0c1 --- /dev/null +++ b/library/cpp/langmask/proto/CMakeLists.txt @@ -0,0 +1,17 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-aarch64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + include(CMakeLists.darwin-x86_64.txt) +elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA) + include(CMakeLists.windows-x86_64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-x86_64.txt) +endif() diff --git a/library/cpp/langmask/proto/CMakeLists.windows-x86_64.txt b/library/cpp/langmask/proto/CMakeLists.windows-x86_64.txt new file mode 100644 index 0000000000..e9f692d0f2 --- /dev/null +++ b/library/cpp/langmask/proto/CMakeLists.windows-x86_64.txt @@ -0,0 +1,43 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. 
+ + +get_built_tool_path( + TOOL_protoc_bin + TOOL_protoc_dependency + contrib/tools/protoc/bin + protoc +) +get_built_tool_path( + TOOL_cpp_styleguide_bin + TOOL_cpp_styleguide_dependency + contrib/tools/protoc/plugins/cpp_styleguide + cpp_styleguide +) + +add_library(cpp-langmask-proto) +target_link_libraries(cpp-langmask-proto PUBLIC + contrib-libs-cxxsupp + yutil + contrib-libs-protobuf +) +target_proto_messages(cpp-langmask-proto PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/langmask/proto/langmask.proto +) +target_proto_addincls(cpp-langmask-proto + ./ + ${CMAKE_SOURCE_DIR}/ + ${CMAKE_BINARY_DIR} + ${CMAKE_SOURCE_DIR} + ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src + ${CMAKE_BINARY_DIR} + ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src +) +target_proto_outs(cpp-langmask-proto + --cpp_out=${CMAKE_BINARY_DIR}/ + --cpp_styleguide_out=${CMAKE_BINARY_DIR}/ +) diff --git a/library/cpp/langmask/proto/langmask.proto b/library/cpp/langmask/proto/langmask.proto new file mode 100644 index 0000000000..be23ecfbba --- /dev/null +++ b/library/cpp/langmask/proto/langmask.proto @@ -0,0 +1,6 @@ +package NProto; + +message TLangMask { + repeated uint32 Bits = 1; // binary + optional string Names = 2; // human readable +} diff --git a/library/cpp/langmask/proto/ya.make b/library/cpp/langmask/proto/ya.make new file mode 100644 index 0000000000..823a0ad261 --- /dev/null +++ b/library/cpp/langmask/proto/ya.make @@ -0,0 +1,11 @@ +PROTO_LIBRARY() + +SRCS( + langmask.proto +) + +IF (NOT PY_PROTOS_FOR) + EXCLUDE_TAGS(GO_PROTO) +ENDIF() + +END() diff --git a/library/cpp/microbdb/CMakeLists.darwin-x86_64.txt b/library/cpp/microbdb/CMakeLists.darwin-x86_64.txt new file mode 100644 index 0000000000..c4d2e9d3a4 --- /dev/null +++ b/library/cpp/microbdb/CMakeLists.darwin-x86_64.txt @@ -0,0 +1,56 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. 
+ + +find_package(ZLIB REQUIRED) +get_built_tool_path( + TOOL_protoc_bin + TOOL_protoc_dependency + contrib/tools/protoc/bin + protoc +) +get_built_tool_path( + TOOL_cpp_styleguide_bin + TOOL_cpp_styleguide_dependency + contrib/tools/protoc/plugins/cpp_styleguide + cpp_styleguide +) + +add_library(library-cpp-microbdb) +target_link_libraries(library-cpp-microbdb PUBLIC + contrib-libs-cxxsupp + yutil + contrib-libs-fastlz + contrib-libs-libc_compat + contrib-libs-protobuf + contrib-libs-snappy + ZLIB::ZLIB + cpp-deprecated-fgood + cpp-on_disk-st_hash + library-cpp-packedtypes +) +target_proto_messages(library-cpp-microbdb PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/microbdb/noextinfo.proto +) +target_sources(library-cpp-microbdb PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/microbdb/file.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/microbdb/header.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/microbdb/microbdb.cpp +) +target_proto_addincls(library-cpp-microbdb + ./ + ${CMAKE_SOURCE_DIR}/ + ${CMAKE_BINARY_DIR} + ${CMAKE_SOURCE_DIR} + ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src + ${CMAKE_BINARY_DIR} + ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src +) +target_proto_outs(library-cpp-microbdb + --cpp_out=${CMAKE_BINARY_DIR}/ + --cpp_styleguide_out=${CMAKE_BINARY_DIR}/ +) diff --git a/library/cpp/microbdb/CMakeLists.linux-aarch64.txt b/library/cpp/microbdb/CMakeLists.linux-aarch64.txt new file mode 100644 index 0000000000..302dbd03cd --- /dev/null +++ b/library/cpp/microbdb/CMakeLists.linux-aarch64.txt @@ -0,0 +1,57 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. 
+ + +find_package(ZLIB REQUIRED) +get_built_tool_path( + TOOL_protoc_bin + TOOL_protoc_dependency + contrib/tools/protoc/bin + protoc +) +get_built_tool_path( + TOOL_cpp_styleguide_bin + TOOL_cpp_styleguide_dependency + contrib/tools/protoc/plugins/cpp_styleguide + cpp_styleguide +) + +add_library(library-cpp-microbdb) +target_link_libraries(library-cpp-microbdb PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil + contrib-libs-fastlz + contrib-libs-libc_compat + contrib-libs-protobuf + contrib-libs-snappy + ZLIB::ZLIB + cpp-deprecated-fgood + cpp-on_disk-st_hash + library-cpp-packedtypes +) +target_proto_messages(library-cpp-microbdb PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/microbdb/noextinfo.proto +) +target_sources(library-cpp-microbdb PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/microbdb/file.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/microbdb/header.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/microbdb/microbdb.cpp +) +target_proto_addincls(library-cpp-microbdb + ./ + ${CMAKE_SOURCE_DIR}/ + ${CMAKE_BINARY_DIR} + ${CMAKE_SOURCE_DIR} + ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src + ${CMAKE_BINARY_DIR} + ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src +) +target_proto_outs(library-cpp-microbdb + --cpp_out=${CMAKE_BINARY_DIR}/ + --cpp_styleguide_out=${CMAKE_BINARY_DIR}/ +) diff --git a/library/cpp/microbdb/CMakeLists.linux-x86_64.txt b/library/cpp/microbdb/CMakeLists.linux-x86_64.txt new file mode 100644 index 0000000000..302dbd03cd --- /dev/null +++ b/library/cpp/microbdb/CMakeLists.linux-x86_64.txt @@ -0,0 +1,57 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +find_package(ZLIB REQUIRED) +get_built_tool_path( + TOOL_protoc_bin + TOOL_protoc_dependency + contrib/tools/protoc/bin + protoc +) +get_built_tool_path( + TOOL_cpp_styleguide_bin + TOOL_cpp_styleguide_dependency + contrib/tools/protoc/plugins/cpp_styleguide + cpp_styleguide +) + +add_library(library-cpp-microbdb) +target_link_libraries(library-cpp-microbdb PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil + contrib-libs-fastlz + contrib-libs-libc_compat + contrib-libs-protobuf + contrib-libs-snappy + ZLIB::ZLIB + cpp-deprecated-fgood + cpp-on_disk-st_hash + library-cpp-packedtypes +) +target_proto_messages(library-cpp-microbdb PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/microbdb/noextinfo.proto +) +target_sources(library-cpp-microbdb PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/microbdb/file.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/microbdb/header.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/microbdb/microbdb.cpp +) +target_proto_addincls(library-cpp-microbdb + ./ + ${CMAKE_SOURCE_DIR}/ + ${CMAKE_BINARY_DIR} + ${CMAKE_SOURCE_DIR} + ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src + ${CMAKE_BINARY_DIR} + ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src +) +target_proto_outs(library-cpp-microbdb + --cpp_out=${CMAKE_BINARY_DIR}/ + --cpp_styleguide_out=${CMAKE_BINARY_DIR}/ +) diff --git a/library/cpp/microbdb/CMakeLists.txt b/library/cpp/microbdb/CMakeLists.txt new file mode 100644 index 0000000000..f8b31df0c1 --- /dev/null +++ b/library/cpp/microbdb/CMakeLists.txt @@ -0,0 +1,17 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. 
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-aarch64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + include(CMakeLists.darwin-x86_64.txt) +elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA) + include(CMakeLists.windows-x86_64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-x86_64.txt) +endif() diff --git a/library/cpp/microbdb/CMakeLists.windows-x86_64.txt b/library/cpp/microbdb/CMakeLists.windows-x86_64.txt new file mode 100644 index 0000000000..c4d2e9d3a4 --- /dev/null +++ b/library/cpp/microbdb/CMakeLists.windows-x86_64.txt @@ -0,0 +1,56 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +find_package(ZLIB REQUIRED) +get_built_tool_path( + TOOL_protoc_bin + TOOL_protoc_dependency + contrib/tools/protoc/bin + protoc +) +get_built_tool_path( + TOOL_cpp_styleguide_bin + TOOL_cpp_styleguide_dependency + contrib/tools/protoc/plugins/cpp_styleguide + cpp_styleguide +) + +add_library(library-cpp-microbdb) +target_link_libraries(library-cpp-microbdb PUBLIC + contrib-libs-cxxsupp + yutil + contrib-libs-fastlz + contrib-libs-libc_compat + contrib-libs-protobuf + contrib-libs-snappy + ZLIB::ZLIB + cpp-deprecated-fgood + cpp-on_disk-st_hash + library-cpp-packedtypes +) +target_proto_messages(library-cpp-microbdb PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/microbdb/noextinfo.proto +) +target_sources(library-cpp-microbdb PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/microbdb/file.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/microbdb/header.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/microbdb/microbdb.cpp +) +target_proto_addincls(library-cpp-microbdb + ./ + ${CMAKE_SOURCE_DIR}/ + ${CMAKE_BINARY_DIR} + ${CMAKE_SOURCE_DIR} + ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src + ${CMAKE_BINARY_DIR} + ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src +) +target_proto_outs(library-cpp-microbdb + --cpp_out=${CMAKE_BINARY_DIR}/ + --cpp_styleguide_out=${CMAKE_BINARY_DIR}/ +) diff --git a/library/cpp/microbdb/align.h b/library/cpp/microbdb/align.h new file mode 100644 index 0000000000..2f8567f134 --- /dev/null +++ b/library/cpp/microbdb/align.h @@ -0,0 +1,17 @@ +#pragma once + +#include <util/system/defaults.h> + +using TDatAlign = int; + +static inline size_t DatFloor(size_t size) { + return (size - 1) & ~(sizeof(TDatAlign) - 1); +} + +static inline size_t DatCeil(size_t size) { + return DatFloor(size) + sizeof(TDatAlign); +} + +static inline void DatSet(void* ptr, size_t size) { + *(TDatAlign*)((char*)ptr + DatFloor(size)) = 0; +} diff --git a/library/cpp/microbdb/compressed.h b/library/cpp/microbdb/compressed.h new file mode 100644 index 0000000000..f0c9edfa92 --- /dev/null +++ 
b/library/cpp/microbdb/compressed.h @@ -0,0 +1,520 @@ +#pragma once + +#include <util/stream/zlib.h> + +#include "microbdb.h" +#include "safeopen.h" + +class TCompressedInputFileManip: public TInputFileManip { +public: + inline i64 GetLength() const { + return -1; // Some microbdb logic rely on unknown size of compressed files + } + + inline i64 Seek(i64 offset, int whence) { + i64 oldPos = DoGetPosition(); + i64 newPos = offset; + switch (whence) { + case SEEK_CUR: + newPos += oldPos; + [[fallthrough]]; // Complier happy. Please fix it! + case SEEK_SET: + break; + default: + return -1L; + } + if (oldPos > newPos) { + VerifyRandomAccess(); + DoSeek(0, SEEK_SET, IsStreamOpen()); + oldPos = 0; + } + const size_t bufsize = 1 << 12; + char buf[bufsize]; + for (i64 i = oldPos; i < newPos; i += bufsize) + InputStream->Read(buf, (i + (i64)bufsize < newPos) ? bufsize : (size_t)(newPos - i)); + return newPos; + } + + i64 RealSeek(i64 offset, int whence) { + InputStream.Destroy(); + i64 ret = DoSeek(offset, whence, !!CompressedInput); + if (ret != -1) + DoStreamOpen(DoCreateStream(), true); + return ret; + } + +protected: + IInputStream* CreateStream(const TFile& file) override { + CompressedInput.Reset(new TUnbufferedFileInput(file)); + return DoCreateStream(); + } + inline IInputStream* DoCreateStream() { + return new TZLibDecompress(CompressedInput.Get(), ZLib::GZip); + //return new TLzqDecompress(CompressedInput.Get()); + } + THolder<IInputStream> CompressedInput; +}; + +class TCompressedBufferedInputFileManip: public TCompressedInputFileManip { +protected: + IInputStream* CreateStream(const TFile& file) override { + CompressedInput.Reset(new TFileInput(file, 0x100000)); + return DoCreateStream(); + } +}; + +using TCompressedInputPageFile = TInputPageFileImpl<TCompressedInputFileManip>; +using TCompressedBufferedInputPageFile = TInputPageFileImpl<TCompressedBufferedInputFileManip>; + +template <class TVal> +struct TGzKey { + ui64 Offset; + TVal Key; + + static const ui32 RecordSig = TVal::RecordSig + 0x50495a47; + + TGzKey() { + } + + TGzKey(ui64 offset, const TVal& key) + : Offset(offset) + , Key(key) + { + } + + size_t SizeOf() const { + if (this) + return sizeof(Offset) + ::SizeOf(&Key); + else { + size_t sizeOfKey = ::SizeOf((TVal*)NULL); + return sizeOfKey ? 
(sizeof(Offset) + sizeOfKey) : 0; + } + } +}; + +template <class TVal> +class TInZIndexFile: protected TInDatFileImpl<TGzKey<TVal>> { + typedef TInDatFileImpl<TGzKey<TVal>> TDatFile; + typedef TGzKey<TVal> TGzVal; + typedef typename TDatFile::TRecIter TRecIter; + typedef typename TRecIter::TPageIter TPageIter; + +public: + TInZIndexFile() + : Index0(nullptr) + { + } + + int Open(const char* fname, size_t pages = 1, int pagesOrBytes = 1, ui32* gotRecordSig = nullptr) { + int ret = TDatFile::Open(fname, pages, pagesOrBytes, gotRecordSig); + if (ret) + return ret; + if (!(Index0 = (TDatPage*)malloc(TPageIter::GetPageSize()))) { + TDatFile::Close(); + return MBDB_NO_MEMORY; + } + if (SizeOf((TGzVal*)NULL)) + RecsOnPage = (TPageIter::GetPageSize() - sizeof(TDatPage)) / DatCeil(SizeOf((TGzVal*)NULL)); + TDatFile::Next(); + memcpy(Index0, TPageIter::Current(), TPageIter::GetPageSize()); + return 0; + } + + int Close() { + free(Index0); + Index0 = NULL; + return TDatFile::Close(); + } + + inline int GetError() const { + return TDatFile::GetError(); + } + + int FindKey(const TVal* akey, const typename TExtInfoType<TVal>::TResult* = NULL) { + assert(IsOpen()); + if (!SizeOf((TVal*)NULL)) + return FindVszKey(akey); + int pageno; + i64 offset; + FindKeyOnPage(pageno, offset, Index0, akey); + TDatPage* page = TPageIter::GotoPage(pageno + 1); + int num_add = (int)offset; + FindKeyOnPage(pageno, offset, page, akey); + return pageno + num_add; + } + + using TDatFile::IsOpen; + + int FindVszKey(const TVal* akey, const typename TExtInfoType<TVal>::TResult* = NULL) { + int pageno; + i64 offset; + FindVszKeyOnPage(pageno, offset, Index0, akey); + TDatPage* page = TPageIter::GotoPage(pageno + 1); + int num_add = (int)offset; + FindVszKeyOnPage(pageno, offset, page, akey); + return pageno + num_add; + } + + i64 FindPage(int pageno) { + if (!SizeOf((TVal*)NULL)) + return FindVszPage(pageno); + int recsize = DatCeil(SizeOf((TGzVal*)NULL)); + TDatPage* page = TPageIter::GotoPage(1 + pageno / RecsOnPage); + if (!page) // can happen if pageno is beyond EOF + return -1; + unsigned int localpageno = pageno % RecsOnPage; + if (localpageno >= page->RecNum) // can happen if pageno is beyond EOF + return -1; + TGzVal* v = (TGzVal*)((char*)page + sizeof(TDatPage) + localpageno * recsize); + return v->Offset; + } + + i64 FindVszPage(int pageno) { + TGzVal* cur = (TGzVal*)((char*)Index0 + sizeof(TDatPage)); + TGzVal* prev = cur; + unsigned int n = 0; + while (n < Index0->RecNum && cur->Offset <= (unsigned int)pageno) { + prev = cur; + cur = (TGzVal*)((char*)cur + DatCeil(SizeOf(cur))); + n++; + } + TDatPage* page = TPageIter::GotoPage(n); + unsigned int num_add = (unsigned int)(prev->Offset); + n = 0; + cur = (TGzVal*)((char*)page + sizeof(TDatPage)); + while (n < page->RecNum && n + num_add < (unsigned int)pageno) { + cur = (TGzVal*)((char*)cur + DatCeil(SizeOf(cur))); + n++; + } + if (n == page->RecNum) // can happen if pageno is beyond EOF + return -1; + return cur->Offset; + } + +protected: + void FindKeyOnPage(int& pageno, i64& offset, TDatPage* page, const TVal* Key) { + int left = 0; + int right = page->RecNum - 1; + int recsize = DatCeil(SizeOf((TGzVal*)NULL)); + while (left < right) { + int middle = (left + right) >> 1; + if (((TGzVal*)((char*)page + sizeof(TDatPage) + middle * recsize))->Key < *Key) + left = middle + 1; + else + right = middle; + } + //borders check (left and right) + pageno = (left == 0 || ((TGzVal*)((char*)page + sizeof(TDatPage) + left * recsize))->Key < *Key) ? 
left : left - 1; + offset = ((TGzVal*)((char*)page + sizeof(TDatPage) + pageno * recsize))->Offset; + } + + void FindVszKeyOnPage(int& pageno, i64& offset, TDatPage* page, const TVal* key) { + TGzVal* cur = (TGzVal*)((char*)page + sizeof(TDatPage)); + ui32 RecordSig = page->RecNum; + i64 tmpoffset = cur->Offset; + for (; RecordSig > 0 && cur->Key < *key; --RecordSig) { + tmpoffset = cur->Offset; + cur = (TGzVal*)((char*)cur + DatCeil(SizeOf(cur))); + } + int idx = page->RecNum - RecordSig - 1; + pageno = (idx >= 0) ? idx : 0; + offset = tmpoffset; + } + + TDatPage* Index0; + int RecsOnPage; +}; + +template <class TKey> +class TCompressedIndexedInputPageFile: public TCompressedInputPageFile { +public: + int GotoPage(int pageno); + +protected: + TInZIndexFile<TKey> KeyFile; +}; + +template <class TVal, class TKey> +class TDirectCompressedInDatFile: public TDirectInDatFile<TVal, TKey, + TInDatFileImpl<TVal, TInputRecordIterator<TVal, + TInputPageIterator<TCompressedIndexedInputPageFile<TKey>>>>> { +}; + +class TCompressedOutputFileManip: public TOutputFileManip { +public: + inline i64 GetLength() const { + return -1; // Some microbdb logic rely on unknown size of compressed files + } + + inline i64 Seek(i64 offset, int whence) { + i64 oldPos = DoGetPosition(); + i64 newPos = offset; + switch (whence) { + case SEEK_CUR: + newPos += oldPos; + [[fallthrough]]; // Compler happy. Please fix it! + case SEEK_SET: + break; + default: + return -1L; + } + if (oldPos > newPos) + return -1L; + + const size_t bufsize = 1 << 12; + char buf[bufsize] = {0}; + for (i64 i = oldPos; i < newPos; i += bufsize) + OutputStream->Write(buf, (i + (i64)bufsize < newPos) ? bufsize : (size_t)(newPos - i)); + return newPos; + } + + i64 RealSeek(i64 offset, int whence) { + OutputStream.Destroy(); + i64 ret = DoSeek(offset, whence, !!CompressedOutput); + if (ret != -1) + DoStreamOpen(DoCreateStream(), true); + return ret; + } + +protected: + IOutputStream* CreateStream(const TFile& file) override { + CompressedOutput.Reset(new TUnbufferedFileOutput(file)); + return DoCreateStream(); + } + inline IOutputStream* DoCreateStream() { + return new TZLibCompress(CompressedOutput.Get(), ZLib::GZip, 1); + } + THolder<IOutputStream> CompressedOutput; +}; + +class TCompressedBufferedOutputFileManip: public TCompressedOutputFileManip { +protected: + IOutputStream* CreateStream(const TFile& file) override { + CompressedOutput.Reset(new TUnbufferedFileOutput(file)); + return DoCreateStream(); + } + inline IOutputStream* DoCreateStream() { + return new TZLibCompress(CompressedOutput.Get(), ZLib::GZip, 1, 0x100000); + } +}; + +using TCompressedOutputPageFile = TOutputPageFileImpl<TCompressedOutputFileManip>; +using TCompressedBufferedOutputPageFile = TOutputPageFileImpl<TCompressedBufferedOutputFileManip>; + +template <class TVal> +class TOutZIndexFile: public TOutDatFileImpl< + TGzKey<TVal>, + TOutputRecordIterator<TGzKey<TVal>, TOutputPageIterator<TOutputPageFile>, TCallbackIndexer>> { + typedef TOutDatFileImpl< + TGzKey<TVal>, + TOutputRecordIterator<TGzKey<TVal>, TOutputPageIterator<TOutputPageFile>, TCallbackIndexer>> + TDatFile; + typedef TOutZIndexFile<TVal> TMyType; + typedef TGzKey<TVal> TGzVal; + typedef typename TDatFile::TRecIter TRecIter; + typedef typename TRecIter::TPageIter TPageIter; + typedef typename TRecIter::TIndexer TIndexer; + +public: + TOutZIndexFile() { + TotalRecNum = 0; + TIndexer::SetCallback(this, DispatchCallback); + } + + int Open(const char* fname, size_t pagesize, size_t pages, int pagesOrBytes = 1) { + 
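+        // Page 0 of the index file is reserved for the top-level index (Index0):
+        // writing therefore starts from page 1 here, and Close() later seeks back
+        // to page 0 to flush Index0.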
int ret = TDatFile::Open(fname, pagesize, pages, pagesOrBytes); + if (ret) + return ret; + if ((ret = TRecIter::GotoPage(1))) + TDatFile::Close(); + return ret; + } + + int Close() { + TPageIter::Unfreeze(); + if (TRecIter::RecNum) + NextPage(TPageIter::Current()); + int ret = 0; + if (Index0.size() && !(ret = TRecIter::GotoPage(0))) { + typename std::vector<TGzVal>::iterator it, end = Index0.end(); + for (it = Index0.begin(); it != end; ++it) + TRecIter::Push(&*it); + ret = (TPageIter::GetPageNum() != 0) ? MBDB_PAGE_OVERFLOW : TPageIter::GetError(); + } + Index0.clear(); + int ret1 = TDatFile::Close(); + return ret ? ret : ret1; + } + +protected: + int TotalRecNum; // should be enough because we have GotoPage(int) + std::vector<TGzVal> Index0; + + void NextPage(const TDatPage* page) { + TGzVal* rec = (TGzVal*)((char*)page + sizeof(TDatPage)); + Index0.push_back(TGzVal(TotalRecNum, rec->Key)); + TotalRecNum += TRecIter::RecNum; + } + + static void DispatchCallback(void* This, const TDatPage* page) { + ((TMyType*)This)->NextPage(page); + } +}; + +template <class TVal, class TKey, class TPageFile = TCompressedOutputPageFile> +class TOutDirectCompressedFileImpl: public TOutDatFileImpl< + TVal, + TOutputRecordIterator<TVal, TOutputPageIterator<TPageFile>, TCallbackIndexer>> { + typedef TOutDatFileImpl< + TVal, + TOutputRecordIterator<TVal, TOutputPageIterator<TPageFile>, TCallbackIndexer>> + TDatFile; + typedef TOutDirectCompressedFileImpl<TVal, TKey, TPageFile> TMyType; + typedef typename TDatFile::TRecIter TRecIter; + typedef typename TRecIter::TPageIter TPageIter; + typedef typename TRecIter::TIndexer TIndexer; + typedef TGzKey<TKey> TMyKey; + typedef TOutZIndexFile<TKey> TKeyFile; + +protected: + using TDatFile::Tell; + +public: + TOutDirectCompressedFileImpl() { + TIndexer::SetCallback(this, DispatchCallback); + } + + int Open(const char* fname, size_t pagesize, size_t ipagesize = 0) { + char iname[FILENAME_MAX]; + int ret; + if (ipagesize == 0) + ipagesize = pagesize; + + ret = TDatFile::Open(fname, pagesize, 1, 1); + ret = ret ? ret : DatNameToIdx(iname, fname); + ret = ret ? ret : KeyFile.Open(iname, ipagesize, 1, 1); + if (ret) + TDatFile::Close(); + return ret; + } + + int Close() { + if (TRecIter::RecNum) + NextPage(TPageIter::Current()); + int ret = KeyFile.Close(); + int ret1 = TDatFile::Close(); + return ret1 ? ret1 : ret; + } + + int GetError() const { + return TDatFile::GetError() ? TDatFile::GetError() : KeyFile.GetError(); + } + +protected: + TKeyFile KeyFile; + + void NextPage(const TDatPage* page) { + size_t sz = SizeOf((TMyKey*)NULL); + TMyKey* rec = KeyFile.Reserve(sz ? 
sz : MaxSizeOf<TMyKey>()); + if (rec) { + rec->Offset = Tell(); + rec->Key = *(TVal*)((char*)page + sizeof(TDatPage)); + KeyFile.ResetDat(); + } + } + + static void DispatchCallback(void* This, const TDatPage* page) { + ((TMyType*)This)->NextPage(page); + } +}; + +template <class TKey> +int TCompressedIndexedInputPageFile<TKey>::GotoPage(int pageno) { + if (Error) + return Error; + + Eof = 0; + + i64 offset = KeyFile.FindPage(pageno); + if (!offset) + return Error = MBDB_BAD_FILE_SIZE; + + if (offset != FileManip.RealSeek(offset, SEEK_SET)) + Error = MBDB_BAD_FILE_SIZE; + + return Error; +} + +template <typename TVal> +class TCompressedInDatFile: public TInDatFile<TVal, TCompressedInputPageFile> { +public: + TCompressedInDatFile(const char* name, size_t pages, int pagesOrBytes = 1) + : TInDatFile<TVal, TCompressedInputPageFile>(name, pages, pagesOrBytes) + { + } +}; + +template <typename TVal> +class TCompressedOutDatFile: public TOutDatFile<TVal, TFakeCompression, TCompressedOutputPageFile> { +public: + TCompressedOutDatFile(const char* name, size_t pagesize, size_t pages, int pagesOrBytes = 1) + : TOutDatFile<TVal, TFakeCompression, TCompressedOutputPageFile>(name, pagesize, pages, pagesOrBytes) + { + } +}; + +template <typename TVal, typename TKey, typename TPageFile = TCompressedOutputPageFile> +class TOutDirectCompressedFile: protected TOutDirectCompressedFileImpl<TVal, TKey, TPageFile> { + typedef TOutDirectCompressedFileImpl<TVal, TKey, TPageFile> TBase; + +public: + TOutDirectCompressedFile(const char* name, size_t pagesize, size_t ipagesize = 0) + : Name(strdup(name)) + , PageSize(pagesize) + , IdxPageSize(ipagesize) + { + } + + ~TOutDirectCompressedFile() { + Close(); + free(Name); + Name = NULL; + } + + void Open(const char* fname) { + int ret = TBase::Open(fname, PageSize, IdxPageSize); + if (ret) + ythrow yexception() << ErrorMessage(ret, "Failed to open output file", fname); + free(Name); + Name = strdup(fname); + } + + void Close() { + int ret; + if ((ret = TBase::GetError())) + if (!std::uncaught_exception()) + ythrow yexception() << ErrorMessage(ret, "Error before closing output file", Name); + if ((ret = TBase::Close())) + if (!std::uncaught_exception()) + ythrow yexception() << ErrorMessage(ret, "Error while closing output file", Name); + } + + const char* GetName() const { + return Name; + } + + using TBase::Freeze; + using TBase::Push; + using TBase::Reserve; + using TBase::Unfreeze; + +protected: + char* Name; + size_t PageSize, IdxPageSize; +}; + +class TCompressedInterFileTypes { +public: + typedef TCompressedBufferedOutputPageFile TOutPageFile; + typedef TCompressedBufferedInputPageFile TInPageFile; +}; diff --git a/library/cpp/microbdb/extinfo.h b/library/cpp/microbdb/extinfo.h new file mode 100644 index 0000000000..c8389e783c --- /dev/null +++ b/library/cpp/microbdb/extinfo.h @@ -0,0 +1,127 @@ +#pragma once + +#include "header.h" + +#include <library/cpp/packedtypes/longs.h> + +#include <util/generic/typetraits.h> + +#include <library/cpp/microbdb/noextinfo.pb.h> + +inline bool operator<(const TNoExtInfo&, const TNoExtInfo&) { + return false; +} + +namespace NMicroBDB { + Y_HAS_MEMBER(TExtInfo); + + template <class, bool> + struct TSelectExtInfo; + + template <class T> + struct TSelectExtInfo<T, false> { + typedef TNoExtInfo TExtInfo; + }; + + template <class T> + struct TSelectExtInfo<T, true> { + typedef typename T::TExtInfo TExtInfo; + }; + + template <class T> + class TExtInfoType { + public: + static const bool Exists = THasTExtInfo<T>::value; + typedef 
typename TSelectExtInfo<T, Exists>::TExtInfo TResult; + }; + + Y_HAS_MEMBER(MakeExtKey); + + template <class, class, bool> + struct TSelectMakeExtKey; + + template <class TVal, class TKey> + struct TSelectMakeExtKey<TVal, TKey, false> { + static inline void Make(TKey* to, typename TExtInfoType<TKey>::TResult*, const TVal* from, const typename TExtInfoType<TVal>::TResult*) { + *to = *from; + } + }; + + template <class TVal, class TKey> + struct TSelectMakeExtKey<TVal, TKey, true> { + static inline void Make(TKey* to, typename TExtInfoType<TKey>::TResult* toExt, const TVal* from, const typename TExtInfoType<TVal>::TResult* fromExt) { + TVal::MakeExtKey(to, toExt, from, fromExt); + } + }; + + template <typename T> + inline size_t SizeOfExt(const T* rec, size_t* /*out*/ extLenSize = nullptr, size_t* /*out*/ extSize = nullptr) { + if (!TExtInfoType<T>::Exists) { + if (extLenSize) + *extLenSize = 0; + if (extSize) + *extSize = 0; + return SizeOf(rec); + } else { + size_t sz = SizeOf(rec); + i64 l; + int els = in_long(l, (const char*)rec + sz); + if (extLenSize) + *extLenSize = static_cast<size_t>(els); + if (extSize) + *extSize = static_cast<size_t>(l); + return sz; + } + } + + template <class T> + bool GetExtInfo(const T* rec, typename TExtInfoType<T>::TResult* extInfo) { + Y_VERIFY(TExtInfoType<T>::Exists, "GetExtInfo should only be used with extended records"); + if (!rec) + return false; + size_t els; + size_t es; + size_t s = SizeOfExt(rec, &els, &es); + const ui8* raw = (const ui8*)rec + s + els; + return extInfo->ParseFromArray(raw, es); + } + + template <class T> + const ui8* GetExtInfoRaw(const T* rec, size_t* len) { + Y_VERIFY(TExtInfoType<T>::Exists, "GetExtInfo should only be used with extended records"); + if (!rec) { + *len = 0; + return nullptr; + } + size_t els; + size_t es; + size_t s = SizeOfExt(rec, &els, &es); + *len = els + es; + return (const ui8*)rec + s; + } + + // Compares serialized extInfo (e.g. for stable sort) + template <class T> + int CompareExtInfo(const T* a, const T* b) { + Y_VERIFY(TExtInfoType<T>::Exists, "CompareExtInfo should only be used with extended records"); + size_t elsA, esA; + size_t elsB, esB; + SizeOfExt(a, &elsA, &esA); + SizeOfExt(a, &elsB, &esB); + if (esA != esB) + return esA - esB; + else + return memcmp((const ui8*)a + elsA, (const ui8*)b + elsB, esA); + } + +} + +using NMicroBDB::TExtInfoType; + +template <class TVal, class TKey> +struct TMakeExtKey { + static const bool Exists = NMicroBDB::THasMakeExtKey<TVal>::value; + static inline void Make(TKey* to, typename TExtInfoType<TKey>::TResult* toExt, const TVal* from, const typename TExtInfoType<TVal>::TResult* fromExt) { + NMicroBDB::TSelectMakeExtKey<TVal, TKey, Exists>::Make(to, toExt, from, fromExt); + } +}; diff --git a/library/cpp/microbdb/file.cpp b/library/cpp/microbdb/file.cpp new file mode 100644 index 0000000000..599a7301a0 --- /dev/null +++ b/library/cpp/microbdb/file.cpp @@ -0,0 +1,220 @@ +#include "file.h" + +#include <fcntl.h> +#include <errno.h> +#include <sys/stat.h> + +#ifdef _win32_ +#define S_ISREG(x) !!(x & S_IFREG) +#endif + +TFileManipBase::TFileManipBase() + : FileBased(true) +{ +} + +i64 TFileManipBase::DoSeek(i64 offset, int whence, bool isStreamOpen) { + if (!isStreamOpen) + return -1; + VerifyRandomAccess(); + return File.Seek(offset, (SeekDir)whence); +} + +int TFileManipBase::DoFileOpen(const TFile& file) { + File = file; + SetFileBased(IsFileBased()); + return (File.IsOpen()) ? 
0 : MBDB_OPEN_ERROR; +} + +int TFileManipBase::DoFileClose() { + if (File.IsOpen()) { + File.Close(); + return MBDB_ALREADY_INITIALIZED; + } + return 0; +} + +int TFileManipBase::IsFileBased() const { + bool fileBased = true; +#if defined(_win_) +#elif defined(_unix_) + FHANDLE h = File.GetHandle(); + struct stat sb; + fileBased = false; + if (h != INVALID_FHANDLE && !::fstat(h, &sb) && S_ISREG(sb.st_mode)) { + fileBased = true; + } +#else +#error +#endif + return fileBased; +} + +TInputFileManip::TInputFileManip() + : InputStream(nullptr) +{ +} + +int TInputFileManip::Open(const char* fname, bool direct) { + int ret; + return (ret = DoClose()) ? ret : DoStreamOpen(TFile(fname, RdOnly | (direct ? DirectAligned : EOpenMode()))); +} + +int TInputFileManip::Open(IInputStream& input) { + int ret; + return (ret = DoClose()) ? ret : DoStreamOpen(&input); +} + +int TInputFileManip::Open(TAutoPtr<IInputStream> input) { + int ret; + return (ret = DoClose()) ? ret : DoStreamOpen(input.Release()); +} + +int TInputFileManip::Init(const TFile& file) { + int ret; + if (ret = DoClose()) + return ret; + DoStreamOpen(file); + return 0; +} + +int TInputFileManip::Close() { + DoClose(); + return 0; +} + +ssize_t TInputFileManip::Read(void* buf, unsigned len) { + if (!IsStreamOpen()) + return -1; + return InputStream->Load(buf, len); +} + +IInputStream* TInputFileManip::CreateStream(const TFile& file) { + return new TUnbufferedFileInput(file); +} + +TMappedInputPageFile::TMappedInputPageFile() + : Pagesize(0) + , Error(0) + , Pagenum(0) + , Recordsig(0) + , Open(false) +{ + Term(); +} + +TMappedInputPageFile::~TMappedInputPageFile() { + Term(); +} + +int TMappedInputPageFile::Init(const char* fname, ui32 recsig, ui32* gotRecordSig, bool) { + Mappedfile.init(fname); + Open = true; + + TDatMetaPage* meta = (TDatMetaPage*)Mappedfile.getData(); + if (gotRecordSig) + *gotRecordSig = meta->RecordSig; + + if (meta->MetaSig != METASIG) + Error = MBDB_BAD_METAPAGE; + else if (meta->RecordSig != recsig) + Error = MBDB_BAD_RECORDSIG; + + if (Error) { + Mappedfile.term(); + return Error; + } + + size_t fsize = Mappedfile.getSize(); + if (fsize < METASIZE) + return Error = MBDB_BAD_FILE_SIZE; + fsize -= METASIZE; + if (fsize % meta->PageSize) + return Error = MBDB_BAD_FILE_SIZE; + Pagenum = (int)(fsize / meta->PageSize); + Pagesize = meta->PageSize; + Recordsig = meta->RecordSig; + Error = 0; + return Error; +} + +int TMappedInputPageFile::Term() { + Mappedfile.term(); + Open = false; + return 0; +} + +TOutputFileManip::TOutputFileManip() + : OutputStream(nullptr) +{ +} + +int TOutputFileManip::Open(const char* fname, EOpenMode mode) { + if (IsStreamOpen()) { + return MBDB_ALREADY_INITIALIZED; // should it be closed as TInputFileManip + } + + try { + if (unlink(fname) && errno != ENOENT) { + if (strncmp(fname, "/dev/std", 8)) + return MBDB_OPEN_ERROR; + } + TFile file(fname, mode); + DoStreamOpen(file); + } catch (const TFileError&) { + return MBDB_OPEN_ERROR; + } + return 0; +} + +int TOutputFileManip::Open(IOutputStream& output) { + if (IsStreamOpen()) + return MBDB_ALREADY_INITIALIZED; + DoStreamOpen(&output); + return 0; +} + +int TOutputFileManip::Open(TAutoPtr<IOutputStream> output) { + if (IsStreamOpen()) + return MBDB_ALREADY_INITIALIZED; + DoStreamOpen(output.Release()); + return 0; +} + +int TOutputFileManip::Init(const TFile& file) { + if (IsStreamOpen()) + return MBDB_ALREADY_INITIALIZED; // should it be closed as TInputFileManip + DoStreamOpen(file); + return 0; +} + +int TOutputFileManip::Rotate(const char* 
newfname) { + if (!IsStreamOpen()) { + return MBDB_NOT_INITIALIZED; + } + + try { + TFile file(newfname, WrOnly | OpenAlways | TruncExisting | ARW | AWOther); + DoClose(); + DoStreamOpen(file); + } catch (const TFileError&) { + return MBDB_OPEN_ERROR; + } + return 0; +} + +int TOutputFileManip::Close() { + DoClose(); + return 0; +} + +int TOutputFileManip::Write(const void* buf, unsigned len) { + if (!IsStreamOpen()) + return -1; + OutputStream->Write(buf, len); + return len; +} + +IOutputStream* TOutputFileManip::CreateStream(const TFile& file) { + return new TUnbufferedFileOutput(file); +} diff --git a/library/cpp/microbdb/file.h b/library/cpp/microbdb/file.h new file mode 100644 index 0000000000..f7c7818375 --- /dev/null +++ b/library/cpp/microbdb/file.h @@ -0,0 +1,225 @@ +#pragma once + +#include "header.h" + +#include <library/cpp/deprecated/mapped_file/mapped_file.h> + +#include <util/generic/noncopyable.h> +#include <util/stream/file.h> +#include <util/system/filemap.h> + +#define FS_BLOCK_SIZE 512 + +class TFileManipBase { +protected: + TFileManipBase(); + + virtual ~TFileManipBase() { + } + + i64 DoSeek(i64 offset, int whence, bool isStreamOpen); + + int DoFileOpen(const TFile& file); + + int DoFileClose(); + + int IsFileBased() const; + + inline void SetFileBased(bool fileBased) { + FileBased = fileBased; + } + + inline i64 DoGetPosition() const { + Y_ASSERT(FileBased); + return File.GetPosition(); + } + + inline i64 DoGetLength() const { + return (FileBased) ? File.GetLength() : -1; + } + + inline void VerifyRandomAccess() const { + Y_VERIFY(FileBased, "non-file stream can not be accessed randomly"); + } + + inline i64 GetPosition() const { + return (i64)File.GetPosition(); + } + +private: + TFile File; + bool FileBased; +}; + +class TInputFileManip: public TFileManipBase { +public: + using TFileManipBase::GetPosition; + + TInputFileManip(); + + int Open(const char* fname, bool direct = false); + + int Open(IInputStream& input); + + int Open(TAutoPtr<IInputStream> input); + + int Init(const TFile& file); + + int Close(); + + ssize_t Read(void* buf, unsigned len); + + inline bool IsOpen() const { + return IsStreamOpen(); + } + + inline i64 GetLength() const { + return DoGetLength(); + } + + inline i64 Seek(i64 offset, int whence) { + return DoSeek(offset, whence, IsStreamOpen()); + } + + inline i64 RealSeek(i64 offset, int whence) { + return Seek(offset, whence); + } + +protected: + inline bool IsStreamOpen() const { + return !!InputStream; + } + + inline int DoStreamOpen(IInputStream* input, bool fileBased = false) { + InputStream.Reset(input); + SetFileBased(fileBased); + return 0; + } + + inline int DoStreamOpen(const TFile& file) { + int ret; + return (ret = DoFileOpen(file)) ? ret : DoStreamOpen(CreateStream(file), IsFileBased()); + } + + virtual IInputStream* CreateStream(const TFile& file); + + inline bool DoClose() { + if (IsStreamOpen()) { + InputStream.Destroy(); + return DoFileClose(); + } + return 0; + } + + THolder<IInputStream> InputStream; +}; + +class TMappedInputPageFile: private TNonCopyable { +public: + TMappedInputPageFile(); + + ~TMappedInputPageFile(); + + inline int GetError() const { + return Error; + } + + inline size_t GetPageSize() const { + return Pagesize; + } + + inline int GetLastPage() const { + return Pagenum; + } + + inline ui32 GetRecordSig() const { + return Recordsig; + } + + inline bool IsOpen() const { + return Open; + } + + inline char* GetData() const { + return Open ? 
(char*)Mappedfile.getData() : nullptr; + } + + inline size_t GetSize() const { + return Open ? Mappedfile.getSize() : 0; + } + +protected: + int Init(const char* fname, ui32 recsig, ui32* gotRecordSig = nullptr, bool direct = false); + + int Term(); + + TMappedFile Mappedfile; + size_t Pagesize; + int Error; + int Pagenum; + ui32 Recordsig; + bool Open; +}; + +class TOutputFileManip: public TFileManipBase { +public: + TOutputFileManip(); + + int Open(const char* fname, EOpenMode mode = WrOnly | CreateAlways | ARW | AWOther); + + int Open(IOutputStream& output); + + int Open(TAutoPtr<IOutputStream> output); + + int Init(const TFile& file); + + int Rotate(const char* newfname); + + int Write(const void* buf, unsigned len); + + int Close(); + + inline bool IsOpen() const { + return IsStreamOpen(); + } + + inline i64 GetLength() const { + return DoGetLength(); + } + + inline i64 Seek(i64 offset, int whence) { + return DoSeek(offset, whence, IsStreamOpen()); + } + + inline i64 RealSeek(i64 offset, int whence) { + return Seek(offset, whence); + } + +protected: + inline bool IsStreamOpen() const { + return !!OutputStream; + } + + inline int DoStreamOpen(IOutputStream* output, bool fileBased = false) { + OutputStream.Reset(output); + SetFileBased(fileBased); + return 0; + } + + inline int DoStreamOpen(const TFile& file) { + int ret; + return (ret = DoFileOpen(file)) ? ret : DoStreamOpen(CreateStream(file), true); + } + + virtual IOutputStream* CreateStream(const TFile& file); + + inline bool DoClose() { + if (IsStreamOpen()) { + OutputStream.Destroy(); + return DoFileClose(); + } + return 0; + } + + THolder<IOutputStream> OutputStream; +}; diff --git a/library/cpp/microbdb/hashes.h b/library/cpp/microbdb/hashes.h new file mode 100644 index 0000000000..bfd113c3ba --- /dev/null +++ b/library/cpp/microbdb/hashes.h @@ -0,0 +1,250 @@ +#pragma once + +#include <library/cpp/on_disk/st_hash/static_hash.h> +#include <util/system/sysstat.h> +#include <util/stream/mem.h> +#include <util/string/printf.h> +#include <library/cpp/deprecated/fgood/fgood.h> + +#include "safeopen.h" + +/** This file currently implements creation of mappable read-only hash file. + Basic usage of these "static hashes" is defined in util/static_hash.h (see docs there). + Additional useful wrappers are available in util/static_hash_map.h + + There are two ways to create mappable hash file: + + A) Fill an THashMap/set structure in RAM, then dump it to disk. + This is usually done by save_hash_to_file* functions defined in static_hash.h + (see description in static_hash.h). + + B) Prepare all data using external sorter, then create hash file straight on disk. + This approach is necessary when there isn't enough RAM to hold entire original THashMap. + Implemented in this file as TStaticHashBuilder class. + + Current implementation's major drawback is that the size of the hash must be estimated + before the hash is built (bucketCount), which is not always possible. + Separate implementation with two sort passes is yet to be done. + + Another problem is that maximum stored size of the element (maxRecSize) must also be + known in advance, because we use TDatSorterMemo, etc. 
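+
+    A rough usage sketch (illustrative only: the THashMap<ui64, ui32> instantiation,
+    the buffer sizes and the element count are assumptions, not something this header
+    prescribes; HashBucketCount() is the helper referred to in Open()'s comment below):
+
+        using TMyHash = THashMap<ui64, ui32>;
+        TStaticHashBuilder<TMyHash, ui32> builder(512 << 20, 64); // 512 MB sorter buffer, <= 64-byte records
+        builder.Open("my.sthash", HashBucketCount(estimatedElementCount));
+        for (const auto& kv : input)                              // kv is TMyHash::value_type
+            builder.Push(kv);
+        builder.Finish();                                         // sorts and writes the hash file
+        builder.Close();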
+ */ + +template <class SizeType> +struct TSthashTmpRec { + SizeType HashVal; + SizeType RecSize; + char Buf[1]; + size_t SizeOf() const { + return &Buf[RecSize] - (char*)this; + } + bool operator<(const TSthashTmpRec& than) const { + return HashVal < than.HashVal; + } + static const ui32 RecordSig = 20100124 + sizeof(SizeType) - 4; +}; + +template <typename T> +struct TReplaceMerger { + T operator()(const T& oldRecord, const T& newRecord) const { + Y_UNUSED(oldRecord); + return newRecord; + } +}; + +/** TStaticHashBuilder template parameters: + HashType - THashMap map/set type for which we construct corresponding mappable hash; + SizeType - type used to store offsets and length in resulting hash; + MergerType - type of object to process records with equal key (see TReplaceMerger for example); + */ + +template <class HashType, class SizeType, class MergerType = TReplaceMerger<typename HashType::mapped_type>> +struct TStaticHashBuilder { + const size_t SrtIOPageSz; + const size_t WrBufSz; + typedef TSthashTmpRec<SizeType> TIoRec; + typedef TSthashWriter<typename HashType::key_type, typename HashType::mapped_type, SizeType> TKeySaver; + typedef typename HashType::value_type TValueType; + typedef typename HashType::mapped_type TMappedType; + typedef typename HashType::key_type TKeyType; + + TDatSorterMemo<TIoRec, TCompareByLess> Srt; + TBuffer IoRec, CurrentBlockRecs; + TKeySaver KeySaver; + typename HashType::hasher Hasher; + typename HashType::key_equal Equals; + MergerType merger; + TString HashFileName; + TString OurTmpDir; + size_t BucketCount; + int FreeBits; + + // memSz is the Sorter buffer size; + // maxRecSize is the maximum size (as reported by size_for_st) of our record(s) + TStaticHashBuilder(size_t memSz, size_t maxRecSize) + : SrtIOPageSz((maxRecSize * 16 + 65535) & ~size_t(65535)) + , WrBufSz(memSz / 16 >= SrtIOPageSz ? 
memSz / 16 : SrtIOPageSz) + , Srt("unused", memSz, SrtIOPageSz, WrBufSz, 0) + , IoRec(sizeof(TIoRec) + maxRecSize) + , CurrentBlockRecs(sizeof(TIoRec) + maxRecSize) + , BucketCount(0) + , FreeBits(0) + { + } + + ~TStaticHashBuilder() { + Close(); + } + + // if tmpDir is supplied, it must exist; + // bucketCount should be HashBucketCount() of the (estimated) element count + void Open(const char* fname, size_t bucketCount, const char* tmpDir = nullptr) { + if (!tmpDir) + tmpDir = ~(OurTmpDir = Sprintf("%s.temp", fname)); + Mkdir(tmpDir, MODE0775); + Srt.Open(tmpDir); + HashFileName = fname; + BucketCount = bucketCount; + int bitCount = 0; + while (((size_t)1 << bitCount) <= BucketCount && bitCount < int(8 * sizeof(size_t))) + ++bitCount; + FreeBits = 8 * sizeof(size_t) - bitCount; + } + + void Push(const TValueType& rec) { + TIoRec* ioRec = MakeIoRec(rec); + Srt.Push(ioRec); + } + TIoRec* MakeIoRec(const TValueType& rec) { + TIoRec* ioRec = (TIoRec*)IoRec.Data(); + size_t mask = (1 << FreeBits) - 1; + size_t hash = Hasher(rec.first); + ioRec->HashVal = ((hash % BucketCount) << FreeBits) + ((hash / BucketCount) & mask); + + TMemoryOutput output(ioRec->Buf, IoRec.Capacity() - offsetof(TIoRec, Buf)); + KeySaver.SaveRecord(&output, rec); + ioRec->RecSize = output.Buf() - ioRec->Buf; + return ioRec; + } + + bool Merge(TVector<std::pair<TKeyType, TMappedType>>& records, size_t newRecordSize) { + TSthashIterator<const TKeyType, const TMappedType, typename HashType::hasher, + typename HashType::key_equal> + newPtr(CurrentBlockRecs.End() - newRecordSize); + for (size_t i = 0; i < records.size(); ++i) { + if (newPtr.KeyEquals(Equals, records[i].first)) { + TMappedType oldValue = records[i].second; + TMappedType newValue = newPtr.Value(); + newValue = merger(oldValue, newValue); + records[i].second = newValue; + return true; + } + } + records.push_back(std::make_pair(newPtr.Key(), newPtr.Value())); + return false; + } + + void PutRecord(const char* buf, size_t rec_size, TFILEPtr& f, SizeType& cur_off) { + f.fsput(buf, rec_size); + cur_off += rec_size; + } + + void Finish() { + Srt.Sort(); + // We use variant 1. + // Variant 1: read sorter once, write records, fseeks to write buckets + // (this doesn't allow fname to be stdout) + // Variant 2: read sorter (probably temp. file) twice: write buckets, then write records + // (this allows fname to be stdout but seems to be longer) + TFILEPtr f(HashFileName, "wb"); + setvbuf(f, nullptr, _IOFBF, WrBufSz); + TVector<SizeType> bucketsBuf(WrBufSz, 0); + // prepare header (note: this code must be unified with save_stl.h) + typedef sthashtable_nvm_sv<typename HashType::hasher, typename HashType::key_equal, SizeType> sv_type; + sv_type sv = {Hasher, Equals, BucketCount, 0, 0}; + // to do: m.b. use just the size of corresponding object? 
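+        // Resulting file layout: [sthashtable_nvm_sv header][(num_buckets + 1) bucket
+        // offsets of SizeType][records]. cur_off starts just past the offset table;
+        // the header itself is written last, at the end of this function.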
+ SizeType cur_off = sizeof(sv_type) + + (sv.num_buckets + 1) * sizeof(SizeType); + SizeType bkt_wroff = sizeof(sv_type), bkt_bufpos = 0, prev_bkt = 0, prev_hash = (SizeType)-1; + bucketsBuf[bkt_bufpos++] = cur_off; + // if might me better to write many zeroes here + f.seek(cur_off, SEEK_SET); + TVector<std::pair<TKeyType, TMappedType>> currentBlock; + bool emptyFile = true; + size_t prevRecSize = 0; + // seek forward + while (true) { + const TIoRec* rec = Srt.Next(); + if (currentBlock.empty() && !emptyFile) { + if (rec && prev_hash == rec->HashVal) { + Merge(currentBlock, prevRecSize); + } else { + // if there is only one record with this hash, don't recode it, just write + PutRecord(CurrentBlockRecs.Data(), prevRecSize, f, cur_off); + sv.num_elements++; + } + } + if (!rec || prev_hash != rec->HashVal) { + // write buckets table + for (size_t i = 0; i < currentBlock.size(); ++i) { + TIoRec* ioRec = MakeIoRec(TValueType(currentBlock[i])); + PutRecord(ioRec->Buf, ioRec->RecSize, f, cur_off); + } + sv.num_elements += currentBlock.size(); + currentBlock.clear(); + CurrentBlockRecs.Clear(); + if (rec) { + prev_hash = rec->HashVal; + } + } + // note: prev_bkt's semantics here is 'cur_bkt - 1', thus we are actually cycling + // until cur_bkt == rec->HashVal *inclusively* + while (!rec || prev_bkt != (rec->HashVal >> FreeBits)) { + bucketsBuf[bkt_bufpos++] = cur_off; + if (bkt_bufpos == bucketsBuf.size()) { + f.seek(bkt_wroff, SEEK_SET); + size_t sz = bkt_bufpos * sizeof(bucketsBuf[0]); + if (f.write(bucketsBuf.begin(), 1, sz) != sz) + throw yexception() << "could not write " << sz << " bytes to " << ~HashFileName; + bkt_wroff += sz; + bkt_bufpos = 0; + f.seek(cur_off, SEEK_SET); + } + prev_bkt++; + if (!rec) { + break; + } + assert(prev_bkt < BucketCount); + } + if (!rec) { + break; + } + emptyFile = false; + CurrentBlockRecs.Append(rec->Buf, rec->RecSize); + if (!currentBlock.empty()) { + Merge(currentBlock, rec->RecSize); + } else { + prevRecSize = rec->RecSize; + } + } + // finish buckets table + f.seek(bkt_wroff, SEEK_SET); + size_t sz = bkt_bufpos * sizeof(bucketsBuf[0]); + if (sz && f.write(bucketsBuf.begin(), 1, sz) != sz) + throw yexception() << "could not write " << sz << " bytes to " << ~HashFileName; + bkt_wroff += sz; + for (; prev_bkt < BucketCount; prev_bkt++) + f.fput(cur_off); + // finally write header + sv.data_end_off = cur_off; + f.seek(0, SEEK_SET); + f.fput(sv); + f.close(); + } + + void Close() { + Srt.Close(); + if (+OurTmpDir) + rmdir(~OurTmpDir); + } +}; diff --git a/library/cpp/microbdb/header.cpp b/library/cpp/microbdb/header.cpp new file mode 100644 index 0000000000..f4511d6fb6 --- /dev/null +++ b/library/cpp/microbdb/header.cpp @@ -0,0 +1,91 @@ +#include "header.h" + +#include <util/stream/output.h> +#include <util/stream/format.h> + +TString ToString(EMbdbErrors error) { + TString ret; + switch (error) { + case MBDB_ALREADY_INITIALIZED: + ret = "already initialized"; + break; + case MBDB_NOT_INITIALIZED: + ret = "not initialized"; + break; + case MBDB_BAD_DESCRIPTOR: + ret = "bad descriptor"; + break; + case MBDB_OPEN_ERROR: + ret = "open error"; + break; + case MBDB_READ_ERROR: + ret = "read error"; + break; + case MBDB_WRITE_ERROR: + ret = "write error"; + break; + case MBDB_CLOSE_ERROR: + ret = "close error"; + break; + case MBDB_EXPECTED_EOF: + ret = "expected eof"; + break; + case MBDB_UNEXPECTED_EOF: + ret = "unxepected eof"; + break; + case MBDB_BAD_FILENAME: + ret = "bad filename"; + break; + case MBDB_BAD_METAPAGE: + ret = "bad metapage"; + break; + case 
MBDB_BAD_RECORDSIG: + ret = "bad recordsig"; + break; + case MBDB_BAD_FILE_SIZE: + ret = "bad file size"; + break; + case MBDB_BAD_PAGESIG: + ret = "bad pagesig"; + break; + case MBDB_BAD_PAGESIZE: + ret = "bad pagesize"; + break; + case MBDB_BAD_PARM: + ret = "bad parm"; + break; + case MBDB_BAD_SYNC: + ret = "bad sync"; + break; + case MBDB_PAGE_OVERFLOW: + ret = "page overflow"; + break; + case MBDB_NO_MEMORY: + ret = "no memory"; + break; + case MBDB_MEMORY_LEAK: + ret = "memory leak"; + break; + case MBDB_NOT_SUPPORTED: + ret = "not supported"; + break; + default: + ret = "unknown"; + break; + } + return ret; +} + +TString ErrorMessage(int error, const TString& text, const TString& path, ui32 recordSig, ui32 gotRecordSig) { + TStringStream str; + str << text; + if (path.size()) + str << " '" << path << "'"; + str << ": " << ToString(static_cast<EMbdbErrors>(error)); + if (recordSig && (!gotRecordSig || recordSig != gotRecordSig)) + str << ". Expected RecordSig: " << Hex(recordSig, HF_ADDX); + if (recordSig && gotRecordSig && recordSig != gotRecordSig) + str << ", got: " << Hex(gotRecordSig, HF_ADDX); + str << ". Last system error text: " << LastSystemErrorText(); + return str.Str(); +} diff --git a/library/cpp/microbdb/header.h b/library/cpp/microbdb/header.h new file mode 100644 index 0000000000..0951d610ea --- /dev/null +++ b/library/cpp/microbdb/header.h @@ -0,0 +1,159 @@ +#pragma once + +#include <util/system/defaults.h> +#include <util/generic/typetraits.h> +#include <util/generic/string.h> +#include <util/str_stl.h> + +#include <stdio.h> + +#define METASIZE (1u << 12) +#define METASIG 0x12345678u +#define PAGESIG 0x87654321u + +enum EMbdbErrors { + MBDB_ALREADY_INITIALIZED = 200, + MBDB_NOT_INITIALIZED = 201, + MBDB_BAD_DESCRIPTOR = 202, + MBDB_OPEN_ERROR = 203, + MBDB_READ_ERROR = 204, + MBDB_WRITE_ERROR = 205, + MBDB_CLOSE_ERROR = 206, + MBDB_EXPECTED_EOF = 207, + MBDB_UNEXPECTED_EOF = 208, + MBDB_BAD_FILENAME = 209, + MBDB_BAD_METAPAGE = 210, + MBDB_BAD_RECORDSIG = 211, + MBDB_BAD_FILE_SIZE = 212, + MBDB_BAD_PAGESIG = 213, + MBDB_BAD_PAGESIZE = 214, + MBDB_BAD_PARM = 215, + MBDB_BAD_SYNC = 216, + MBDB_PAGE_OVERFLOW = 217, + MBDB_NO_MEMORY = 218, + MBDB_MEMORY_LEAK = 219, + MBDB_NOT_SUPPORTED = 220 +}; + +TString ToString(EMbdbErrors error); +TString ErrorMessage(int error, const TString& text, const TString& path = TString(), ui32 recordSig = 0, ui32 gotRecordSig = 0); + +enum EPageFormat { + MBDB_FORMAT_RAW = 0, + MBDB_FORMAT_COMPRESSED = 1, + MBDB_FORMAT_NULL = 255 +}; + +enum ECompressionAlgorithm { + MBDB_COMPRESSION_ZLIB = 1, + MBDB_COMPRESSION_FASTLZ = 2, + MBDB_COMPRESSION_SNAPPY = 3 +}; + +struct TDatMetaPage { + ui32 MetaSig; + ui32 RecordSig; + ui32 PageSize; +}; + +struct TDatPage { + ui32 RecNum; //!< number of records on this page + ui32 PageSig; + ui32 Format : 2; //!< one of EPageFormat + ui32 Reserved : 30; +}; + +/// Additional page header with compression info +struct TCompressedPage { + ui32 BlockCount; + ui32 Algorithm : 4; + ui32 Version : 4; + ui32 Reserved : 24; +}; + +namespace NMicroBDB { + /// Header of compressed block + struct TCompressedHeader { + ui32 Compressed; + ui32 Original; /// original size of block + ui32 Count; /// number of records in block + ui32 Reserved; + }; + + Y_HAS_MEMBER(AssertValid); + + template <typename T, bool TVal> + struct TAssertValid { + void operator()(const T*) { + } + }; + + template <typename T> + struct TAssertValid<T, true> { + void operator()(const T* rec) { + return rec->AssertValid(); + } + }; + + template 
<typename T> + void AssertValid(const T* rec) { + return NMicroBDB::TAssertValid<T, NMicroBDB::THasAssertValid<T>::value>()(rec); + } + + Y_HAS_MEMBER(SizeOf); + + template <typename T, bool TVal> + struct TGetSizeOf; + + template <typename T> + struct TGetSizeOf<T, true> { + size_t operator()(const T* rec) { + return rec->SizeOf(); + } + }; + + template <typename T> + struct TGetSizeOf<T, false> { + size_t operator()(const T*) { + return sizeof(T); + } + }; + + inline char* GetFirstRecord(const TDatPage* page) { + switch (page->Format) { + case MBDB_FORMAT_RAW: + return (char*)page + sizeof(TDatPage); + case MBDB_FORMAT_COMPRESSED: + // Первая запись на сжатой странице сохраняется несжатой + // сразу же после всех заголовков. + // Алгоритм сохранения смотреть в TOutputRecordIterator::FlushBuffer + return (char*)page + sizeof(TDatPage) + sizeof(TCompressedPage) + sizeof(NMicroBDB::TCompressedHeader); + } + return (char*)nullptr; + } +} + +template <typename T> +size_t SizeOf(const T* rec) { + return NMicroBDB::TGetSizeOf<T, NMicroBDB::THasSizeOf<T>::value>()(rec); +} + +template <typename T> +size_t MaxSizeOf() { + return sizeof(T); +} + +static inline int DatNameToIdx(char iname[/*FILENAME_MAX*/], const char* dname) { + if (!dname || !*dname) + return MBDB_BAD_FILENAME; + const char* ptr; + if (!(ptr = strrchr(dname, '/'))) + ptr = dname; + if (!(ptr = strrchr(ptr, '.'))) + ptr = strchr(dname, 0); + if (ptr - dname > FILENAME_MAX - 5) + return MBDB_BAD_FILENAME; + memcpy(iname, dname, ptr - dname); + strcpy(iname + (ptr - dname), ".idx"); + return 0; +} diff --git a/library/cpp/microbdb/heap.h b/library/cpp/microbdb/heap.h new file mode 100644 index 0000000000..ef5a53534c --- /dev/null +++ b/library/cpp/microbdb/heap.h @@ -0,0 +1,143 @@ +#pragma once + +#include "header.h" +#include "extinfo.h" + +#include <util/generic/vector.h> + +#include <errno.h> + +/////////////////////////////////////////////////////////////////////////////// + +/// Default comparator +template <class TVal> +struct TCompareByLess { + inline bool operator()(const TVal* a, const TVal* b) const { + return TLess<TVal>()(*a, *b); + } +}; + +/////////////////////////////////////////////////////////////////////////////// + +template <class TVal, class TIterator, class TCompare = TCompareByLess<TVal>> +class THeapIter { +public: + int Init(TIterator** iters, int count) { + Term(); + if (!count) + return 0; + if (!(Heap = (TIterator**)malloc(count * sizeof(TIterator*)))) + return ENOMEM; + + Count = count; + count = 0; + while (count < Count) + if (count && !(*iters)->Next()) { //here first TIterator is NOT initialized! + Count--; + iters++; + } else { + Heap[count++] = *iters++; + } + count = Count / 2; + while (--count > 0) //Heap[0] is not changed! 
+ Sift(count, Count); //do not try to replace this code by make_heap + return 0; + } + + int Init(TIterator* iters, int count) { + TVector<TIterator*> a(count); + for (int i = 0; i < count; ++i) + a[i] = &iters[i]; + return Init(&a[0], count); + } + + THeapIter() + : Heap(nullptr) + , Count(0) + { + } + + THeapIter(TIterator* a, TIterator* b) + : Heap(nullptr) + , Count(0) + { + TIterator* arr[] = {a, b}; + if (Init(arr, 2)) + ythrow yexception() << "can't Init THeapIter"; + } + + THeapIter(TVector<TIterator>& v) + : Heap(nullptr) + , Count(0) + { + if (Init(&v[0], v.size())) { + ythrow yexception() << "can't Init THeapIter"; + } + } + + ~THeapIter() { + Term(); + } + + inline const TVal* Current() const { + if (!Count) + return nullptr; + return (*Heap)->Current(); + } + + inline const TIterator* CurrentIter() const { + return *Heap; + } + + //for ends of last file will use Heap[0] = Heap[0] ! and + //returns Current of eof so Current of eof MUST return NULL + //possible this is bug and need fixing + const TVal* Next() { + if (!Count) + return nullptr; + if (!(*Heap)->Next()) //on first call unitialized first TIterator + *Heap = Heap[--Count]; //will be correctly initialized + + if (Count == 2) { + if (TCompare()(Heap[1]->Current(), Heap[0]->Current())) + DoSwap(Heap[1], Heap[0]); + } else + Sift(0, Count); + + return Current(); + } + + inline bool GetExtInfo(typename TExtInfoType<TVal>::TResult* extInfo) const { + return (*Heap)->GetExtInfo(extInfo); + } + + inline const ui8* GetExtInfoRaw(size_t* len) const { + return (*Heap)->GetExtInfoRaw(len); + } + + void Term() { + ::free(Heap); + Heap = nullptr; + Count = 0; + } + +protected: + void Sift(int node, int end) { + TIterator* x = Heap[node]; + int son; + for (son = 2 * node + 1; son < end; node = son, son = 2 * node + 1) { + if (son < (end - 1) && TCompare()(Heap[son + 1]->Current(), Heap[son]->Current())) + son++; + if (TCompare()(Heap[son]->Current(), x->Current())) + Heap[node] = Heap[son]; + else + break; + } + Heap[node] = x; + } + + TIterator** Heap; + int Count; +}; + +/////////////////////////////////////////////////////////////////////////////// diff --git a/library/cpp/microbdb/input.h b/library/cpp/microbdb/input.h new file mode 100644 index 0000000000..a214ba6e8a --- /dev/null +++ b/library/cpp/microbdb/input.h @@ -0,0 +1,1027 @@ +#pragma once + +#include "header.h" +#include "file.h" +#include "reader.h" + +#include <util/system/maxlen.h> +#include <util/system/event.h> +#include <util/system/thread.h> + +#include <thread> + +#include <sys/uio.h> + +#include <errno.h> + +template <class TFileManip> +inline ssize_t Readv(TFileManip& fileManip, const struct iovec* iov, int iovcnt) { + ssize_t read_count = 0; + for (int n = 0; n < iovcnt; n++) { + ssize_t last_read = fileManip.Read(iov[n].iov_base, iov[n].iov_len); + if (last_read < 0) + return -1; + read_count += last_read; + } + return read_count; +} + +template <class TVal, typename TBasePageIter> +class TInputRecordIterator: public TBasePageIter { + typedef THolder<NMicroBDB::IBasePageReader<TVal>> TReaderHolder; + +public: + typedef TBasePageIter TPageIter; + + TInputRecordIterator() { + Init(); + } + + ~TInputRecordIterator() { + Term(); + } + + const TVal* Current() const { + return Rec; + } + + bool GetExtInfo(typename TExtInfoType<TVal>::TResult* extInfo) const { + if (!Rec) + return false; + return Reader->GetExtInfo(extInfo); + } + + const ui8* GetExtInfoRaw(size_t* len) const { + if (!Rec) + return nullptr; + return Reader->GetExtInfoRaw(len); + } + + size_t 
GetRecSize() const { + return Reader->GetRecSize(); + } + + size_t GetExtSize() const { + return Reader->GetExtSize(); + } + + const TVal* Next() { + if (RecNum) + --RecNum; + else { + TDatPage* page = TPageIter::Next(); + if (!page) { + if (TPageIter::IsFrozen() && Reader.Get()) + Reader->SetClearFlag(); + return Rec = nullptr; + } else if (!!SelectReader()) + return Rec = nullptr; + RecNum = TPageIter::Current()->RecNum - 1; + } + return Rec = Reader->Next(); + } + + // Skip(0) == Current(); Skip(1) == Next() + const TVal* Skip(int& num) { + // Y_ASSERT(num >= 0); ? otherwise it gets into infinite loop + while (num > RecNum) { + num -= RecNum + 1; + if (!TPageIter::Next() || !!SelectReader()) { + RecNum = 0; + return Rec = nullptr; + } + RecNum = TPageIter::Current()->RecNum - 1; + Rec = Reader->Next(); + } + ++num; + while (--num) + Next(); + return Rec; + } + + // begin reading from next page + void Reset() { + Rec = NULL; + RecNum = 0; + if (Reader.Get()) + Reader->Reset(); + } + +protected: + int Init() { + Rec = nullptr; + RecNum = 0; + Format = MBDB_FORMAT_NULL; + return 0; + } + + int Term() { + Reader.Reset(nullptr); + Format = MBDB_FORMAT_NULL; + Rec = nullptr; + RecNum = 0; + return 0; + } + + const TVal* GotoPage(int pageno) { + if (!TPageIter::GotoPage(pageno) || !!SelectReader()) + return Rec = nullptr; + RecNum = TPageIter::Current()->RecNum - 1; + return Rec = Reader->Next(); + } + + int SelectReader() { + if (!TPageIter::Current()) + return MBDB_UNEXPECTED_EOF; + if (ui32(Format) != TPageIter::Current()->Format) { + switch (TPageIter::Current()->Format) { + case MBDB_FORMAT_RAW: + Reader.Reset(new NMicroBDB::TRawPageReader<TVal, TPageIter>(this)); + break; + case MBDB_FORMAT_COMPRESSED: + Reader.Reset(new NMicroBDB::TCompressedReader<TVal, TPageIter>(this)); + break; + default: + return MBDB_NOT_SUPPORTED; + } + Format = EPageFormat(TPageIter::Current()->Format); + } else { + Y_ASSERT(Reader.Get() != nullptr); + Reader->Reset(); + } + return 0; + } + + const TVal* Rec; + TReaderHolder Reader; + int RecNum; //!< number of records on the current page after the current record + EPageFormat Format; +}; + +template <class TBaseReader> +class TInputPageIterator: public TBaseReader { +public: + typedef TBaseReader TReader; + + TInputPageIterator() + : Buf(nullptr) + { + Term(); + } + + ~TInputPageIterator() { + Term(); + } + + TDatPage* Current() { + return CurPage; + } + + int Freeze() { + return (Frozen = (PageNum == -1) ? 0 : PageNum); + } + + void Unfreeze() { + Frozen = -1; + } + + inline int IsFrozen() const { + return Frozen + 1; + } + + inline size_t GetPageSize() const { + return TReader::GetPageSize(); + } + + inline int GetPageNum() const { + return PageNum; + } + + inline int IsEof() const { + return Eof; + } + + TDatPage* Next() { + if (PageNum >= Maxpage && ReadBuf()) { + Eof = Eof ? 
Eof : TReader::IsEof(); + return CurPage = nullptr; + } + return CurPage = (TDatPage*)(Buf + ((++PageNum) % Bufpages) * GetPageSize()); + } + + TDatPage* GotoPage(int pageno) { + if (pageno <= Maxpage && pageno >= (Maxpage - Pages + 1)) { + PageNum = pageno; + return CurPage = (TDatPage*)(Buf + (PageNum % Bufpages) * GetPageSize()); + } + if (IsFrozen() || TReader::GotoPage(pageno)) + return nullptr; + Maxpage = PageNum = pageno - 1; + Eof = 0; + return Next(); + } + +protected: + int Init(size_t pages, int pagesOrBytes) { + Term(); + if (pagesOrBytes == -1) + Bufpages = TReader::GetLastPage(); + else if (pagesOrBytes) + Bufpages = pages; + else + Bufpages = pages / GetPageSize(); + if (!TReader::GetLastPage()) { + Bufpages = 0; + assert(Eof == 1); + return 0; + } + int lastPage = TReader::GetLastPage(); + if (lastPage >= 0) + Bufpages = (int)Min(lastPage, Bufpages); + Bufpages = Max(2, Bufpages); + Eof = 0; + ABuf.Alloc(Bufpages * GetPageSize()); + return (Buf = ABuf.Begin()) ? 0 : ENOMEM; + // return (Buf = (char*)malloc(Bufpages * GetPageSize())) ? 0 : ENOMEM; + } + + int Term() { + // free(Buf); + ABuf.Dealloc(); + Buf = nullptr; + Maxpage = PageNum = Frozen = -1; + Bufpages = 0; + Pages = 0; + Eof = 1; + CurPage = nullptr; + return 0; + } + + int ReadBuf() { + int nvec; + iovec vec[2]; + int maxpage = (Frozen == -1 ? Maxpage + 1 : Frozen) + Bufpages - 1; + int minpage = Maxpage + 1; + if (maxpage < minpage) + return EAGAIN; + minpage %= Bufpages; + maxpage %= Bufpages; + if (maxpage < minpage) { + vec[0].iov_base = Buf + GetPageSize() * minpage; + vec[0].iov_len = GetPageSize() * (Bufpages - minpage); + vec[1].iov_base = Buf; + vec[1].iov_len = GetPageSize() * (maxpage + 1); + nvec = 2; + } else { + vec[0].iov_base = Buf + GetPageSize() * minpage; + vec[0].iov_len = GetPageSize() * (maxpage - minpage + 1); + nvec = 1; + } + TReader::ReadPages(vec, nvec, &Pages); + Maxpage += Pages; + return !Pages; + } + + int Maxpage, PageNum, Frozen, Bufpages, Eof, Pages; + TDatPage* CurPage; + // TMappedArray<char> ABuf; + TMappedAllocation ABuf; + char* Buf; +}; + +template <class TBaseReader> +class TInputPageIteratorMT: public TBaseReader { +public: + typedef TBaseReader TReader; + + TInputPageIteratorMT() + : CurBuf(0) + , CurReadBuf(0) + , Buf(nullptr) + { + Term(); + } + + ~TInputPageIteratorMT() { + Term(); + } + + TDatPage* Current() { + return CurPage; + } + + int Freeze() { + return (Frozen = (PageNum == -1) ? 0 : PageNum); + } + + void Unfreeze() { + Frozen = -1; + } + + inline int IsFrozen() const { + return Frozen + 1; + } + + inline size_t GetPageSize() const { + return TReader::GetPageSize(); + } + + inline int GetPageNum() const { + return PageNum; + } + + inline int IsEof() const { + return Eof; + } + + TDatPage* Next() { + if (Eof) + return CurPage = nullptr; + if (PageNum >= Maxpage && ReadBuf()) { + Eof = Eof ? 
Eof : TReader::IsEof(); + return CurPage = nullptr; + } + return CurPage = (TDatPage*)(Buf + ((++PageNum) % Bufpages) * GetPageSize()); + } + + TDatPage* GotoPage(int pageno) { + if (pageno <= Maxpage && pageno >= (Maxpage - Pages + 1)) { + PageNum = pageno; + return CurPage = (TDatPage*)(Buf + (PageNum % Bufpages) * GetPageSize()); + } + if (IsFrozen() || TReader::GotoPage(pageno)) + return nullptr; + Maxpage = PageNum = pageno - 1; + Eof = 0; + return Next(); + } + + void ReadPages() { + // fprintf(stderr, "ReadPages started\n"); + bool eof = false; + while (!eof) { + QEvent[CurBuf].Wait(); + if (Finish) + return; + int pages = ReadCurBuf(Bufs[CurBuf]); + PagesM[CurBuf] = pages; + eof = !pages; + AEvent[CurBuf].Signal(); + CurBuf ^= 1; + } + } + +protected: + int Init(size_t pages, int pagesOrBytes) { + Term(); + if (pagesOrBytes == -1) + Bufpages = TReader::GetLastPage(); + else if (pagesOrBytes) + Bufpages = pages; + else + Bufpages = pages / GetPageSize(); + if (!TReader::GetLastPage()) { + Bufpages = 0; + assert(Eof == 1); + return 0; + } + int lastPage = TReader::GetLastPage(); + if (lastPage >= 0) + Bufpages = (int)Min(lastPage, Bufpages); + Bufpages = Max(2, Bufpages); + Eof = 0; + ABuf.Alloc(Bufpages * GetPageSize() * 2); + Bufs[0] = ABuf.Begin(); + Bufs[1] = Bufs[0] + Bufpages * GetPageSize(); + // return (Buf = (char*)malloc(Bufpages * GetPageSize())) ? 0 : ENOMEM; + Finish = false; + ReadThread = std::thread([this]() { + TThread::SetCurrentThreadName("DatReader"); + ReadPages(); + }); + QEvent[0].Signal(); + return Bufs[0] ? 0 : ENOMEM; + } + + void StopThread() { + Finish = true; + QEvent[0].Signal(); + QEvent[1].Signal(); + ReadThread.join(); + } + + int Term() { + // free(Buf); + if (ReadThread.joinable()) + StopThread(); + ABuf.Dealloc(); + Buf = nullptr; + Bufs[0] = nullptr; + Bufs[1] = nullptr; + Maxpage = MaxpageR = PageNum = Frozen = -1; + Bufpages = 0; + Pages = 0; + Eof = 1; + CurPage = nullptr; + return 0; + } + + int ReadCurBuf(char* buf) { + int nvec; + iovec vec[2]; + int maxpage = (Frozen == -1 ? 
MaxpageR + 1 : Frozen) + Bufpages - 1; + int minpage = MaxpageR + 1; + if (maxpage < minpage) + return EAGAIN; + minpage %= Bufpages; + maxpage %= Bufpages; + if (maxpage < minpage) { + vec[0].iov_base = buf + GetPageSize() * minpage; + vec[0].iov_len = GetPageSize() * (Bufpages - minpage); + vec[1].iov_base = buf; + vec[1].iov_len = GetPageSize() * (maxpage + 1); + nvec = 2; + } else { + vec[0].iov_base = buf + GetPageSize() * minpage; + vec[0].iov_len = GetPageSize() * (maxpage - minpage + 1); + nvec = 1; + } + int pages; + TReader::ReadPages(vec, nvec, &pages); + MaxpageR += pages; + return pages; + } + + int ReadBuf() { + QEvent[CurReadBuf ^ 1].Signal(); + AEvent[CurReadBuf].Wait(); + Buf = Bufs[CurReadBuf]; + Maxpage += (Pages = PagesM[CurReadBuf]); + CurReadBuf ^= 1; + return !Pages; + } + + int Maxpage, MaxpageR, PageNum, Frozen, Bufpages, Eof, Pages; + TDatPage* CurPage; + // TMappedArray<char> ABuf; + ui32 CurBuf; + ui32 CurReadBuf; + TMappedAllocation ABuf; + char* Buf; + char* Bufs[2]; + ui32 PagesM[2]; + TAutoEvent QEvent[2]; + TAutoEvent AEvent[2]; + std::thread ReadThread; + bool Finish; +}; + +template <typename TFileManip> +class TInputPageFileImpl: private TNonCopyable { +protected: + TFileManip FileManip; + +public: + TInputPageFileImpl() + : Pagesize(0) + , Fd(-1) + , Eof(1) + , Error(0) + , Pagenum(0) + , Recordsig(0) + { + Term(); + } + + ~TInputPageFileImpl() { + Term(); + } + + inline int IsEof() const { + return Eof; + } + + inline int GetError() const { + return Error; + } + + inline size_t GetPageSize() const { + return Pagesize; + } + + inline int GetLastPage() const { + return Pagenum; + } + + inline ui32 GetRecordSig() const { + return Recordsig; + } + + inline bool IsOpen() const { + return FileManip.IsOpen(); + } + +protected: + int Init(const char* fname, ui32 recsig, ui32* gotrecsig = nullptr, bool direct = false) { + Error = FileManip.Open(fname, direct); + return Error ? 
Error : Init(TFile(), recsig, gotrecsig); + } + + int Init(const TFile& file, ui32 recsig, ui32* gotrecsig = nullptr) { + if (!file.IsOpen() && !FileManip.IsOpen()) + return MBDB_NOT_INITIALIZED; + if (file.IsOpen() && FileManip.IsOpen()) + return MBDB_ALREADY_INITIALIZED; + if (file.IsOpen()) { + Error = FileManip.Init(file); + if (Error) + return Error; + } + + // TArrayHolder<ui8> buf(new ui8[METASIZE + FS_BLOCK_SIZE]); + // ui8* ptr = (buf.Get() + FS_BLOCK_SIZE - ((ui64)buf.Get() & (FS_BLOCK_SIZE - 1))); + TMappedArray<ui8> buf; + buf.Create(METASIZE); + ui8* ptr = &buf[0]; + TDatMetaPage* meta = (TDatMetaPage*)ptr; + ssize_t size = METASIZE; + ssize_t ret; + while (size && (ret = FileManip.Read(ptr, (unsigned)size)) > 0) { + Y_ASSERT(ret <= size); + size -= ret; + ptr += ret; + } + if (size) { + FileManip.Close(); + return Error = MBDB_BAD_METAPAGE; + } + if (gotrecsig) + *gotrecsig = meta->RecordSig; + return Init(TFile(), meta, recsig); + } + + int Init(TAutoPtr<IInputStream> input, ui32 recsig, ui32* gotrecsig = nullptr) { + if (!input && !FileManip.IsOpen()) + return MBDB_NOT_INITIALIZED; + if (FileManip.IsOpen()) + return MBDB_ALREADY_INITIALIZED; + + Error = FileManip.Open(input); + if (Error) + return Error; + + TArrayHolder<ui8> buf(new ui8[METASIZE]); + ui8* ptr = buf.Get(); + ssize_t size = METASIZE; + ssize_t ret; + while (size && (ret = FileManip.Read(ptr, (unsigned)size)) > 0) { + Y_ASSERT(ret <= size); + size -= ret; + ptr += ret; + } + if (size) { + FileManip.Close(); + return Error = MBDB_BAD_METAPAGE; + } + TDatMetaPage* meta = (TDatMetaPage*)buf.Get(); + if (gotrecsig) + *gotrecsig = meta->RecordSig; + return Init(TFile(), meta, recsig); + } + + int Init(const TFile& file, const TDatMetaPage* meta, ui32 recsig) { + if (!file.IsOpen() && !FileManip.IsOpen()) + return MBDB_NOT_INITIALIZED; + if (file.IsOpen() && FileManip.IsOpen()) + return MBDB_ALREADY_INITIALIZED; + if (file.IsOpen()) { + Error = FileManip.Init(file); + if (Error) + return Error; + } + + if (meta->MetaSig != METASIG) + Error = MBDB_BAD_METAPAGE; + else if (meta->RecordSig != recsig) + Error = MBDB_BAD_RECORDSIG; + + if (Error) { + FileManip.Close(); + return Error; + } + + i64 flength = FileManip.GetLength(); + if (flength >= 0) { + i64 fsize = flength; + fsize -= METASIZE; + if (fsize % meta->PageSize) + return Error = MBDB_BAD_FILE_SIZE; + Pagenum = (int)(fsize / meta->PageSize); + } else { + Pagenum = -1; + } + Pagesize = meta->PageSize; + Recordsig = meta->RecordSig; + Error = Eof = 0; + return Error; + } + + int ReadPages(iovec* vec, int nvec, int* pages) { + *pages = 0; + + if (Eof || Error) + return Error; + + ssize_t size = 0, delta = 0, total = 0; + iovec* pvec = vec; + int vsize = nvec; + + while (vsize && (size = Readv(FileManip, pvec, (int)Min(vsize, 16))) > 0) { + total += size; + if (delta) { + size += delta; + pvec->iov_len += delta; + pvec->iov_base = (char*)pvec->iov_base - delta; + delta = 0; + } + while (size) { + if ((size_t)size >= pvec->iov_len) { + size -= pvec->iov_len; + ++pvec; + --vsize; + } else { + delta = size; + pvec->iov_len -= size; + pvec->iov_base = (char*)pvec->iov_base + size; + size = 0; + } + } + } + if (delta) { + pvec->iov_len += delta; + pvec->iov_base = (char*)pvec->iov_base - delta; + } + if (size < 0) + return Error = errno ? 
errno : MBDB_READ_ERROR; + if (total % Pagesize) + return Error = MBDB_BAD_FILE_SIZE; + if (vsize) + Eof = 1; + *pages = total / Pagesize; // it would be better to assign it after the for-loops + for (; total; ++vec, total -= size) + for (size = 0; size < total && (size_t)size < vec->iov_len; size += Pagesize) + if (((TDatPage*)((char*)vec->iov_base + size))->PageSig != PAGESIG) + return Error = MBDB_BAD_PAGESIG; + return Error; + } + + int GotoPage(int page) { + if (Error) + return Error; + Eof = 0; + i64 offset = (i64)page * Pagesize + METASIZE; + if (offset != FileManip.Seek(offset, SEEK_SET)) + Error = MBDB_BAD_FILE_SIZE; + return Error; + } + + int Term() { + return FileManip.Close(); + } + + size_t Pagesize; + int Fd; + int Eof; + int Error; + int Pagenum; //!< number of pages in this file + ui32 Recordsig; +}; + +template <class TBaseReader> +class TMappedInputPageIterator: public TBaseReader { +public: + typedef TBaseReader TReader; + + TMappedInputPageIterator() { + Term(); + } + + ~TMappedInputPageIterator() { + Term(); + } + + TDatPage* Current() { + return CurPage; + } + + inline size_t GetPageSize() const { + return TReader::GetPageSize(); + } + + inline int GetPageNum() const { + return PageNum; + } + + inline int IsEof() const { + return Eof; + } + + inline int IsFrozen() const { + return 0; + } + + TDatPage* Next() { + i64 pos = (i64)(++PageNum) * GetPageSize() + METASIZE; + if (pos < 0 || pos >= (i64)TReader::GetSize()) { + Eof = 1; + return CurPage = nullptr; + } + return CurPage = (TDatPage*)((char*)TReader::GetData() + pos); + } + +protected: + int Init(size_t /*pages*/, int /*pagesOrBytes*/) { + Term(); + Eof = 0; + return 0; + } + + int Term() { + PageNum = -1; + Eof = 1; + CurPage = nullptr; + return 0; + } + + TDatPage* GotoPage(int pageno) { + PageNum = pageno - 1; + Eof = 0; + return Next(); + } + + int PageNum, Eof, Pages, Pagenum; + TDatPage* CurPage; +}; + +using TInputPageFile = TInputPageFileImpl<TInputFileManip>; + +template <class TVal, + typename TBaseRecIter = TInputRecordIterator<TVal, TInputPageIterator<TInputPageFile>>> +class TInDatFileImpl: public TBaseRecIter { +public: + typedef TBaseRecIter TRecIter; + typedef typename TRecIter::TPageIter TPageIter; + typedef typename TRecIter::TPageIter::TReader TReader; + using TRecIter::GotoPage; + + int Open(const char* fname, size_t pages = 1, int pagesOrBytes = 1, ui32* gotRecordSig = nullptr, bool direct = false) { + int ret = TReader::Init(fname, TVal::RecordSig, gotRecordSig, direct); + return ret ? ret : Open2(pages, pagesOrBytes); + } + + int Open(const TFile& file, size_t pages = 1, int pagesOrBytes = 1, ui32* gotRecordSig = nullptr) { + int ret = TReader::Init(file, TVal::RecordSig, gotRecordSig); + return ret ? ret : Open2(pages, pagesOrBytes); + } + + int Open(TAutoPtr<IInputStream> input, size_t pages = 1, int pagesOrBytes = 1, ui32* gotRecordSig = nullptr) { + int ret = TReader::Init(input, TVal::RecordSig, gotRecordSig); + return ret ? ret : Open2(pages, pagesOrBytes); + } + + int Open(const TFile& file, const TDatMetaPage* meta, size_t pages = 1, int pagesOrBytes = 1) { + int ret = TReader::Init(file, meta, TVal::RecordSig); + return ret ? ret : Open2(pages, pagesOrBytes); + } + + int Close() { + int ret1 = TRecIter::Term(); + int ret2 = TPageIter::Term(); + int ret3 = TReader::Term(); + return ret1 ? ret1 : ret2 ? ret2 : ret3; + } + + const TVal* GotoLastPage() { + return TReader::GetLastPage() <= 0 ? 
nullptr : TRecIter::GotoPage(TReader::GetLastPage() - 1); + } + +private: + int Open2(size_t pages, int pagesOrBytes) { + int ret = TPageIter::Init(pages, pagesOrBytes); + if (!ret) + ret = TRecIter::Init(); + if (ret) + Close(); + return ret; + } +}; + +template <class TVal> +class TInIndexFile: protected TInDatFileImpl<TVal> { + typedef TInDatFileImpl<TVal> TDatFile; + typedef typename TDatFile::TRecIter TRecIter; + typedef typename TRecIter::TPageIter TPageIter; + typedef typename TExtInfoType<TVal>::TResult TExtInfo; + +public: + using TDatFile::IsOpen; + + TInIndexFile() + : Index0(nullptr) + { + } + + int Open(const char* fname, size_t pages = 2, int pagesOrBytes = 1, ui32* gotRecordSig = nullptr) { + int ret = TDatFile::Open(fname, pages, pagesOrBytes, gotRecordSig); + if (ret) + return ret; + if (!(Index0 = (TDatPage*)malloc(TPageIter::GetPageSize()))) { + TDatFile::Close(); + return MBDB_NO_MEMORY; + } + if (!TExtInfoType<TVal>::Exists && SizeOf((TVal*)nullptr)) + RecsOnPage = (TPageIter::GetPageSize() - sizeof(TDatPage)) / DatCeil(SizeOf((TVal*)nullptr)); + TDatFile::Next(); + memcpy(Index0, TPageIter::Current(), TPageIter::GetPageSize()); + return 0; + } + + int Close() { + free(Index0); + Index0 = nullptr; + return TDatFile::Close(); + } + + inline int GetError() const { + return TDatFile::GetError(); + } + + int FindKey(const TVal* akey, const TExtInfo* extInfo = nullptr) { + assert(IsOpen()); + if (TExtInfoType<TVal>::Exists || !SizeOf((TVal*)nullptr)) + return FindVszKey(akey, extInfo); + int num = FindKeyOnPage(Index0, akey); + TDatPage* page = TPageIter::GotoPage(num + 1); + if (!page) + return 0; + num = FindKeyOnPage(page, akey); + num += (TPageIter::GetPageNum() - 1) * RecsOnPage; + return num; + } + + int FindVszKey(const TVal* akey, const TExtInfo* extInfo = NULL) { + int num = FindVszKeyOnPage(Index0, akey, extInfo); + int num_add = 0; + for (int p = 0; p < num; p++) { + TDatPage* page = TPageIter::GotoPage(p + 1); + if (!page) + return 0; + num_add += page->RecNum; + } + TDatPage* page = TPageIter::GotoPage(num + 1); + if (!page) + return 0; + num = FindVszKeyOnPage(page, akey, extInfo); + num += num_add; + return num; + } + +protected: + int FindKeyOnPage(TDatPage* page, const TVal* key) { + int left = 0; + int right = page->RecNum - 1; + int recsize = DatCeil(SizeOf((TVal*)nullptr)); + while (left < right) { + int middle = (left + right) >> 1; + if (*((TVal*)((char*)page + sizeof(TDatPage) + middle * recsize)) < *key) + left = middle + 1; + else + right = middle; + } + //borders check (left and right) + return (left == 0 || *((TVal*)((char*)page + sizeof(TDatPage) + left * recsize)) < *key) ? 
left : left - 1; + } + + // will deserialize rawExtinfoA to extInfoA only if necessery + inline bool KeyLess_(const TVal* a, const TVal* b, + TExtInfo* extInfoA, const TExtInfo* extInfoB, + const ui8* rawExtInfoA, size_t rawLen) { + if (*a < *b) { + return true; + } else if (!extInfoB || *b < *a) { + return false; + } else { + // *a == *b && extInfoB + Y_PROTOBUF_SUPPRESS_NODISCARD extInfoA->ParseFromArray(rawExtInfoA, rawLen); + return (*extInfoA < *extInfoB); + } + } + + int FindVszKeyOnPage(TDatPage* page, const TVal* key, const TExtInfo* extInfo) { + TVal* cur = (TVal*)((char*)page + sizeof(TDatPage)); + ui32 recnum = page->RecNum; + if (!TExtInfoType<TVal>::Exists) { + for (; recnum > 0 && *cur < *key; --recnum) + cur = (TVal*)((char*)cur + DatCeil(SizeOf(cur))); + } else { + size_t ll; + size_t l; + size_t sz = NMicroBDB::SizeOfExt(cur, &ll, &l); + TExtInfo ei; + for (; recnum > 0 && KeyLess_(cur, key, &ei, extInfo, (ui8*)cur + sz + ll, l); --recnum) { + cur = (TVal*)((ui8*)cur + DatCeil(sz + ll + l)); + sz = NMicroBDB::SizeOfExt(cur, &ll, &l); + } + } + + int idx = page->RecNum - recnum - 1; + return (idx >= 0) ? idx : 0; + } + + TDatPage* Index0; + int RecsOnPage; +}; + +template <class TVal, class TKey, class TPageIterator = TInputPageIterator<TInputPageFile>> +class TKeyFileMixin: public TInDatFileImpl<TVal, TInputRecordIterator<TVal, TPageIterator>> { +protected: + TInIndexFile<TKey> KeyFile; +}; + +template <class TVal, class TKey, class TBase = TKeyFileMixin<TVal, TKey>> +class TDirectInDatFile: public TBase { + typedef TBase TDatFile; + typedef typename TDatFile::TRecIter TRecIter; + typedef typename TDatFile::TPageIter TPageIter; + +public: + void Open(const char* path, size_t pages = 1, size_t keypages = 1, int pagesOrBytes = 1) { + int ret; + ui32 gotRecordSig = 0; + + ret = TDatFile::Open(path, pages, pagesOrBytes, &gotRecordSig); + if (ret) { + ythrow yexception() << ErrorMessage(ret, "Failed to open input file", path, TVal::RecordSig, gotRecordSig); + } + char KeyName[PATH_MAX + 1]; + if (DatNameToIdx(KeyName, path)) { + ythrow yexception() << ErrorMessage(MBDB_BAD_FILENAME, "Failed to open input file", path); + } + gotRecordSig = 0; + ret = KeyFile.Open(KeyName, keypages, 1, &gotRecordSig); + if (ret) { + ythrow yexception() << ErrorMessage(ret, "Failed to open input keyfile", KeyName, TKey::RecordSig, gotRecordSig); + } + } + + void Close() { + int ret; + + if (TDatFile::IsOpen() && (ret = TDatFile::GetError())) + if (!std::uncaught_exception()) + ythrow yexception() << ErrorMessage(ret, "Error before closing input file"); + if ((ret = TDatFile::Close())) + if (!std::uncaught_exception()) + ythrow yexception() << ErrorMessage(ret, "Error while closing input file"); + + if (KeyFile.IsOpen() && (ret = KeyFile.GetError())) + if (!std::uncaught_exception()) + ythrow yexception() << ErrorMessage(ret, "Error before closing input keyfile"); + if ((ret = KeyFile.Close())) + if (!std::uncaught_exception()) + ythrow yexception() << ErrorMessage(ret, "Error while closing input keyfile"); + } + + const TVal* FindRecord(const TKey* key, const typename TExtInfoType<TKey>::TResult* extInfo = nullptr) { + int page = KeyFile.FindKey(key, extInfo); + const TVal* val = TRecIter::GotoPage(page); + if (!TExtInfoType<TVal>::Exists || !extInfo) { + TKey k; + while (val) { + TMakeExtKey<TVal, TKey>::Make(&k, nullptr, val, nullptr); + if (!(k < *key)) + break; + val = TRecIter::Next(); + } + } else { + typename TExtInfoType<TVal>::TResult valExt; + TKey k; + typename 
TExtInfoType<TKey>::TResult kExt; + while (val) { + TRecIter::GetExtInfo(&valExt); + TMakeExtKey<TVal, TKey>::Make(&k, &kExt, val, &valExt); + if (*key < k || !(k < *key) && !(kExt < *extInfo)) // k > *key || k == *key && kExt >= *extInfo + break; + val = TRecIter::Next(); + } + } + return val; + } + + int FindPagesNo(const TKey* key, const typename TExtInfoType<TVal>::TResult* extInfo = NULL) { + return KeyFile.FindKey(key, extInfo); + } + +protected: + using TBase::KeyFile; +}; diff --git a/library/cpp/microbdb/microbdb.cpp b/library/cpp/microbdb/microbdb.cpp new file mode 100644 index 0000000000..c10dbdf126 --- /dev/null +++ b/library/cpp/microbdb/microbdb.cpp @@ -0,0 +1 @@ +#include "microbdb.h" diff --git a/library/cpp/microbdb/microbdb.h b/library/cpp/microbdb/microbdb.h new file mode 100644 index 0000000000..7521887337 --- /dev/null +++ b/library/cpp/microbdb/microbdb.h @@ -0,0 +1,54 @@ +#pragma once + +#include <util/folder/dirut.h> + +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4706) /*assignment within conditional expression*/ +#pragma warning(disable : 4267) /*conversion from 'size_t' to 'type', possible loss of data*/ +#endif + +#include "align.h" +#include "extinfo.h" +#include "header.h" +#include "reader.h" +#include "heap.h" +#include "file.h" +#include "sorter.h" +#include "input.h" +#include "output.h" +#include "sorterdef.h" + +inline int MakeSorterTempl(char path[/*FILENAME_MAX*/], const char* prefix) { + int ret = MakeTempDir(path, prefix); + if (!ret && strlcat(path, "%06d", FILENAME_MAX) > FILENAME_MAX - 100) + ret = EINVAL; + if (ret) + path[0] = 0; + return ret; +} + +inline int GetMeta(TFile& file, TDatMetaPage* meta) { + ui8 buf[METASIZE], *ptr = buf; + ssize_t size = sizeof(buf), ret; + while (size && (ret = file.Read(ptr, size)) > 0) { + size -= ret; + ptr += ret; + } + if (size) + return MBDB_BAD_FILE_SIZE; + ptr = buf; // gcc 4.4 warning fix + *meta = *(TDatMetaPage*)ptr; + return (meta->MetaSig == METASIG) ? 
0 : MBDB_BAD_METAPAGE; +} + +template <class TRec> +inline bool IsDatFile(const char* fname) { + TDatMetaPage meta; + TFile f(fname, RdOnly); + return !GetMeta(f, &meta) && meta.RecordSig == TRec::RecordSig; +} + +#if defined(_MSC_VER) +#pragma warning(pop) +#endif diff --git a/library/cpp/microbdb/noextinfo.proto b/library/cpp/microbdb/noextinfo.proto new file mode 100644 index 0000000000..6a78882e07 --- /dev/null +++ b/library/cpp/microbdb/noextinfo.proto @@ -0,0 +1,4 @@ + +message TNoExtInfo { +} + diff --git a/library/cpp/microbdb/output.h b/library/cpp/microbdb/output.h new file mode 100644 index 0000000000..d0ecab2108 --- /dev/null +++ b/library/cpp/microbdb/output.h @@ -0,0 +1,1049 @@ +#pragma once + +#include "header.h" +#include "file.h" + +#include <util/generic/buffer.h> +#include <util/memory/tempbuf.h> + +#include <sys/uio.h> + +template <class TFileManip> +inline ssize_t Writev(TFileManip& fileManip, const struct iovec* iov, int iovcnt) { + ssize_t written_count = 0; + for (int n = 0; n < iovcnt; n++) { + ssize_t last_write = fileManip.Write(iov[n].iov_base, iov[n].iov_len); + if (last_write < 0) + return -1; + written_count += last_write; + } + return written_count; +} + +//********************************************************************* +struct TFakeIndexer { + inline void NextPage(TDatPage*) noexcept { + } +}; + +struct TCallbackIndexer { + typedef void (*TCallback)(void* This, const TDatPage* page); + + TCallbackIndexer() { + Callback = nullptr; + } + + void SetCallback(void* t, TCallback c) { + This = t; + Callback = c; + } + + void NextPage(TDatPage* dat) { + Callback(This, dat); + } + + TCallback Callback; + void* This; +}; + +template <class TVal, typename TBasePageIter, typename TBaseIndexer = TFakeIndexer, typename TCompressor = TFakeCompression> +class TOutputRecordIterator; + +template <class TVal, typename TBasePageIter, typename TBaseIndexer> +class TOutputRecordIterator<TVal, TBasePageIter, TBaseIndexer, TFakeCompression> + : public TBasePageIter, public TBaseIndexer { +public: + enum EOffset { + WrongOffset = size_t(-1) + }; + + typedef TBasePageIter TPageIter; + typedef TBaseIndexer TIndexer; + + TOutputRecordIterator() { + Clear(); + } + + ~TOutputRecordIterator() { + Term(); + } + + inline const TVal* Current() const { + return Rec; + } + + const TVal* Push(const TVal* v, const typename TExtInfoType<TVal>::TResult* extInfo = nullptr) { + NMicroBDB::AssertValid(v); + size_t len = SizeOf(v); + if (!TExtInfoType<TVal>::Exists) + return (Reserve(len)) ? (TVal*)memcpy(Rec, v, len) : nullptr; + else if (extInfo) { + size_t extSize = extInfo->ByteSize(); + size_t extLenSize = len_long((i64)extSize); + if (!Reserve(len + extLenSize + extSize)) + return nullptr; + memcpy(Rec, v, len); + out_long((i64)extSize, (char*)Rec + len); + extInfo->SerializeWithCachedSizesToArray((ui8*)Rec + len + extLenSize); + return Rec; + } else { + size_t extLenSize = len_long((i64)0); + if (!Reserve(len + extLenSize)) + return nullptr; + memcpy(Rec, v, len); + out_long((i64)0, (char*)Rec + len); + return Rec; + } + } + + const TVal* Push(const TVal* v, const ui8* extInfoRaw, size_t extLen) { + NMicroBDB::AssertValid(v); + size_t sz = SizeOf(v); + if (!Reserve(sz + extLen)) + return nullptr; + memcpy(Rec, v, sz); + memcpy((ui8*)Rec + sz, extInfoRaw, extLen); + return Rec; + } + + // use values stored in microbdb readers/writers internal buffer only. 
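+ // (records obtained from microbdb page iterators keep the length-prefixed extInfo contiguously after the record, which is what the single memcpy below relies on)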
+ // method expects serialized extInfo after this record + const TVal* PushWithExtInfo(const TVal* v) { + NMicroBDB::AssertValid(v); + size_t extSize; + size_t extLenSize; + size_t sz = NMicroBDB::SizeOfExt(v, &extLenSize, &extSize); + sz += extLenSize + extSize; + if (!Reserve(sz)) + return nullptr; + memcpy(Rec, v, sz); + return Rec; + } + + TVal* Reserve(size_t len) { + if (CurLen + DatCeil(len) > TPageIter::GetPageSize()) { + if (sizeof(TDatPage) + DatCeil(len) > TPageIter::GetPageSize()) + return Rec = nullptr; + if (TPageIter::Current() && RecNum) { + TPageIter::Current()->RecNum = RecNum; + TPageIter::Current()->Format = MBDB_FORMAT_RAW; + memset((char*)TPageIter::Current() + CurLen, 0, TPageIter::GetPageSize() - CurLen); + TIndexer::NextPage(TPageIter::Current()); + RecNum = 0; + } + if (!TPageIter::Next()) { + CurLen = TPageIter::GetPageSize(); + return Rec = nullptr; + } + CurLen = sizeof(TDatPage); + } + LenForOffset = CurLen; + Rec = (TVal*)((char*)TPageIter::Current() + CurLen); + DatSet(Rec, len); + + CurLen += DatCeil(len); + + ++RecNum; + return Rec; + } + + void Flush() { + TPageIter::Current()->RecNum = RecNum; + TPageIter::Current()->Format = MBDB_FORMAT_RAW; + } + + size_t Offset() const { + return Rec ? TPageIter::Offset() + LenForOffset : WrongOffset; + } + + void ResetDat() { + CurLen = (char*)Rec - (char*)TPageIter::Current(); + size_t len; + if (!TExtInfoType<TVal>::Exists) { + len = SizeOf(Rec); + } else { + size_t ll; + size_t l; + len = NMicroBDB::SizeOfExt(Rec, &ll, &l); + len += ll + l; + } + CurLen += DatCeil(len); + } + +protected: + void Clear() { + Rec = nullptr; + RecNum = 0; + CurLen = 0; + LenForOffset = 0; + } + + int Init() { + Clear(); + CurLen = TPageIter::GetPageSize(); + return 0; + } + + int Term() { + if (TPageIter::Current()) { + TPageIter::Current()->RecNum = RecNum; + TPageIter::Current()->Format = MBDB_FORMAT_RAW; + memset((char*)TPageIter::Current() + CurLen, 0, TPageIter::GetPageSize() - CurLen); + RecNum = 0; + } + int ret = !TPageIter::Current() && RecNum; + Clear(); + return ret; + } + + int GotoPage(int pageno) { + if (TPageIter::Current()) { + TPageIter::Current()->RecNum = RecNum; + TPageIter::Current()->Format = MBDB_FORMAT_RAW; + memset((char*)TPageIter::Current() + CurLen, 0, TPageIter::GetPageSize() - CurLen); + } + int ret = TPageIter::GotoPage(pageno); + if (!ret) { + RecNum = 0; + CurLen = sizeof(TDatPage); + } + return ret; + } + + TVal* Rec; + int RecNum; + size_t CurLen; + size_t LenForOffset; +}; + +template <class TVal, typename TBasePageIter, typename TBaseIndexer, typename TAlgorithm> +class TOutputRecordIterator + : public TBasePageIter, + public TBaseIndexer, + private TAlgorithm { + class TPageBuffer { + public: + void Init(size_t page) { + Pos = 0; + RecNum = 0; + Size = Min(page / 2, size_t(64 << 10)); + Data.Reset(new ui8[Size]); + } + + void Clear() { + Pos = 0; + RecNum = 0; + } + + inline bool Empty() const { + return RecNum == 0; + } + + public: + size_t Size; + size_t Pos; + int RecNum; + TArrayHolder<ui8> Data; + }; + +public: + typedef TBasePageIter TPageIter; + typedef TBaseIndexer TIndexer; + + TOutputRecordIterator() + : Rec(nullptr) + , RecNum(0) + { + } + + ~TOutputRecordIterator() { + Term(); + } + + const TVal* Current() const { + return Rec; + } + + const TVal* Push(const TVal* v, const typename TExtInfoType<TVal>::TResult* extInfo = nullptr) { + NMicroBDB::AssertValid(v); + size_t len = SizeOf(v); + if (!TExtInfoType<TVal>::Exists) + return (Reserve(len)) ? 
(TVal*)memcpy((TVal*)Rec, v, len) : nullptr; + else if (extInfo) { + size_t extSize = extInfo->ByteSize(); + size_t extLenSize = len_long((i64)extSize); + if (!Reserve(len + extLenSize + extSize)) + return nullptr; + memcpy(Rec, v, len); + out_long((i64)extSize, (char*)Rec + len); + extInfo->SerializeWithCachedSizesToArray((ui8*)Rec + len + extLenSize); + return Rec; + } else { + size_t extLenSize = len_long((i64)0); + if (!Reserve(len + extLenSize)) + return nullptr; + memcpy(Rec, v, len); + out_long((i64)0, (char*)Rec + len); + return Rec; + } + } + + const TVal* Push(const TVal* v, const ui8* extInfoRaw, size_t extLen) { + NMicroBDB::AssertValid(v); + size_t sz = SizeOf(v); + if (!Reserve(sz + extLen)) + return NULL; + memcpy(Rec, v, sz); + memcpy((ui8*)Rec + sz, extInfoRaw, extLen); + return Rec; + } + + // use values stored in microbdb readers/writers internal buffer only. + // method expects serialized extInfo after this record + const TVal* PushWithExtInfo(const TVal* v) { + NMicroBDB::AssertValid(v); + size_t extSize; + size_t extLenSize; + size_t sz = NMicroBDB::SizeOfExt(v, &extLenSize, &extSize); + sz += extLenSize + extSize; + if (!Reserve(sz)) + return nullptr; + memcpy(Rec, v, sz); + return Rec; + } + + TVal* Reserve(const size_t len) { + const size_t aligned = DatCeil(len); + + if (!TPageIter::Current()) { // Allocate fist page + if (!TPageIter::Next()) { + CurLen = TPageIter::GetPageSize(); + return Rec = nullptr; + } + CurLen = sizeof(TDatPage) + sizeof(TCompressedPage); + } + + if (Buffer.Pos + aligned > Buffer.Size) { + if (Buffer.Pos == 0) + return Rec = nullptr; + if (FlushBuffer()) + return Rec = nullptr; + if (Buffer.Pos + aligned + sizeof(TDatPage) + sizeof(TCompressedPage) > Buffer.Size) + return Rec = nullptr; + } + + Rec = (TVal*)((char*)Buffer.Data.Get() + Buffer.Pos); + DatSet(Rec, len); // len is correct because DatSet set align tail to zero + + Buffer.RecNum++; + Buffer.Pos += aligned; + ++RecNum; + return Rec; + } + + void Flush() { + if (!Buffer.Empty()) { + FlushBuffer(); + TPageIter::Current()->RecNum = RecNum; + TPageIter::Current()->Format = MBDB_FORMAT_COMPRESSED; + } + } + + size_t Offset() const { + // According to vadya@ there is no evil to return 0 all the time + return 0; + } + + void ResetDat() { + Buffer.Pos = (char*)Rec - (char*)Buffer.Data.Get(); + size_t len = SizeOf(Rec); + Buffer.Pos += DatCeil(len); + } + +protected: + void Clear() { + RecNum = 0; + Rec = nullptr; + Count = 0; + CurLen = sizeof(TDatPage) + sizeof(TCompressedPage); + Buffer.Clear(); + } + + int Init() { + Clear(); + Buffer.Init(TPageIter::GetPageSize()); + TAlgorithm::Init(); + return 0; + } + + int Term() { + if (TPageIter::Current()) + Commit(); + int ret = !TPageIter::Current() && RecNum; + Clear(); + TAlgorithm::Term(); + return ret; + } + + int GotoPage(int pageno) { + if (TPageIter::Current()) + Commit(); + int ret = TPageIter::GotoPage(pageno); + if (!ret) + Reset(); + return ret; + } + +private: + void Commit() { + Flush(); + TPageIter::Current()->RecNum = RecNum; + TPageIter::Current()->Format = MBDB_FORMAT_COMPRESSED; + SetCompressedPageHeader(); + + memset((char*)TPageIter::Current() + CurLen, 0, TPageIter::GetPageSize() - CurLen); + RecNum = 0; + Count = 0; + } + + inline void SetCompressedPageHeader() { + TCompressedPage* const hdr = (TCompressedPage*)((ui8*)TPageIter::Current() + sizeof(TDatPage)); + + hdr->BlockCount = Count; + hdr->Algorithm = TAlgorithm::Code; + hdr->Version = 0; + hdr->Reserved = 0; + } + + inline void Reset() { + RecNum = 0; + CurLen = 
sizeof(TDatPage) + sizeof(TCompressedPage); + Count = 0; + Buffer.Clear(); + } + + int FlushBuffer() { + TArrayHolder<ui8> data; + const ui8* const buf = Buffer.Data.Get(); + size_t first = 0; + + if (!TExtInfoType<TVal>::Exists) + first = DatCeil(SizeOf((TVal*)buf)); + else { + size_t ll; + size_t l; + first = NMicroBDB::SizeOfExt((const TVal*)buf, &ll, &l); + first = DatCeil(first + ll + l); + } + + size_t total = sizeof(NMicroBDB::TCompressedHeader) + first + ((Buffer.RecNum == 1) ? 0 : TAlgorithm::CompressBound(Buffer.Pos - first)); + size_t real = total; + + { + ui8* p = nullptr; + NMicroBDB::TCompressedHeader* hdr = nullptr; + + // 1. Choose data destination (temporary buffer or dat-page) + if (CurLen + total > TPageIter::GetPageSize()) { + data.Reset(new ui8[total]); + + hdr = (NMicroBDB::TCompressedHeader*)data.Get(); + p = data.Get() + sizeof(NMicroBDB::TCompressedHeader); + } else { + p = (ui8*)TPageIter::Current() + CurLen; + hdr = (NMicroBDB::TCompressedHeader*)p; + p += sizeof(NMicroBDB::TCompressedHeader); + } + + // 2. Compress data + + // Fill header and first record + hdr->Original = Buffer.Pos; + hdr->Compressed = 0; + hdr->Count = Buffer.RecNum; + hdr->Reserved = 0; + memcpy(p, Buffer.Data.Get(), first); + // Fill compressed part + if (Buffer.RecNum > 1) { + size_t size = TAlgorithm::CompressBound(Buffer.Pos - first); + + p += first; + TAlgorithm::Compress(p, size, buf + first, Buffer.Pos - first); + + hdr->Compressed = size; + + real = sizeof(NMicroBDB::TCompressedHeader) + first + size; + } + } + + Y_ASSERT(sizeof(TDatPage) + sizeof(TCompressedPage) + real <= TPageIter::GetPageSize()); + + // 3. Check page capacity + + if (CurLen + real > TPageIter::GetPageSize()) { + Y_ASSERT(data.Get() != nullptr); + + if (TPageIter::Current() && RecNum) { + RecNum = RecNum - Buffer.RecNum; + TPageIter::Current()->RecNum = RecNum; + TPageIter::Current()->Format = MBDB_FORMAT_COMPRESSED; + SetCompressedPageHeader(); + memset((char*)TPageIter::Current() + CurLen, 0, TPageIter::GetPageSize() - CurLen); + TIndexer::NextPage(TPageIter::Current()); + RecNum = Buffer.RecNum; + Count = 0; + } + if (!TPageIter::Next()) { + CurLen = TPageIter::GetPageSize(); + return MBDB_NO_MEMORY; + } + CurLen = sizeof(TDatPage) + sizeof(TCompressedPage); + } + + // 4. Flush data and reset buffer state + + if (data.Get()) + memcpy((ui8*)TPageIter::Current() + CurLen, data.Get(), real); + CurLen += real; + ++Count; + Buffer.Clear(); + return 0; + } + +private: + size_t CurLen; + TPageBuffer Buffer; + TVal* Rec; + ui32 Count; //! < count of compressed blocks on page +public: + int RecNum; +}; + +template <typename TBaseWriter> +class TOutputPageIterator: public TBaseWriter { +public: + typedef TBaseWriter TWriter; + + TOutputPageIterator() + : Buf(nullptr) + { + Clear(); + } + + ~TOutputPageIterator() { + Term(); + } + + TDatPage* Current() { + return CurPage; + } + + size_t Offset() const { + //Cout << "PS = " << TWriter::GetPageSize() << "; PN = " << PageNum << "; MS = " << METASIZE << Endl; + return TWriter::GetPageSize() * PageNum + METASIZE; + } + + int Freeze() { + return (Frozen = (PageNum == -1) ? 
0 : (int)PageNum); + } + + void Unfreeze() { + Frozen = -1; + } + + inline int IsFrozen() const { + return Frozen + 1; + } + + inline size_t GetPageSize() const { + return TWriter::GetPageSize(); + } + + inline int GetPageNum() const { + return (int)PageNum; + } + + TDatPage* Next() { + if (PageNum >= Maxpage && WriteBuf()) + return CurPage = nullptr; + CurPage = (TDatPage*)(Buf + ((++PageNum) % Bufpages) * GetPageSize()); + memset(CurPage, 0, sizeof(TDatPage)); + return CurPage; + } + +protected: + int Init(size_t pages, int pagesOrBytes) { + Term(); + if (pagesOrBytes) + Bufpages = pages; + else + Bufpages = pages / GetPageSize(); + Bufpages = Max<size_t>(1, Bufpages); + Maxpage = Bufpages - 1; + // if (!(Buf = (char*)malloc(Bufpages * GetPageSize()))) + // return ENOMEM; + ABuf.Alloc(Bufpages * GetPageSize()); + Buf = ABuf.Begin(); + if (TWriter::Memo) + Freeze(); + return 0; + } + + int Term() { + Unfreeze(); + int ret = (PageNum < 0) ? 0 : WriteBuf(); + Clear(); + return ret; + } + + int GotoPage(int pageno) { + int ret = EAGAIN; + if (IsFrozen() || PageNum >= 0 && ((ret = WriteBuf())) || ((ret = TWriter::GotoPage(pageno)))) + return ret; + PageNum = pageno; + Maxpage = Bufpages - 1 + pageno; + CurPage = (TDatPage*)(Buf + (PageNum % Bufpages) * GetPageSize()); + memset(CurPage, 0, sizeof(TDatPage)); + return 0; + } + + void Clear() { + ABuf.Dealloc(); + Buf = nullptr; + Maxpage = PageNum = Frozen = -1; + Bufpages = 0; + CurPage = nullptr; + } + + int WriteBuf() { + int nvec; + iovec vec[2]; + ssize_t minpage = Maxpage - Bufpages + 1; + ssize_t maxpage = Frozen == -1 ? PageNum : Frozen - 1; + if (maxpage < minpage) + return EAGAIN; + minpage %= Bufpages; + maxpage %= Bufpages; + if (maxpage < minpage) { + vec[0].iov_base = Buf + GetPageSize() * minpage; + vec[0].iov_len = GetPageSize() * (Bufpages - minpage); + vec[1].iov_base = Buf; + vec[1].iov_len = GetPageSize() * (maxpage + 1); + nvec = 2; + } else { + vec[0].iov_base = Buf + GetPageSize() * minpage; + vec[0].iov_len = GetPageSize() * (maxpage - minpage + 1); + nvec = 1; + } + if (TWriter::WritePages(vec, nvec)) + return EIO; + Maxpage += (maxpage < minpage) ? (Bufpages - minpage + maxpage + 1) : (maxpage - minpage + 1); + return 0; + } + + ssize_t Maxpage; + ssize_t Bufpages; + ssize_t PageNum; + int Frozen; + TDatPage* CurPage; + char* Buf; + TMappedAllocation ABuf; +}; + +template <class TFileManip> +class TOutputPageFileImpl: private TNonCopyable { +public: + TOutputPageFileImpl() + : Pagesize(0) + , Eof(1) + , Error(0) + , Memo(0) + , Recordsig(0) + { + } + + ~TOutputPageFileImpl() { + Term(); + } + + inline int IsEof() const { + return Eof; + } + + inline int GetError() const { + return Error; + } + + inline bool IsOpen() const { + return FileManip.IsOpen(); + } + + inline size_t GetPageSize() const { + return Pagesize; + } + + inline ui32 GetRecordSig() const { + return Recordsig; + } + + int Init(const char* fname, size_t pagesize, ui32 recsig, bool direct = false) { + Memo = 0; + if (FileManip.IsOpen()) + return MBDB_ALREADY_INITIALIZED; + + if (!fname) { + Eof = Error = 0; + Pagesize = pagesize; + Recordsig = recsig; + Memo = 1; + return 0; + } + + Error = FileManip.Open(fname, WrOnly | CreateAlways | ARW | AWOther | (direct ? 
DirectAligned : EOpenMode())); + if (Error) + return Error; + Error = Init(TFile(), pagesize, recsig); + if (Error) { + FileManip.Close(); + unlink(fname); + } + return Error; + } + + int Init(TAutoPtr<IOutputStream> output, size_t pagesize, ui32 recsig) { + Memo = 0; + if (FileManip.IsOpen()) { + return MBDB_ALREADY_INITIALIZED; + } + + if (!output) { + Eof = Error = 0; + Pagesize = pagesize; + Recordsig = recsig; + Memo = 1; + return 0; + } + + Error = FileManip.Open(output); + if (Error) + return Error; + Error = Init(TFile(), pagesize, recsig); + if (Error) { + FileManip.Close(); + } + return Error; + } + + int Init(const TFile& file, size_t pagesize, ui32 recsig) { + Memo = 0; + if (!file.IsOpen() && !FileManip.IsOpen()) + return MBDB_NOT_INITIALIZED; + if (file.IsOpen() && FileManip.IsOpen()) + return MBDB_ALREADY_INITIALIZED; + if (file.IsOpen()) { + Error = FileManip.Init(file); + if (Error) + return Error; + } + + Eof = 1; + TTempBuf buf(METASIZE + FS_BLOCK_SIZE); + const char* ptr = (buf.Data() + FS_BLOCK_SIZE - ((ui64)buf.Data() & (FS_BLOCK_SIZE - 1))); + TDatMetaPage* meta = (TDatMetaPage*)ptr; + + memset(buf.Data(), 0, buf.Size()); + meta->MetaSig = METASIG; + meta->PageSize = Pagesize = pagesize; + meta->RecordSig = Recordsig = recsig; + + ssize_t size = METASIZE, ret = 0; + while (size && (ret = FileManip.Write(ptr, (unsigned)size)) > 0) { + size -= ret; + ptr += ret; + } + if (size || ret <= 0) { + Term(); + return Error = errno ? errno : MBDB_WRITE_ERROR; + } + + Error = Eof = 0; + return Error; + } + +protected: + int WritePages(iovec* vec, int nvec) { + if (Error || Memo) + return Error; + + ssize_t size, delta; + iovec* pvec; + int vsize; + + for (vsize = 0, pvec = vec; vsize < nvec; vsize++, pvec++) + for (size = 0; (size_t)size < pvec->iov_len; size += Pagesize) + ((TDatPage*)((char*)pvec->iov_base + size))->PageSig = PAGESIG; + + delta = size = 0; + pvec = vec; + vsize = nvec; + while (vsize && (size = Writev(FileManip, pvec, (int)Min(vsize, 16))) > 0) { + if (delta) { + size += delta; + pvec->iov_len += delta; + pvec->iov_base = (char*)pvec->iov_base - delta; + delta = 0; + } + while (size) { + if ((size_t)size >= pvec->iov_len) { + size -= pvec->iov_len; + ++pvec; + --vsize; + } else { + delta = size; + pvec->iov_len -= size; + pvec->iov_base = (char*)pvec->iov_base + size; + size = 0; + } + } + } + if (delta) { + pvec->iov_len += delta; + pvec->iov_base = (char*)pvec->iov_base - delta; + } + return Error = (!size && !vsize) ? 0 : errno ? 
errno : MBDB_WRITE_ERROR; + } + + i64 Tell() { + return FileManip.RealSeek(0, SEEK_CUR); + } + + int GotoPage(int pageno) { + if (Error || Memo) + return Error; + Eof = 0; + i64 offset = (i64)pageno * Pagesize + METASIZE; + if (offset != FileManip.Seek(offset, SEEK_SET)) + Error = MBDB_BAD_FILE_SIZE; + return Error; + } + + int Term() { + int ret = FileManip.Close(); + Eof = 1; + Memo = 0; + if (!Error) + Error = ret; + return Error; + } + + size_t Pagesize; + int Eof; + int Error; + int Memo; + ui32 Recordsig; + +private: + TFileManip FileManip; +}; + +using TOutputPageFile = TOutputPageFileImpl<TOutputFileManip>; + +template <class TVal, + typename TBaseRecIter = TOutputRecordIterator<TVal, TOutputPageIterator<TOutputPageFile>>> +class TOutDatFileImpl: public TBaseRecIter { +public: + typedef TBaseRecIter TRecIter; + typedef typename TRecIter::TPageIter TPageIter; + typedef typename TRecIter::TPageIter::TWriter TWriter; + + int Open(const char* fname, size_t pagesize, size_t pages = 1, int pagesOrBytes = 1, bool direct = false) { + int ret = TWriter::Init(fname, pagesize, TVal::RecordSig, direct); + return ret ? ret : Open2(pages, pagesOrBytes); + } + + int Open(const TFile& file, size_t pagesize, size_t pages = 1, int pagesOrBytes = 1) { + int ret = TWriter::Init(file, pagesize, TVal::RecordSig); + return ret ? ret : Open2(pages, pagesOrBytes); + } + + int Open(TAutoPtr<IOutputStream> output, size_t pagesize, size_t pages = 1, int pagesOrBytes = 1) { + int ret = TWriter::Init(output, pagesize, TVal::RecordSig); + return ret ? ret : Open2(pages, pagesOrBytes); + } + + int Close() { + int ret1 = TRecIter::Term(); + int ret2 = TPageIter::Term(); + int ret3 = TWriter::Term(); + return ret1 ? ret1 : ret2 ? ret2 : ret3; + } + +private: + int Open2(size_t pages, int pagesOrBytes) { + int ret = TPageIter::Init(pages, pagesOrBytes); + if (!ret) + ret = TRecIter::Init(); + if (ret) + Close(); + return ret; + } +}; + +template <class TVal> +class TOutIndexFile: public TOutDatFileImpl< + TVal, + TOutputRecordIterator<TVal, TOutputPageIterator<TOutputPageFile>, TCallbackIndexer, TFakeCompression>> { + typedef TOutDatFileImpl< + TVal, + TOutputRecordIterator<TVal, TOutputPageIterator<TOutputPageFile>, TCallbackIndexer, TFakeCompression>> + TDatFile; + typedef TOutIndexFile<TVal> TMyType; + typedef typename TDatFile::TRecIter TRecIter; + typedef typename TRecIter::TPageIter TPageIter; + typedef typename TRecIter::TIndexer TIndexer; + +public: + TOutIndexFile() { + TIndexer::SetCallback(this, DispatchCallback); + } + + int Open(const char* fname, size_t pagesize, size_t pages, int pagesOrBytes = 1) { + int ret = TDatFile::Open(fname, pagesize, pages, pagesOrBytes); + if (ret) + return ret; + if ((ret = TRecIter::GotoPage(1))) { + TDatFile::Close(); + return ret; + } + Index0.Clear(); + return ret; + } + + int Close() { + TPageIter::Unfreeze(); + if (TRecIter::RecNum) { + TRecIter::Flush(); + NextPage(TPageIter::Current()); + } + int ret = 0; + if (Index0.Size() && !(ret = TRecIter::GotoPage(0))) { + const char* ptr = Index0.Begin(); + size_t recSize; + while (ptr < Index0.End()) { + Y_ASSERT((size_t)(Index0.End() - ptr) >= sizeof(size_t)); + memcpy(&recSize, ptr, sizeof(size_t)); + ptr += sizeof(size_t); + Y_ASSERT((size_t)(Index0.End() - ptr) >= recSize); + ui8* buf = (ui8*)TRecIter::Reserve(recSize); + if (!buf) { + ret = MBDB_PAGE_OVERFLOW; + break; + } + memcpy(buf, ptr, recSize); + TRecIter::ResetDat(); + ptr += recSize; + } + Index0.Clear(); + ret = (TPageIter::GetPageNum() != 0) ? 
MBDB_PAGE_OVERFLOW : TPageIter::GetError(); + } + int ret1 = TDatFile::Close(); + return ret ? ret : ret1; + } + +protected: + TBuffer Index0; + + void NextPage(const TDatPage* page) { + const TVal* first = (const TVal*)NMicroBDB::GetFirstRecord(page); + size_t sz; + if (!TExtInfoType<TVal>::Exists) { + sz = SizeOf(first); + } else { + size_t ll; + size_t l; + sz = NMicroBDB::SizeOfExt(first, &ll, &l); + sz += ll + l; + } + Index0.Append((const char*)&sz, sizeof(size_t)); + Index0.Append((const char*)first, sz); + } + + static void DispatchCallback(void* This, const TDatPage* page) { + ((TMyType*)This)->NextPage(page); + } +}; + +template <class TVal, class TKey, typename TCompressor = TFakeCompression, class TPageFile = TOutputPageFile> +class TOutDirectFileImpl: public TOutDatFileImpl< + TVal, + TOutputRecordIterator<TVal, TOutputPageIterator<TPageFile>, TCallbackIndexer, TCompressor>> { + typedef TOutDatFileImpl< + TVal, + TOutputRecordIterator<TVal, TOutputPageIterator<TPageFile>, TCallbackIndexer, TCompressor>> + TDatFile; + typedef TOutDirectFileImpl<TVal, TKey, TCompressor, TPageFile> TMyType; + typedef typename TDatFile::TRecIter TRecIter; + typedef typename TRecIter::TPageIter TPageIter; + typedef typename TRecIter::TIndexer TIndexer; + typedef TOutIndexFile<TKey> TKeyFile; + +public: + TOutDirectFileImpl() { + TIndexer::SetCallback(this, DispatchCallback); + } + + int Open(const char* fname, size_t pagesize, int pages = 1, size_t ipagesize = 0, size_t ipages = 1, int pagesOrBytes = 1) { + char iname[FILENAME_MAX]; + int ret; + if (ipagesize == 0) + ipagesize = pagesize; + ret = TDatFile::Open(fname, pagesize, pages, pagesOrBytes); + ret = ret ? ret : DatNameToIdx(iname, fname); + ret = ret ? ret : KeyFile.Open(iname, ipagesize, ipages, pagesOrBytes); + if (ret) + TDatFile::Close(); + return ret; + } + + int Close() { + if (TRecIter::RecNum) { + TRecIter::Flush(); + NextPage(TPageIter::Current()); + } + int ret = KeyFile.Close(); + int ret1 = TDatFile::Close(); + return ret1 ? ret1 : ret; + } + + int GetError() const { + return TDatFile::GetError() ? 
TDatFile::GetError() : KeyFile.GetError(); + } + +protected: + TKeyFile KeyFile; + + void NextPage(const TDatPage* page) { + typedef TMakeExtKey<TVal, TKey> TMakeExtKey; + + TVal* val = (TVal*)NMicroBDB::GetFirstRecord(page); + TKey key; + if (!TMakeExtKey::Exists) { + TMakeExtKey::Make(&key, nullptr, val, nullptr); + KeyFile.Push(&key); + } else { + size_t ll; + size_t l; + size_t sz = NMicroBDB::SizeOfExt(val, &ll, &l); + typename TExtInfoType<TVal>::TResult valExt; + if (TExtInfoType<TVal>::Exists) + Y_PROTOBUF_SUPPRESS_NODISCARD valExt.ParseFromArray((ui8*)val + sz + ll, l); + typename TExtInfoType<TKey>::TResult keyExt; + TMakeExtKey::Make(&key, &keyExt, val, &valExt); + KeyFile.Push(&key, &keyExt); + } + } + + static void DispatchCallback(void* This, const TDatPage* page) { + ((TMyType*)This)->NextPage(page); + } +}; diff --git a/library/cpp/microbdb/powersorter.h b/library/cpp/microbdb/powersorter.h new file mode 100644 index 0000000000..c40de9c23f --- /dev/null +++ b/library/cpp/microbdb/powersorter.h @@ -0,0 +1,667 @@ +#pragma once + +#include "safeopen.h" + +#include <util/generic/vector.h> +#include <util/generic/deque.h> +#include <util/system/mutex.h> +#include <util/system/condvar.h> +#include <util/thread/pool.h> + +template < + class TRecord, + template <typename T> class TCompare, + class TSieve, + class TMemoFile = TOutDatFile<TRecord>> +class TDatSorterBuf { +public: + typedef TRecord TRec; + typedef TVector<TRec*> TVectorType; + typedef TMemoFile TMemo; + typedef TCompare<TRecord> TComp; + +public: + TDatSorterBuf(size_t memory, size_t pageSize) + : Memo("memo", pageSize, memory, 0) + , Cur() + { + Memo.Open(nullptr); + Memo.Freeze(); + } + + ~TDatSorterBuf() { + Vector.clear(); + Memo.Close(); + } + + const TRec* Push(const TRec* v) { + const TRec* u = Memo.Push(v); + if (u) + Vector.push_back((TRec*)u); + return u; + } + + const TRec* Next() { + if (Ptr == Vector.end()) { + if (Cur) + TSieve::Sieve(Cur, Cur); + Cur = nullptr; + } else { + Cur = *Ptr++; + if (!TIsSieveFake<TSieve>::Result) + while (Ptr != Vector.end() && TSieve::Sieve(Cur, *Ptr)) + ++Ptr; + } + return Cur; + } + + const TRec* Current() { + return Cur; + } + + size_t Size() { + return Vector.size(); + } + + void Sort() { + Ptr = Vector.begin(); + Cur = nullptr; + + MBDB_SORT_FUN(Vector.begin(), Vector.end(), TComp()); + } + + void Clear() { + Vector.clear(); + Memo.Freeze(); + Ptr = Vector.begin(); + Cur = nullptr; + } + +private: + TVectorType Vector; + TMemo Memo; + + typename TVectorType::iterator + Ptr; + TRec* Cur; +}; + +template < + class TRecord, + class TInput, + template <typename T> class TCompare, + class TSieve> +class TDatMerger { +public: + typedef TRecord TRec; + typedef TCompare<TRecord> TComp; + typedef TSimpleSharedPtr<TInput> TInputPtr; + typedef TVector<TInputPtr> TInputVector; + +public: + ~TDatMerger() { + Close(); + } + + void Init(const TInputVector& inputs) { + Inputs = inputs; + TVector<TInput*> v; + for (int i = 0; i < Inputs.ysize(); ++i) + v.push_back(Inputs[i].Get()); + HeapIter.Init(&v[0], v.size()); + if (!TIsSieveFake<TSieve>::Result) + PNext = HeapIter.Next(); + } + + const TRec* Next() { + if (TIsSieveFake<TSieve>::Result) { + return HeapIter.Next(); + } + + if (!PNext) { + if (PCur) { + TSieve::Sieve(PCur, PCur); + PCur = nullptr; + } + return nullptr; + } + + PCur = &Cur; + memcpy(PCur, PNext, SizeOf((const TRec*)PNext)); + + do { + PNext = HeapIter.Next(); + } while (PNext && TSieve::Sieve(PCur, PNext)); + + return PCur; + } + + const TRec* Current() { + return 
(TIsSieveFake<TSieve>::Result ? HeapIter.Current() : PCur); + } + + void Close() { + Inputs.clear(); + HeapIter.Term(); + } + +private: + TInputVector Inputs; + THeapIter<TRec, TInput, TComp> HeapIter; + TRec Cur; + TRec* PCur = nullptr; + const TRec* PNext = nullptr; +}; + +class TPortionManager { +public: + void Open(const char* tempDir) { + TGuard<TMutex> guard(Mutex); + TempDir = tempDir; + } + + TString Next() { + TGuard<TMutex> guard(Mutex); + if (Portions == 0) + DoOpen(); + TString fname = GeneratePortionFilename(Portions++); + return fname; + } + + void Close() { + TGuard<TMutex> guard(Mutex); + Portions = 0; + } + +private: + void DoOpen() { + if (MakeSorterTempl(PortionFilenameTempl, TempDir.data())) { + PortionFilenameTempl[0] = 0; + ythrow yexception() << "portion-manager: bad tempdir \"" << TempDir.data() << "\": " << LastSystemErrorText(); + } + } + + TString GeneratePortionFilename(int i) { + char str[FILENAME_MAX]; + snprintf(str, sizeof(str), PortionFilenameTempl, i); + return TString(str); + } + +private: + TMutex Mutex; + + TString TempDir; + char PortionFilenameTempl[FILENAME_MAX] = {}; + int Portions = 0; +}; + +// A merger powered by threads +template < + class TRecord, + template <typename T> class TCompare, + class TSieve, + class TInput = TInDatFile<TRecord>, + class TOutput = TOutDatFile<TRecord>> +class TPowerMerger { +public: + typedef TRecord TRec; + typedef TDatMerger<TRecord, TInput, TCompare, TSieve> TMerger; + typedef TSimpleSharedPtr<TMerger> TMergerPtr; + typedef TPowerMerger<TRecord, TCompare, TSieve, TInput, TOutput> TFileMerger; + + struct TMergePortionTask: public IObjectInQueue { + TFileMerger* FileMerger; + int Begin; + int End; + TString OutFname; + + TMergePortionTask(TFileMerger* fileMerger, int begin, int end, const TString& outFname) + : FileMerger(fileMerger) + , Begin(begin) + , End(end) + , OutFname(outFname) + { + } + + void Process(void*) override { + THolder<TMergePortionTask> This(this); + //fprintf(stderr, "MergePortion: (%i, %i, %s)\n", Begin, End, ~OutFname); + FileMerger->MergePortion(Begin, End, OutFname); + } + }; + +public: + TPowerMerger(const TSimpleSharedPtr<TThreadPool>& mtpQueue, const TSimpleSharedPtr<TPortionManager>& portMan, + int memory, int pageSize, bool autoUnlink) + : MtpQueue(mtpQueue) + , PortionManager(portMan) + , Memory(memory) + , PageSize(pageSize) + , AutoUnlink(autoUnlink) + { + } + + TPowerMerger(const TSimpleSharedPtr<TThreadPool>& mtpQueue, const char* tempDir, + int memory, int pageSize, bool autoUnlink) + : MtpQueue(mtpQueue) + , PortionManager(new TPortionManager) + , Memory(memory) + , PageSize(pageSize) + , AutoUnlink(autoUnlink) + { + PortionManager->Open(tempDir); + } + + ~TPowerMerger() { + Close(); + } + + void SetMtpQueue(const TSimpleSharedPtr<TThreadPool>& mtpQueue) { + MtpQueue = mtpQueue; + } + + void MergePortion(int begin, int end, const TString& outFname) { + TMerger merger; + InitMerger(merger, begin, end); + + TOutput out("mergeportion-tmpout", PageSize, BufSize, 0); + out.Open(outFname.data()); + const TRec* rec; + while ((rec = merger.Next())) + out.Push(rec); + out.Close(); + + merger.Close(); + + { + TGuard<TMutex> guard(Mutex); + UnlinkFiles(begin, end); + Files.push_back(outFname); + --Tasks; + TaskFinishedCond.Signal(); + } + } + + void Add(const TString& fname) { + TGuard<TMutex> guard(Mutex); + // fprintf(stderr, "TPowerMerger::Add: %s\n", ~fname); + Files.push_back(fname); + if (InitialFilesEnd > 0) + ythrow yexception() << "TPowerMerger::Add: no more files allowed"; + } + + 
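+ // Merge everything added so far: Optimize() picks the fan-in, MergeMT() schedules TMergePortionTask jobs until at most MPortions files remain, then the in-memory Merger is initialized over the surviving files.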
void Merge(int maxPortions) { + TGuard<TMutex> guard(Mutex); + InitialFilesEnd = Files.ysize(); + if (!InitialFilesEnd) + ythrow yexception() << "TPowerMerger::Merge: no files added"; + Optimize(maxPortions); + MergeMT(); + InitMerger(Merger, CPortions, Files.ysize()); + } + + void Close() { + TGuard<TMutex> guard(Mutex); + Merger.Close(); + UnlinkFiles(CPortions, Files.ysize()); + InitialFilesEnd = CPortions = 0; + Files.clear(); + } + + const TRec* Next() { + return Merger.Next(); + } + + const TRec* Current() { + return Merger.Current(); + } + + int FileCount() const { + TGuard<TMutex> guard(Mutex); + return Files.ysize(); + } + +private: + void InitMerger(TMerger& merger, int begin, int end) { + TGuard<TMutex> guard(Mutex); + TVector<TSimpleSharedPtr<TInput>> inputs; + for (int i = begin; i < end; ++i) { + inputs.push_back(new TInput("mergeportion-tmpin", BufSize, 0)); + inputs.back()->Open(Files[i]); + // fprintf(stderr, "InitMerger: %i, %s\n", i, ~Files[i]); + } + merger.Init(inputs); + } + + void UnlinkFiles(int begin, int end) { + TGuard<TMutex> guard(Mutex); + for (int i = begin; i < end; ++i) { + if (i >= InitialFilesEnd || AutoUnlink) + unlink(Files[i].c_str()); + } + } + + void Optimize(int maxPortions, size_t maxBufSize = 4u << 20) { + TGuard<TMutex> guard(Mutex); + maxPortions = std::min(maxPortions, Memory / PageSize - 1); + maxBufSize = std::max((size_t)PageSize, maxBufSize); + + if (maxPortions <= 2) { + FPortions = MPortions = 2; + BufSize = PageSize; + return; + } + + int Portions = Files.ysize(); + if (maxPortions >= Portions) { + FPortions = MPortions = Portions; + } else if (((Portions + maxPortions - 1) / maxPortions) <= maxPortions) { + while (((Portions + maxPortions - 1) / maxPortions) <= maxPortions) + --maxPortions; + MPortions = ++maxPortions; + int total = ((Portions + MPortions - 1) / MPortions) + Portions; + FPortions = (total % MPortions) ? (total % MPortions) : (int)MPortions; + } else + FPortions = MPortions = maxPortions; + + BufSize = std::min((size_t)(Memory / (MPortions + 1)), maxBufSize); + // fprintf(stderr, "Optimize: Portions=%i; MPortions=%i; FPortions=%i; Memory=%i; BufSize=%i\n", + // (int)Portions, (int)MPortions, (int)FPortions, (int)Memory, (int)BufSize); + } + + void MergeMT() { + TGuard<TMutex> guard(Mutex); + do { + int n; + while ((n = Files.ysize() - CPortions) > MPortions) { + int m = std::min((CPortions == 0 ? 
(int)FPortions : (int)MPortions), n); + TString fname = PortionManager->Next(); + if (!MtpQueue->Add(new TMergePortionTask(this, CPortions, CPortions + m, fname))) + ythrow yexception() << "TPowerMerger::MergeMT: failed to add task"; + CPortions += m; + ++Tasks; + } + if (Tasks > 0) + TaskFinishedCond.Wait(Mutex); + } while (Tasks > 0); + } + +private: + TMutex Mutex; + TCondVar TaskFinishedCond; + + TMerger Merger; + TSimpleSharedPtr<TThreadPool> MtpQueue; + TSimpleSharedPtr<TPortionManager> PortionManager; + TVector<TString> Files; + int Tasks = 0; + int InitialFilesEnd = 0; + int CPortions = 0; + int MPortions = 0; + int FPortions = 0; + int Memory = 0; + int PageSize = 0; + int BufSize = 0; + bool AutoUnlink = false; +}; + +// A sorter powered by threads +template < + class TRecord, + template <typename T> class TCompare, + class TSieve = TFakeSieve<TRecord>, + class TTmpInput = TInDatFile<TRecord>, + class TTmpOutput = TOutDatFile<TRecord>> +class TPowerSorter { +public: + typedef TPowerSorter<TRecord, TCompare, TSieve, TTmpInput, TTmpOutput> TSorter; + typedef TRecord TRec; + typedef TTmpOutput TTmpOut; + typedef TTmpInput TTmpIn; + typedef TDatSorterBuf<TRecord, TCompare, TSieve> TSorterBuf; + typedef TCompare<TRecord> TComp; + typedef TPowerMerger<TRecord, TCompare, TSieve, TTmpInput, TTmpOutput> TFileMerger; + + struct TSortPortionTask: public IObjectInQueue { + TSorter* Sorter; + TSorterBuf* SorterBuf; + int Portion; + + TSortPortionTask(TSorter* sorter, TSorterBuf* sorterBuf, int portion) + : Sorter(sorter) + , SorterBuf(sorterBuf) + , Portion(portion) + { + } + + void Process(void*) override { + TAutoPtr<TSortPortionTask> This(this); + // fprintf(stderr, "SortPortion: %i\n", Portion); + Sorter->SortPortion(SorterBuf); + } + }; + + class TSorterBufQueue { + private: + TMutex Mutex; + TCondVar Cond; + TVector<TSimpleSharedPtr<TSorterBuf>> V; + TDeque<TSorterBuf*> Q; + + int Memory, PageSize, MaxSorterBufs; + + public: + TSorterBufQueue(int memory, int pageSize, int maxSorterBufs) + : Memory(memory) + , PageSize(pageSize) + , MaxSorterBufs(maxSorterBufs) + { + } + + void Push(TSorterBuf* sb) { + TGuard<TMutex> guard(Mutex); + sb->Clear(); + Q.push_back(sb); + Cond.Signal(); + } + + TSorterBuf* Pop() { + TGuard<TMutex> guard(Mutex); + if (!Q.size() && V.ysize() < MaxSorterBufs) { + V.push_back(new TSorterBuf(Memory / MaxSorterBufs, PageSize)); + return V.back().Get(); + } else { + while (!Q.size()) + Cond.Wait(Mutex); + TSorterBuf* t = Q.front(); + Q.pop_front(); + return t; + } + } + + void Clear() { + TGuard<TMutex> guard(Mutex); + Q.clear(); + V.clear(); + } + + void WaitAll() { + TGuard<TMutex> guard(Mutex); + while (Q.size() < V.size()) { + Cond.Wait(Mutex); + } + } + + int GetMaxSorterBufs() const { + return MaxSorterBufs; + } + }; + +public: + TPowerSorter(const TSimpleSharedPtr<TThreadPool>& mtpQueue, size_t maxSorterBufs, + const char* name, size_t memory, size_t pageSize, size_t bufSize) + : MaxSorterBufs(maxSorterBufs) + , Name(name) + , Memory(memory) + , PageSize(pageSize) + , BufSize(bufSize) + , MtpQueue(mtpQueue) + , PortionManager(new TPortionManager) + , SBQueue(Memory, PageSize, MaxSorterBufs) + , FileMerger(MtpQueue, PortionManager, Memory, PageSize, true) + { + } + + TPowerSorter(size_t maxSorterBufs, + const char* name, size_t memory, size_t pageSize, size_t bufSize) + : MaxSorterBufs(maxSorterBufs) + , Name(name) + , Memory(memory) + , PageSize(pageSize) + , BufSize(bufSize) + , PortionManager(new TPortionManager) + , SBQueue(Memory, PageSize, maxSorterBufs) 
+ , FileMerger(MtpQueue, PortionManager, Memory, PageSize, true) + { + } + + TPowerSorter(const char* name, size_t memory, size_t pageSize, size_t bufSize) + : MaxSorterBufs(5) + , Name(name) + , Memory(memory) + , PageSize(pageSize) + , BufSize(bufSize) + , PortionManager(new TPortionManager) + , SBQueue(Memory, PageSize, MaxSorterBufs) + , FileMerger(MtpQueue, PortionManager, Memory, PageSize, true) + { + } + + ~TPowerSorter() { + Close(); + } + + void Open(const char* tempDir) { + Close(); + CurSB = SBQueue.Pop(); + PortionManager->Open(tempDir); + } + + void Reopen(const char* fname) { + Open(fname); + } + + void Close() { + CurSB = nullptr; + SBQueue.Clear(); + PortionCount = 0; + FileMerger.Close(); + PortionManager->Close(); + } + + const TRec* Push(const TRec* v) { + CheckOpen("Push"); + const TRec* u = CurSB->Push(v); + if (!u) { + NextPortion(); + u = CurSB->Push(v); + } + return u; + } + + void Sort(int maxPortions = 1000) { + CheckOpen("Sort"); + if (!PortionCount) { + CurSB->Sort(); + } else { + NextPortion(); + SBQueue.Push(CurSB); + CurSB = nullptr; + SBQueue.WaitAll(); + SBQueue.Clear(); + FileMerger.Merge(maxPortions); + } + } + + const TRec* Next() { + return PortionCount ? FileMerger.Next() : CurSB->Next(); + } + + const TRec* Current() { + return PortionCount ? FileMerger.Current() : CurSB->Current(); + } + + int GetBufSize() const { + return BufSize; + } + + int GetPageSize() const { + return PageSize; + } + + const char* GetName() const { + return Name.data(); + } + +private: + void CheckOpen(const char* m) { + if (!CurSB) + ythrow yexception() << "TPowerSorter::" << m << ": the sorter is not open"; + } + + void NextPortion() { + if (!CurSB->Size()) + return; + ++PortionCount; + if (MaxSorterBufs <= 1) { + SortPortion(CurSB); + } else { + if (!MtpQueue.Get()) { + MtpQueue.Reset(new TThreadPool); + MtpQueue->Start(MaxSorterBufs - 1); + FileMerger.SetMtpQueue(MtpQueue); + } + if (!MtpQueue->Add(new TSortPortionTask(this, CurSB, PortionCount))) + ythrow yexception() << "TPowerSorter::NextPortion: failed to add task"; + } + CurSB = SBQueue.Pop(); + } + + void SortPortion(TSorterBuf* sorterBuf) { + TString portionFilename = PortionManager->Next(); + try { + sorterBuf->Sort(); + + // fprintf(stderr, "TPowerSorter::SortPortion: -> %s\n", ~portionFilename); + TTmpOut out("powersorter-portion", PageSize, BufSize, 0); + out.Open(portionFilename.data()); + + while (sorterBuf->Next()) + out.Push(sorterBuf->Current()); + + out.Close(); + FileMerger.Add(portionFilename); + SBQueue.Push(sorterBuf); + } catch (const yexception& e) { + unlink(portionFilename.data()); + ythrow yexception() << "SortPortion: " << e.what(); + } + } + +private: + int MaxSorterBufs = 0; + TString Name; + int Memory = 0; + int PageSize = 0; + int BufSize = 0; + + TMutex Mutex; + TSimpleSharedPtr<TThreadPool> MtpQueue; + TSimpleSharedPtr<TPortionManager> PortionManager; + + TSorterBufQueue SBQueue; + TSorterBuf* CurSB = nullptr; + int PortionCount = 0; + + TFileMerger FileMerger; +}; diff --git a/library/cpp/microbdb/reader.h b/library/cpp/microbdb/reader.h new file mode 100644 index 0000000000..694a2f1766 --- /dev/null +++ b/library/cpp/microbdb/reader.h @@ -0,0 +1,354 @@ +#pragma once + +#include "align.h" +#include "header.h" +#include "extinfo.h" + +#include <contrib/libs/zlib/zlib.h> +#include <contrib/libs/fastlz/fastlz.h> +#include <contrib/libs/snappy/snappy.h> + +#include <util/generic/vector.h> +#include <util/memory/tempbuf.h> + +namespace NMicroBDB { + static const size_t DEFAULT_BUFFER_SIZE = (64 
<< 10); + + //! + template <class TVal> + class IBasePageReader { + public: + virtual size_t GetRecSize() const = 0; + virtual size_t GetExtSize() const = 0; + virtual bool GetExtInfo(typename TExtInfoType<TVal>::TResult* extInfo) const = 0; + virtual const ui8* GetExtInfoRaw(size_t* len) const = 0; + virtual const TVal* Next() = 0; + virtual void Reset() = 0; + //! set clearing flag, so temporary buffers will be cleared + //! in next call of Next() + virtual void SetClearFlag() { + } + + virtual ~IBasePageReader() { + } + }; + + template <class TVal, typename TPageIter> + class TRawPageReader: public IBasePageReader<TVal> { + public: + TRawPageReader(TPageIter* const iter) + : PageIter(iter) + { + Reset(); + } + + bool GetExtInfo(typename TExtInfoType<TVal>::TResult* extInfo) const override { + Y_VERIFY(TExtInfoType<TVal>::Exists, "GetExtInfo should only be used with extended records"); + if (!Rec) + return false; + ui8* raw = (ui8*)Rec + RecSize + ExtLenSize; + return extInfo->ParseFromArray(raw, ExtSize); + } + + size_t GetRecSize() const override { + return RecSize + ExtLenSize; + } + + size_t GetExtSize() const override { + return ExtSize; + } + + const ui8* GetExtInfoRaw(size_t* len) const override { + Y_VERIFY(TExtInfoType<TVal>::Exists, "GetExtInfo should only be used with extended records"); + if (!Rec) { + *len = 0; + return nullptr; + } + *len = ExtLenSize + ExtSize; + return (ui8*)Rec + RecSize; + } + + const TVal* Next() override { + if (!Rec) + Rec = (TVal*)((char*)PageIter->Current() + sizeof(TDatPage)); + else + Rec = (TVal*)((char*)Rec + DatCeil(RecSize + ExtLenSize + ExtSize)); + if (!TExtInfoType<TVal>::Exists) + RecSize = SizeOf(Rec); + else + RecSize = SizeOfExt(Rec, &ExtLenSize, &ExtSize); + return Rec; + } + + void Reset() override { + Rec = nullptr; + RecSize = 0; + ExtLenSize = 0; + ExtSize = 0; + } + + private: + const TVal* Rec; + size_t RecSize; + size_t ExtLenSize; + size_t ExtSize; + TPageIter* const PageIter; + }; + + template <class TVal, typename TPageIter> + class TCompressedReader: public IBasePageReader<TVal> { + inline size_t GetFirstRecordSize(const TVal* const in) const { + if (!TExtInfoType<TVal>::Exists) { + return DatCeil(SizeOf(in)); + } else { + size_t ll; + size_t l; + size_t ret = SizeOfExt(in, &ll, &l); + + return DatCeil(ret + ll + l); + } + } + + void DecompressBlock() { + if (PageIter->IsFrozen() && Buffer.Get()) + Blocks.push_back(Buffer.Release()); + + const TCompressedHeader* hdr = (const TCompressedHeader*)(Page); + + Page += sizeof(TCompressedHeader); + + const size_t first = GetFirstRecordSize((const TVal*)Page); + + if (!Buffer.Get() || Buffer->Size() < hdr->Original) + Buffer.Reset(new TTempBuf(Max<size_t>(hdr->Original, DEFAULT_BUFFER_SIZE))); + + memcpy(Buffer->Data(), Page, first); + Page += first; + + if (hdr->Count > 1) { + switch (Algo) { + case MBDB_COMPRESSION_ZLIB: { + uLongf dst = hdr->Original - first; + + int ret = uncompress((Bytef*)Buffer->Data() + first, &dst, Page, hdr->Compressed); + + if (ret != Z_OK) + ythrow yexception() << "error then uncompress " << ret; + } break; + case MBDB_COMPRESSION_FASTLZ: { + int dst = hdr->Original - first; + int ret = yfastlz_decompress(Page, hdr->Compressed, Buffer->Data() + first, dst); + + if (!ret) + ythrow yexception() << "error then uncompress"; + } break; + case MBDB_COMPRESSION_SNAPPY: { + if (!snappy::RawUncompress((const char*)Page, hdr->Compressed, Buffer->Data() + first)) + ythrow yexception() << "error then uncompress"; + } break; + } + } + + Rec = nullptr; + RecNum = 
hdr->Count; + Page += hdr->Compressed; + } + + void ClearBuffer() { + for (size_t i = 0; i < Blocks.size(); ++i) + delete Blocks[i]; + Blocks.clear(); + ClearFlag = false; + } + + public: + TCompressedReader(TPageIter* const iter) + : Rec(nullptr) + , RecSize(0) + , ExtLenSize(0) + , ExtSize(0) + , Page(nullptr) + , PageIter(iter) + , RecNum(0) + , BlockNum(0) + , ClearFlag(false) + { + } + + ~TCompressedReader() override { + ClearBuffer(); + } + + size_t GetRecSize() const override { + return RecSize + ExtLenSize; + } + + size_t GetExtSize() const override { + return ExtSize; + } + + bool GetExtInfo(typename TExtInfoType<TVal>::TResult* extInfo) const override { + Y_VERIFY(TExtInfoType<TVal>::Exists, "GetExtInfo should only be used with extended records"); + if (!Rec) + return false; + ui8* raw = (ui8*)Rec + RecSize + ExtLenSize; + return extInfo->ParseFromArray(raw, ExtSize); + } + + const ui8* GetExtInfoRaw(size_t* len) const override { + Y_VERIFY(TExtInfoType<TVal>::Exists, "GetExtInfo should only be used with extended records"); + if (!Rec) { + *len = 0; + return nullptr; + } + *len = ExtLenSize + ExtSize; + return (ui8*)Rec + RecSize; + } + + const TVal* Next() override { + Y_ASSERT(RecNum >= 0); + + if (ClearFlag) + ClearBuffer(); + + if (!Page) { + if (!PageIter->Current()) + return nullptr; + + Page = (ui8*)PageIter->Current() + sizeof(TDatPage); + + BlockNum = ((TCompressedPage*)Page)->BlockCount - 1; + Algo = (ECompressionAlgorithm)((TCompressedPage*)Page)->Algorithm; + Page += sizeof(TCompressedPage); + + DecompressBlock(); + } + + if (!RecNum) { + if (BlockNum <= 0) + return nullptr; + else { + --BlockNum; + DecompressBlock(); + } + } + + --RecNum; + if (!Rec) + Rec = (const TVal*)Buffer->Data(); + else + Rec = (const TVal*)((char*)Rec + DatCeil(RecSize + ExtLenSize + ExtSize)); + + if (!TExtInfoType<TVal>::Exists) + RecSize = SizeOf(Rec); + else + RecSize = SizeOfExt(Rec, &ExtLenSize, &ExtSize); + + return Rec; + } + + void Reset() override { + Page = nullptr; + BlockNum = 0; + Rec = nullptr; + RecSize = 0; + ExtLenSize = 0; + ExtSize = 0; + RecNum = 0; + } + + void SetClearFlag() override { + ClearFlag = true; + } + + public: + THolder<TTempBuf> Buffer; + TVector<TTempBuf*> Blocks; + const TVal* Rec; + size_t RecSize; + size_t ExtLenSize; + size_t ExtSize; + const ui8* Page; + TPageIter* const PageIter; + int RecNum; //!< count of recs in current block + int BlockNum; + ECompressionAlgorithm Algo; + bool ClearFlag; + }; + + class TZLibCompressionImpl { + public: + static const ECompressionAlgorithm Code = MBDB_COMPRESSION_ZLIB; + + inline void Init() { + // - + } + + inline void Term() { + // - + } + + inline size_t CompressBound(size_t size) const noexcept { + return ::compressBound(size); + } + + inline void Compress(void* out, size_t& outSize, const void* in, size_t inSize) { + uLongf size = outSize; + + if (compress((Bytef*)out, &size, (const Bytef*)in, inSize) != Z_OK) + ythrow yexception() << "not compressed"; + outSize = size; + } + }; + + class TFastlzCompressionImpl { + public: + static const ECompressionAlgorithm Code = MBDB_COMPRESSION_FASTLZ; + + inline void Init() { + // - + } + + inline void Term() { + // - + } + + inline size_t CompressBound(size_t size) const noexcept { + size_t rval = size_t(size * 1.07); + return rval < 66 ? 
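+            // FastLZ's documented minimum output buffer is 66 bytes, hence the floor below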
66 : rval; + } + + inline void Compress(void* out, size_t& outSize, const void* in, size_t inSize) { + outSize = yfastlz_compress_level(2, in, inSize, out); + if (!outSize) + ythrow yexception() << "not compressed"; + } + }; + + class TSnappyCompressionImpl { + public: + static const ECompressionAlgorithm Code = MBDB_COMPRESSION_SNAPPY; + + inline void Init() { + // - + } + + inline void Term() { + // - + } + + inline size_t CompressBound(size_t size) const noexcept { + return snappy::MaxCompressedLength(size); + } + + inline void Compress(void* out, size_t& outSize, const void* in, size_t inSize) { + snappy::RawCompress((const char*)in, inSize, (char*)out, &outSize); + } + }; + +} + +using TFakeCompression = void; +using TZLibCompression = NMicroBDB::TZLibCompressionImpl; +using TFastlzCompression = NMicroBDB::TFastlzCompressionImpl; +using TSnappyCompression = NMicroBDB::TSnappyCompressionImpl; diff --git a/library/cpp/microbdb/safeopen.h b/library/cpp/microbdb/safeopen.h new file mode 100644 index 0000000000..c328ffd575 --- /dev/null +++ b/library/cpp/microbdb/safeopen.h @@ -0,0 +1,792 @@ +#pragma once + +// util +#include <util/generic/yexception.h> +#include <util/generic/vector.h> +#include <util/string/util.h> +#include <util/system/mutex.h> +#include <thread> + +#include "microbdb.h" + +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4706) /*assignment within conditional expression*/ +#pragma warning(disable : 4267) /*conversion from 'size_t' to 'type', possible loss of data*/ +#endif + +template <typename TVal, typename TPageFile = TInputPageFile, typename TIterator = TInputPageIterator<TPageFile>> +class TInDatFile: protected TInDatFileImpl<TVal, TInputRecordIterator<TVal, TIterator>> { +public: + typedef TVal TRec; + typedef TInDatFileImpl<TVal, TInputRecordIterator<TVal, TIterator>> TBase; + + TInDatFile(const TString& name, size_t pages, int pagesOrBytes = 1) + : Name(name) + , Pages(pages) + , PagesOrBytes(pagesOrBytes) + { + } + + ~TInDatFile() { + Close(); + } + + void Open(const TString& fname, bool direct = false) { + ui32 gotRecordSig = 0; + int ret = TBase::Open(fname.data(), Pages, PagesOrBytes, &gotRecordSig, direct); + if (ret) { + // XXX: print record type name, not type sig + ythrow yexception() << ErrorMessage(ret, "Failed to open input file", fname, TVal::RecordSig, gotRecordSig); + } + Name = fname; + } + + void OpenStream(TAutoPtr<IInputStream> input) { + ui32 gotRecordSig = 0; + int ret = TBase::Open(input, Pages, PagesOrBytes, &gotRecordSig); + if (ret) { + // XXX: print record type name, not type sig + ythrow yexception() << ErrorMessage(ret, "Failed to open input file", Name, TVal::RecordSig, gotRecordSig); + } + } + + void Close() { + int ret; + if (IsOpen() && (ret = TBase::GetError())) + if (!std::uncaught_exception()) + ythrow yexception() << ErrorMessage(ret, "Error before closing input file", Name); + if ((ret = TBase::Close())) + if (!std::uncaught_exception()) + ythrow yexception() << ErrorMessage(ret, "Error while closing input file", Name); + } + + const char* GetName() const { + return Name.data(); + } + + using TBase::Current; + using TBase::Freeze; + using TBase::GetError; + using TBase::GetExtInfo; + using TBase::GetExtInfoRaw; + using TBase::GetExtSize; + using TBase::GetLastPage; + using TBase::GetPageNum; + using TBase::GetPageSize; + using TBase::GetRecSize; + using TBase::GotoLastPage; + using TBase::GotoPage; + using TBase::IsEof; + using TBase::IsOpen; + using TBase::Next; + using TBase::Skip; + using 
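+        // the using-declarations re-export the protected TInDatFileImpl interface
+        // as the public API of this exception-throwing wrapper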
TBase::Unfreeze; + +protected: + TString Name; + size_t Pages; + int PagesOrBytes; +}; + +template <typename TVal> +class TMappedInDatFile: protected TInDatFileImpl<TVal, TInputRecordIterator<TVal, TMappedInputPageIterator<TMappedInputPageFile>>> { +public: + typedef TVal TRec; + typedef TInDatFileImpl<TVal, TInputRecordIterator<TVal, TMappedInputPageIterator<TMappedInputPageFile>>> TBase; + + TMappedInDatFile(const TString& name, size_t /* pages */, int /* pagesOrBytes */) + : Name(name) + { + } + + ~TMappedInDatFile() { + Close(); + } + + void Open(const TString& fname) { + int ret = TBase::Open(fname.data()); + if (ret) + ythrow yexception() << ErrorMessage(ret, "Failed to open mapped file", fname, TVal::RecordSig); + Name = fname; + } + + void Close() { + int ret; + if (IsOpen() && (ret = TBase::GetError())) + if (!std::uncaught_exception()) + ythrow yexception() << ErrorMessage(ret, "Error before closing mapped file", Name); + if ((ret = TBase::Close())) + if (!std::uncaught_exception()) + ythrow yexception() << ErrorMessage(ret, "Error while closing mapped file", Name); + } + + const char* GetName() const { + return Name.data(); + } + + using TBase::Current; + using TBase::GetError; + using TBase::GetExtInfo; + using TBase::GetExtInfoRaw; + using TBase::GetLastPage; + using TBase::GetPageNum; + using TBase::GetPageSize; + using TBase::GotoLastPage; + using TBase::GotoPage; + using TBase::IsEof; + using TBase::IsOpen; + using TBase::Next; + using TBase::Skip; + +protected: + TString Name; +}; + +template <typename TVal, typename TCompressor = TFakeCompression, typename TPageFile = TOutputPageFile> +class TOutDatFile: protected TOutDatFileImpl<TVal, TOutputRecordIterator<TVal, TOutputPageIterator<TPageFile>, TFakeIndexer, TCompressor>> { +public: + typedef TOutDatFileImpl<TVal, TOutputRecordIterator<TVal, TOutputPageIterator<TPageFile>, TFakeIndexer, TCompressor>> TBase; + + TOutDatFile(const TString& name, size_t pagesize, size_t pages, int pagesOrBytes = 1) + : Name(name) + , PageSize(pagesize) + , Pages(pages) + , PagesOrBytes(pagesOrBytes) + { + } + + ~TOutDatFile() { + Close(); + } + + void Open(const char* fname, bool direct = false) { + int ret = TBase::Open(fname, PageSize, Pages, PagesOrBytes, direct); + if (ret) + ythrow yexception() << ErrorMessage(ret, "Failed to open output file", fname); + Name = fname; + } + + void Open(const TString& fname) { + Open(fname.data()); + } + + void OpenStream(TAutoPtr<IOutputStream> output) { + int ret = TBase::Open(output, PageSize, Pages, PagesOrBytes); + if (ret) + ythrow yexception() << ErrorMessage(ret, "Failed to open output stream", Name); + } + + void Close() { + int ret; + if ((ret = TBase::GetError())) + if (!std::uncaught_exception()) + ythrow yexception() << ErrorMessage(ret, "Error before closing output file", Name); + if ((ret = TBase::Close())) + if (!std::uncaught_exception()) + ythrow yexception() << ErrorMessage(ret, "Error while closing output file", Name); + } + + const char* GetName() const { + return Name.data(); + } + + using TBase::Freeze; + using TBase::GetError; + using TBase::GetPageSize; + using TBase::IsEof; + using TBase::IsOpen; + using TBase::Offset; + using TBase::Push; + using TBase::PushWithExtInfo; + using TBase::Reserve; + using TBase::Unfreeze; + +protected: + TString Name; + size_t PageSize, Pages; + int PagesOrBytes; +}; + +template <typename TVal, typename TCompressor, typename TPageFile> +class TOutDatFileArray; + +template <typename TVal, typename TCompressor = TFakeCompression, typename TPageFile = 
TOutputPageFile> +class TOutDatFileArray { + typedef TOutDatFile<TVal, TCompressor, TPageFile> TFileType; + +public: + TOutDatFileArray(const TString& name, size_t pagesize, size_t pages, int pagesOrBytes = 1) + : Name(name) + , PageSize(pagesize) + , Pages(pages) + , PagesOrBytes(pagesOrBytes) + , NumFiles(0) + , Files(nullptr) + { + } + + ~TOutDatFileArray() { + for (int i = 0; i < NumFiles; ++i) { + Files[i].Close(); + Files[i].~TFileType(); + } + free(Files); + Files = nullptr; + NumFiles = 0; + } + + TFileType& operator[](size_t pos) { + return Files[pos]; + } + + void Open(int n, const TString& fname) { + char temp[FILENAME_MAX]; + + Name = fname; + NumFiles = CreateDatObjects(n, fname); + + int i; + try { + for (i = 0; i < NumFiles; ++i) { + sprintf(temp, fname.data(), i); + Files[i].Open(temp); + } + } catch (...) { + while (--i >= 0) + Files[i].Close(); + throw; + } + } + + template <typename TNameBuilder> + void OpenWithCallback(int n, const TNameBuilder& builder) { + NumFiles = CreateDatObjects(n, Name); + + for (int i = 0; i < NumFiles; ++i) + Files[i].Open(builder.GetName(i).data()); + } + + void Close() { + for (int i = 0; i < NumFiles; ++i) + Files[i].Close(); + } + + void CloseMT(ui32 threads) { + int current = 0; + TMutex mutex; + TVector<std::thread> thrs; + thrs.reserve(threads); + for (ui32 i = 0; i < threads; i++) { + thrs.emplace_back([this, ¤t, &mutex]() { + while (true) { + mutex.Acquire(); + int cur = current++; + mutex.Release(); + if (cur >= NumFiles) + break; + Files[cur].Close(); + } + }); + } + for (auto& thread : thrs) { + thread.join(); + } + } + + const char* GetName() const { + return Name.data(); + } + +protected: + int CreateDatObjects(int n, const TString& fname) { + if (!(Files = (TFileType*)malloc(n * sizeof(TFileType)))) + ythrow yexception() << "can't alloc \"" << fname << "\" file array: " << LastSystemErrorText(); + int num = 0; + char temp[FILENAME_MAX]; + for (int i = 0; i < n; ++i, ++num) { + sprintf(temp, "%s[%d]", fname.data(), i); + new (Files + i) TFileType(temp, PageSize, Pages, PagesOrBytes); + } + return num; + } + + TString Name; + size_t PageSize, Pages; + int PagesOrBytes, NumFiles; + TFileType* Files; +}; + +template <typename TVal, typename TKey, typename TCompressor = TFakeCompression, typename TPageFile = TOutputPageFile> +class TOutDirectFile: protected TOutDirectFileImpl<TVal, TKey, TCompressor, TPageFile> { + typedef TOutDirectFileImpl<TVal, TKey, TCompressor, TPageFile> TBase; + +public: + TOutDirectFile(const TString& name, size_t pagesize, size_t pages, size_t ipagesize, size_t ipages, int pagesOrBytes) + : Name(name) + , PageSize(pagesize) + , Pages(pages) + , IdxPageSize(ipagesize) + , IdxPages(ipages) + , PagesOrBytes(pagesOrBytes) + { + } + + ~TOutDirectFile() { + Close(); + } + + void Open(const TString& fname) { + int ret = TBase::Open(fname.data(), PageSize, Pages, IdxPageSize, IdxPages, PagesOrBytes); + if (ret) + ythrow yexception() << ErrorMessage(ret, "Failed to open output file", fname); + Name = fname; + } + + void Close() { + int ret; + if ((ret = TBase::GetError())) + if (!std::uncaught_exception()) + ythrow yexception() << ErrorMessage(ret, "Error before closing output file", Name); + if ((ret = TBase::Close())) + if (!std::uncaught_exception()) + ythrow yexception() << ErrorMessage(ret, "Error while closing output file", Name); + } + + const char* GetName() const { + return Name.data(); + } + + using TBase::Freeze; + using TBase::Push; + using TBase::PushWithExtInfo; + using TBase::Reserve; + using 
TBase::Unfreeze; + +protected: + TString Name; + size_t PageSize, Pages, IdxPageSize, IdxPages; + int PagesOrBytes; +}; + +template < + typename TVal, + template <typename T> class TComparer, + typename TCompress = TFakeCompression, + typename TSieve = TFakeSieve<TVal>, + typename TPageFile = TOutputPageFile, + typename TFileTypes = TDefInterFileTypes> +class TDatSorter: protected TDatSorterImpl<TVal, TComparer<TVal>, TCompress, TSieve, TPageFile, TFileTypes> { + typedef TDatSorterImpl<TVal, TComparer<TVal>, TCompress, TSieve, TPageFile, TFileTypes> TBase; + +public: + typedef TVal TRec; + +public: + TDatSorter(const TString& name, size_t memory, size_t pagesize, size_t pages, int pagesOrBytes = 1) + : Name(name) + , Memory(memory) + , PageSize(pagesize) + , Pages(pages) + , PagesOrBytes(pagesOrBytes) + { + Templ[0] = 0; + } + + ~TDatSorter() { + Close(); + Templ[0] = 0; + } + + void Open(const TString& dirName) { + int ret; + if (ret = MakeSorterTempl(Templ, dirName.data())) { + Templ[0] = 0; + ythrow yexception() << ErrorMessage(ret, Name + " sorter: bad tempdir", dirName); + } + if ((ret = TBase::Open(Templ, PageSize, Pages, PagesOrBytes))) + ythrow yexception() << ErrorMessage(ret, Name + " sorter: open error, temp dir", Templ); + } + + void Sort(bool direct = false) { + int ret = TBase::Sort(Memory, 1000, direct); + if (ret) + ythrow yexception() << ErrorMessage(ret, Name + " sorter: sort error, temp dir", Templ, TVal::RecordSig); + } + + void SortToFile(const TString& name) { + int ret = TBase::SortToFile(name.data(), Memory); + if (ret) + ythrow yexception() << ErrorMessage(ret, Name + "sorter: error in SortToFile", name, TVal::RecordSig); + } + + void SortToStream(TAutoPtr<IOutputStream> output) { + int ret = TBase::SortToStream(output, Memory); + if (ret) + ythrow yexception() << ErrorMessage(ret, Name + "sorter: error in SortToStream", "", TVal::RecordSig); + } + + void Close() { + int ret1 = TBase::GetError(); + int ret2 = TBase::Close(); + if (Templ[0]) { + *strrchr(Templ, GetDirectorySeparator()) = 0; + RemoveDirWithContents(Templ); + Templ[0] = 0; + } + if (ret1) + if (!std::uncaught_exception()) + ythrow yexception() << ErrorMessage(ret1, Name + "sorter: error before closing"); + if (ret2) + if (!std::uncaught_exception()) + ythrow yexception() << ErrorMessage(ret2, Name + "sorter: error while closing"); + } + + int Sort(size_t memory, int maxportions, bool direct = false) { + return TBase::Sort(memory, maxportions, direct); + } + + const char* GetName() const { + return Name.data(); + } + + using TBase::GetPageSize; + using TBase::GetPages; + using TBase::Next; + using TBase::NextPortion; + using TBase::Push; + using TBase::PushWithExtInfo; + using TBase::UseSegmentSorter; + +protected: + TString Name; + size_t Memory, PageSize, Pages; + int PagesOrBytes; + char Templ[FILENAME_MAX]; +}; + +template <typename TSorter> +class TSorterArray { +public: + typedef TSorter TDatSorter; + +public: + TSorterArray(const TString& name, size_t memory, size_t pagesize, size_t pages, int pagesOrBytes = 1) + : Name(name) + , Memory(memory) + , PageSize(pagesize) + , Pages(pages) + , PagesOrBytes(pagesOrBytes) + , NumSorters(0) + , Sorters(nullptr) + { + } + + ~TSorterArray() { + for (int i = 0; i < NumSorters; ++i) { + Sorters[i].Close(); + Sorters[i].~TSorter(); + } + free(Sorters); + Sorters = nullptr; + NumSorters = 0; + } + + TSorter& operator[](size_t pos) { + return Sorters[pos]; + } + + void Open(int n, const TString& fname, size_t memory = 0) { + if (!(Sorters = (TSorter*)malloc(n * 
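+            // raw malloc: the sorter objects are constructed below with placement new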
sizeof(TSorter)))) + ythrow yexception() << "can't alloc \"" << fname << "\" sorter array: " << LastSystemErrorText(); + NumSorters = n; + char temp[FILENAME_MAX]; + if (memory) + Memory = memory; + for (int i = 0; i < NumSorters; ++i) { + sprintf(temp, "%s[%d]", Name.data(), i); + new (Sorters + i) TSorter(temp, Memory, PageSize, Pages, PagesOrBytes); + } + for (int i = 0; i < NumSorters; ++i) + Sorters[i].Open(fname); + } + + void Close() { + for (int i = 0; i < NumSorters; ++i) + Sorters[i].Close(); + } + + const char* GetName() const { + return Name.data(); + } + +protected: + TString Name; + size_t Memory, PageSize, Pages; + int PagesOrBytes, NumSorters; + TSorter* Sorters; +}; + +template <typename TVal, template <typename T> class TCompare, typename TSieve = TFakeSieve<TVal>> +class TDatSorterArray: public TSorterArray<TDatSorter<TVal, TCompare, TSieve>> { +public: + TDatSorterArray(const char* name, size_t memory, size_t pagesize, size_t pages, int pagesOrBytes = 1) + : TSorterArray<TDatSorter<TVal, TCompare, TSieve>>(name, memory, pagesize, pages, pagesOrBytes) + { + } +}; + +template <typename TVal, template <typename T> class TCompare, typename TCompress = TFakeCompression, + typename TSieve = TFakeSieve<TVal>, typename TPageFile = TOutputPageFile, typename TFileTypes = TDefInterFileTypes> +class TDatSorterMemo: public TDatSorter<TVal, TCompare, TCompress, TSieve, TPageFile, TFileTypes> { + typedef TDatSorter<TVal, TCompare, TCompress, TSieve, TPageFile, TFileTypes> TSorter; + +public: + TOutDatFile<TVal> Memo; + TString Home; + bool OpenReq; + bool Opened; + bool UseDirectWrite; + +public: + TDatSorterMemo(const char* name, size_t memory, size_t pagesize, size_t pages, int pagesOrBytes = 1) + : TSorter(name, memory, pagesize, pages, pagesOrBytes) + , Memo(name, pagesize, memory, 0) + { + OpenReq = false; + Opened = false; + UseDirectWrite = false; + } + + void Open(const TString& home) { + OpenReq = true; + // TSorter::Open(home); + Home = home; + Memo.Open(nullptr); + Memo.Freeze(); + } + + void Reopen(const char* home) { + Close(); + Open(home); + } + + void Open() { + if (!OpenReq) { + OpenReq = true; + Memo.Open(nullptr); + Memo.Freeze(); + } + } + + void OpenIfNeeded() { + if (OpenReq && !Opened) { + if (!Home) + ythrow yexception() << "Temp directory not specified, call Open(char*) first : " << TSorter::Name; + TSorter::Open(Home); + Opened = true; + } + } + + TVal* Reserve(size_t len) { + if (TExtInfoType<TVal>::Exists) + return ReserveWithExt(len, 0); + + TVal* u = Memo.Reserve(len); + if (!u) { + OpenIfNeeded(); + TSorter::NextPortion(UseDirectWrite); + Memo.Freeze(); + u = Memo.Reserve(len); + } + TSorter::PushWithExtInfo(u); + return u; + } + + TVal* ReserveWithExt(size_t len, size_t extSize) { + size_t fullLen = len + len_long((i64)extSize) + extSize; + TVal* u = Memo.Reserve(fullLen); + if (!u) { + OpenIfNeeded(); + TSorter::NextPortion(UseDirectWrite); + Memo.Freeze(); + u = Memo.Reserve(fullLen); + if (!u) { + if (fullLen > Memo.GetPageSize()) { + ythrow yexception() << "Size of element and " << len << " size of extInfo " << extSize + << " is larger than page size " << Memo.GetPageSize(); + } + ythrow yexception() << "going to insert a null pointer. 
Bad."; + } + } + out_long((i64)extSize, (char*)u + len); + TSorter::PushWithExtInfo(u); + return u; + } + + char* GetReservedExt(TVal* rec, size_t len, size_t extSize) { + return (char*)rec + len + len_long((i64)extSize); + } + + const TVal* Push(const TVal* v, const typename TExtInfoType<TVal>::TResult* extInfo = nullptr) { + const TVal* u = Memo.Push(v, extInfo); + if (!u) { + OpenIfNeeded(); + TSorter::NextPortion(UseDirectWrite); + Memo.Freeze(); + u = Memo.Push(v, extInfo); + if (!u) { + if (SizeOf(v) > Memo.GetPageSize()) { + ythrow yexception() << "Size of element " << SizeOf(v) + << " is larger than page size " << Memo.GetPageSize(); + } + ythrow yexception() << "going to insert a null pointer. Bad."; + } + } + TSorter::PushWithExtInfo(u); + return u; + } + + const TVal* Push(const TVal* v, const ui8* extInfoRaw, size_t extLen) { + const TVal* u = Memo.Push(v, extInfoRaw, extLen); + if (!u) { + OpenIfNeeded(); + TSorter::NextPortion(UseDirectWrite); + Memo.Freeze(); + u = Memo.Push(v, extInfoRaw, extLen); + if (!u) { + if (SizeOf(v) > Memo.GetPageSize()) { + ythrow yexception() << "Size of element " << SizeOf(v) + << " is larger than page size " << Memo.GetPageSize(); + } + ythrow yexception() << "going to insert a null pointer. Bad.."; + } + } + TSorter::PushWithExtInfo(u); + return u; + } + + const TVal* PushWithExtInfo(const TVal* v) { + const TVal* u = Memo.PushWithExtInfo(v); + if (!u) { + OpenIfNeeded(); + TSorter::NextPortion(UseDirectWrite); + Memo.Freeze(); + u = Memo.PushWithExtInfo(v); + if (!u) { + if (SizeOf(v) > Memo.GetPageSize()) { + ythrow yexception() << "Size of element " << SizeOf(v) + << " is larger than page size " << Memo.GetPageSize(); + } + ythrow yexception() << "going to insert a null pointer. Bad..."; + } + } + TSorter::PushWithExtInfo(u); + return u; + } + + void Sort(bool direct = false) { + if (Opened) { + TSorter::NextPortion(UseDirectWrite); + Memo.Close(); + OpenReq = false; + TSorter::Sort(direct); + } else { + TSorter::SortPortion(); + } + } + + const TVal* Next() { + return Opened ? TSorter::Next() : TSorter::Nextp(); + } + + bool GetExtInfo(typename TExtInfoType<TVal>::TResult* extInfo) const { + return NMicroBDB::GetExtInfo(Current(), extInfo); + } + + const ui8* GetExtInfoRaw(size_t* len) const { + return NMicroBDB::GetExtInfoRaw(Current(), len); + } + + const TVal* Current() const { + return Opened ? 
TSorter::Current() : TSorter::Currentp(); + } + + int NextPortion() { + OpenIfNeeded(); + return TSorter::NextPortion(UseDirectWrite); + } + + void SortToFile(const char* name) { + OpenIfNeeded(); + TSorter::NextPortion(UseDirectWrite); + Memo.Close(); + OpenReq = false; + TSorter::SortToFile(name); + } + + void SortToStream(TAutoPtr<IOutputStream> output) { + OpenIfNeeded(); + TSorter::NextPortion(UseDirectWrite); + Memo.Close(); + OpenReq = false; + TSorter::SortToStream(output); + } + + template <typename TKey, typename TOutCompress> + void SortToDirectFile(const char* name, size_t ipagesize, size_t ipages) { + Sort(); + TOutDirectFile<TVal, TKey, TOutCompress> out(TSorter::Name, TSorter::PageSize, TSorter::Pages, ipagesize, ipages, TSorter::PagesOrBytes); + out.Open(name); + while (const TVal* rec = Next()) + out.PushWithExtInfo(rec); + out.Close(); + } + + template <typename TKey> + void SortToDirectFile(const char* name, size_t ipagesize, size_t ipages) { + SortToDirectFile<TKey, TCompress>(name, ipagesize, ipages); + } + + void CloseSorter() { + if (Opened) + TSorter::Close(); + else + TSorter::Closep(); + Memo.Freeze(); + Opened = false; + } + + void Close() { + if (Opened) + TSorter::Close(); + else + TSorter::Closep(); + Memo.Close(); + OpenReq = false; + Opened = false; + } + + int SavePortions(const char* mask) { + return TSorter::SavePortions(mask, UseDirectWrite); + } + +public: + using TSorter::RestorePortions; +}; + +template <typename TVal, template <typename T> class TCompare, typename TCompress = TFakeCompression, + typename TSieve = TFakeSieve<TVal>, class TPageFile = TOutputPageFile, class TFileTypes = TDefInterFileTypes> +class TDatSorterMemoArray: public TSorterArray<TDatSorterMemo<TVal, TCompare, TCompress, TSieve, TPageFile, TFileTypes>> { +public: + typedef TSorterArray<TDatSorterMemo<TVal, TCompare, TCompress, TSieve, TPageFile, TFileTypes>> TBase; + + TDatSorterMemoArray(const char* name, size_t memory, size_t pagesize, size_t pages, int pagesOrBytes = 1) + : TBase(name, memory, pagesize, pages, pagesOrBytes) + { + } +}; + +#if defined(_MSC_VER) +#pragma warning(pop) +#endif diff --git a/library/cpp/microbdb/sorter.h b/library/cpp/microbdb/sorter.h new file mode 100644 index 0000000000..b2e7390377 --- /dev/null +++ b/library/cpp/microbdb/sorter.h @@ -0,0 +1,677 @@ +#pragma once + +#include <util/ysaveload.h> +#include <util/generic/algorithm.h> +#include <contrib/libs/libc_compat/include/link/link.h> + +#include "header.h" +#include "heap.h" +#include "extinfo.h" +#include "input.h" +#include "output.h" + +#ifdef TEST_MERGE +#define MBDB_SORT_FUN ::StableSort +#else +#define MBDB_SORT_FUN ::Sort +#endif + +template <class TVal, class TCompare, typename TCompress, typename TSieve, typename TOutPageFile, typename TFileTypes> +class TDatSorterImpl; + +template <class TVal> +struct TFakeSieve { + static inline int Sieve(TVal*, const TVal*) noexcept { + return 0; + } +}; + +template <class TSieve> +struct TIsSieveFake { + static const bool Result = false; +}; + +template <class T> +struct TIsSieveFake<TFakeSieve<T>> { + static const bool Result = true; +}; + +class TDefInterFileTypes { +public: + typedef TOutputPageFile TOutPageFile; + typedef TInputPageFile TInPageFile; +}; + +//class TCompressedInterFileTypes; + +template <class TVal, class TCompare, typename TCompress, typename TSieve, typename TOutPageFile = TOutputPageFile, typename TFileTypes = TDefInterFileTypes> +class TDatSorterImplBase: protected THeapIter<TVal, TInDatFileImpl<TVal, 
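+    // the base is a heap-merge iterator over the temporary portion files
+    // written during the external sort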
TInputRecordIterator<TVal, TInputPageIterator<typename TFileTypes::TInPageFile>>>, TCompare> { + typedef TOutputRecordIterator<TVal, TOutputPageIterator<typename TFileTypes::TOutPageFile>, TFakeIndexer, TCompress> TTmpRecIter; + typedef TInputRecordIterator<TVal, TInputPageIterator<typename TFileTypes::TInPageFile>> TInTmpRecIter; + +public: + typedef TOutDatFileImpl<TVal, TTmpRecIter> TTmpOut; + typedef TInDatFileImpl<TVal, TInTmpRecIter> TTmpIn; + + typedef TOutDatFileImpl<TVal, TOutputRecordIterator<TVal, TOutputPageIterator<TOutPageFile>, TFakeIndexer, TCompress>> TOut; + typedef THeapIter<TVal, TTmpIn, TCompare> TMyHeap; + typedef TVector<const TVal*> TMyVector; + typedef typename TMyVector::iterator TMyIterator; + + class IPortionSorter { + public: + virtual ~IPortionSorter() { + } + + virtual void Sort(TMyVector&, TTmpOut*) = 0; + }; + + class TDefaultSorter: public IPortionSorter { + public: + void Sort(TMyVector& vector, TTmpOut* out) override { + MBDB_SORT_FUN(vector.begin(), vector.end(), TCompare()); + + const typename TMyVector::const_iterator + end = (TIsSieveFake<TSieve>::Result) ? vector.end() : TDatSorterImplBase::SieveRange(vector.begin(), vector.end()); + + for (typename TMyVector::const_iterator it = vector.begin(); it != end; ++it) { + out->PushWithExtInfo(*it); + } + } + }; + + class TSegmentedSorter: public IPortionSorter { + class TAdaptor { + typedef typename TMyVector::const_iterator TConstIterator; + + public: + TAdaptor(TConstIterator b, TConstIterator e) + : Curr_(b) + , End_(e) + { + --Curr_; + } + + inline const TVal* Current() const { + return *Curr_; + } + + inline const TVal* Next() { + ++Curr_; + + if (Curr_ == End_) { + return nullptr; + } + + return *Curr_; + } + + private: + TConstIterator Curr_; + TConstIterator End_; + }; + + typedef THeapIter<TVal, TAdaptor, TCompare> TPortionsHeap; + + public: + void Sort(TMyVector& vector, TTmpOut* out) override { + TVector<TAdaptor> bounds; + typename TMyVector::iterator + it = vector.begin(); + const size_t portions = Max<size_t>(1, (vector.size() * sizeof(TVal)) / (4 << 20)); + const size_t step = vector.size() / portions; + + // Sort segments + while (it != vector.end()) { + const typename TMyVector::iterator + end = Min(it + step, vector.end()); + + MBDB_SORT_FUN(it, end, TCompare()); + + bounds.push_back(TAdaptor(it, end)); + + it = end; + } + + // + // Merge result + // + + TPortionsHeap heap(bounds); + + if (TIsSieveFake<TSieve>::Result) { + while (const TVal* val = heap.Next()) { + out->PushWithExtInfo(val); + } + } else { + const TVal* val = heap.Next(); + const TVal* prev = out->PushWithExtInfo(val); + + for (val = heap.Next(); val && prev; val = heap.Next()) { + if (TSieve::Sieve((TVal*)prev, val)) { + continue; + } + + prev = out->PushWithExtInfo(val); + } + + if (prev) { + TSieve::Sieve((TVal*)prev, prev); + } + } + } + }; + +public: + TDatSorterImplBase() + : Sorter(new TDefaultSorter) + { + InFiles = nullptr; + TempBuf = nullptr; + Ptr = Vector.end(); + Cur = nullptr; + Portions = CPortions = Error = 0; + } + + ~TDatSorterImplBase() { + Close(); + } + + int Open(const char* templ, size_t pagesize, size_t pages, int pagesOrBytes = 1) { + Portions = CPortions = Error = 0; + TempBuf = strdup(templ); + Pagesize = pagesize; + if (pagesOrBytes) + Pages = pages; + else + Pages = pages / pagesize; + Pages = Max(1, Pages); + return 0; + } + + void Push(const TVal* v) { + // Serialized extInfo must follow a record being pushed, therefore, to avoid + // unintentional misusage (as if when you are adding 
TExtInfo in your record + // type: you may forget to check your sorting routines and get a segfault as + // a result). + // PushWithExtInfo(v) should be called on records with extInfo. + static_assert(!TExtInfoType<TVal>::Exists, "expect !TExtInfoType<TVal>::Exists"); + + Vector.push_back(v); + } + + void PushWithExtInfo(const TVal* v) { + Vector.push_back(v); + } + + int SortPortion() { + Ptr = Vector.end(); + Cur = nullptr; + if (!Vector.size() || Error) + return Error; + + MBDB_SORT_FUN(Vector.begin(), Vector.end(), TCompare()); + + if (!TIsSieveFake<TSieve>::Result) { + const typename TMyVector::iterator + end = SieveRange(Vector.begin(), Vector.end()); + + Vector.resize(end - Vector.begin()); + } + + Ptr = Vector.begin(); + Cur = nullptr; + return 0; + } + + const TVal* Nextp() { + Cur = Ptr == Vector.end() ? nullptr : *Ptr++; + return Cur; + } + + const TVal* Currentp() const { + return Cur; + } + + void Closep() { + Vector.clear(); + Ptr = Vector.end(); + Cur = nullptr; + } + + int NextPortion(bool direct = false) { + if (!Vector.size() || Error) + return Error; + + TTmpOut out; + int ret, ret1; + char fname[FILENAME_MAX]; + + snprintf(fname, sizeof(fname), TempBuf, Portions++); + if ((ret = out.Open(fname, Pagesize, Pages, 1, direct))) + return Error = ret; + + Sorter->Sort(Vector, &out); + + Vector.erase(Vector.begin(), Vector.end()); + ret = out.GetError(); + ret1 = out.Close(); + Error = Error ? Error : ret ? ret : ret1; + if (Error) + unlink(fname); + return Error; + } + + int SavePortions(const char* mask, bool direct = false) { + char srcname[PATH_MAX], dstname[PATH_MAX]; + if (Vector.size()) + NextPortion(direct); + for (int i = 0; i < Portions; i++) { + char num[10]; + sprintf(num, "%i", i); + snprintf(srcname, sizeof(srcname), TempBuf, i); + snprintf(dstname, sizeof(dstname), mask, num); + int res = rename(srcname, dstname); + if (res) + return res; + } + snprintf(dstname, sizeof(dstname), mask, "count"); + TOFStream fcount(dstname); + Save(&fcount, Portions); + fcount.Finish(); + return 0; + } + + int RestorePortions(const char* mask) { + char srcname[PATH_MAX], dstname[PATH_MAX]; + snprintf(srcname, sizeof(srcname), mask, "count"); + TIFStream fcount(srcname); + Load(&fcount, Portions); + for (int i = 0; i < Portions; i++) { + char num[10]; + sprintf(num, "%i", i); + snprintf(dstname, sizeof(dstname), TempBuf, i); + snprintf(srcname, sizeof(srcname), mask, num); + unlink(dstname); + int res = link(srcname, dstname); + if (res) + return res; + } + return 0; + } + + int RestorePortions(const char* mask, ui32 count) { + char srcname[PATH_MAX], dstname[PATH_MAX]; + ui32 portions; + TVector<ui32> counts; + for (ui32 j = 0; j < count; j++) { + snprintf(srcname, sizeof(srcname), mask, j, "count"); + TIFStream fcount(srcname); + Load(&fcount, portions); + counts.push_back(portions); + Portions += portions; + } + ui32 p = 0; + for (ui32 j = 0; j < count; j++) { + int cnt = counts[j]; + for (int i = 0; i < cnt; i++, p++) { + char num[10]; + sprintf(num, "%i", i); + snprintf(dstname, sizeof(dstname), TempBuf, p); + snprintf(srcname, sizeof(srcname), mask, j, num); + unlink(dstname); + int res = link(srcname, dstname); + if (res) { + fprintf(stderr, "Can not link %s to %s\n", srcname, dstname); + return res; + } + } + } + return 0; + } + + int Sort(size_t memory, int maxportions = 1000, bool direct = false) { + int ret, end, beg, i; + char fname[FILENAME_MAX]; + + if (Vector.size()) + NextPortion(); + + if (Error) + return Error; + if (!Portions) { + TMyHeap::Init(&DummyFile, 1); // 
closed file + HPages = 1; + return 0; + } + + Optimize(memory, maxportions); + if (!(InFiles = new TTmpIn[MPortions])) + return MBDB_NO_MEMORY; + + for (beg = 0; beg < Portions && !Error; beg = end) { + end = (int)Min(beg + FPortions, Portions); + for (i = beg; i < end && !Error; i++) { + snprintf(fname, sizeof(fname), TempBuf, i); + if ((ret = InFiles[i - beg].Open(fname, HPages, 1, nullptr, direct))) + Error = Error ? Error : ret; + } + if (Error) + return Error; + TMyHeap::Init(InFiles, end - beg); + if (end != Portions) { + TTmpOut out; + const TVal* v; + snprintf(fname, sizeof(fname), TempBuf, Portions++); + if ((ret = out.Open(fname, Pagesize, HPages))) + return Error = Error ? Error : ret; + while ((v = TMyHeap::Next())) + out.PushWithExtInfo(v); + ret = out.GetError(); + Error = Error ? Error : ret; + ret = out.Close(); + Error = Error ? Error : ret; + for (i = beg; i < end; i++) { + ret = InFiles[i - beg].Close(); + Error = Error ? Error : ret; + snprintf(fname, sizeof(fname), TempBuf, CPortions++); + unlink(fname); + } + } + FPortions = MPortions; + } + return Error; + } + + int Close() { + char fname[FILENAME_MAX]; + delete[] InFiles; + InFiles = nullptr; + Closep(); + for (int i = CPortions; i < Portions; i++) { + snprintf(fname, sizeof(fname), TempBuf, i); + unlink(fname); + } + CPortions = Portions = 0; + free(TempBuf); + TempBuf = nullptr; + return Error; + } + + void UseSegmentSorter() { + Sorter.Reset(new TSegmentedSorter); + } + + inline int GetError() const { + return Error; + } + + inline int GetPages() const { + return Pages; + } + + inline int GetPageSize() const { + return Pagesize; + } + +private: + static TMyIterator SieveRange(const TMyIterator begin, const TMyIterator end) { + TMyIterator it = begin; + TMyIterator prev = begin; + + for (++it; it != end; ++it) { + if (TSieve::Sieve((TVal*)*prev, *it)) { + continue; + } + + ++prev; + + if (it != prev) { + *prev = *it; + } + } + + TSieve::Sieve((TVal*)*prev, *prev); + + return ++prev; + } + +protected: + void Optimize(size_t memory, int maxportions, size_t fbufmax = 256u << 20) { + maxportions = (int)Min((size_t)maxportions, memory / Pagesize) - 1; + size_t maxpages = Max((size_t)1u, fbufmax / Pagesize); + + if (maxportions <= 2) { + FPortions = MPortions = 2; + HPages = 1; + return; + } + if (maxportions >= Portions) { + FPortions = MPortions = Portions; + HPages = (int)Min(memory / ((Portions + 1) * Pagesize), maxpages); + return; + } + if (((Portions + maxportions - 1) / maxportions) <= maxportions) { + while (((Portions + maxportions - 1) / maxportions) <= maxportions) + --maxportions; + MPortions = ++maxportions; + int total = ((Portions + maxportions - 1) / maxportions) + Portions; + FPortions = (total % maxportions) ? 
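+            // the first merge wave takes the remainder so every later wave
+            // can run at the full fan-in of MPortions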
(total % maxportions) : MPortions; + HPages = (int)Min(memory / ((MPortions + 1) * Pagesize), maxpages); + return; + } + FPortions = MPortions = maxportions; + HPages = (int)Min(memory / ((MPortions + 1) * Pagesize), maxpages); + } + + TMyVector Vector; + typename TMyVector::iterator Ptr; + const TVal* Cur; + TTmpIn *InFiles, DummyFile; + char* TempBuf; + int Portions, CPortions, Pagesize, Pages, Error; + int FPortions, MPortions, HPages; + THolder<IPortionSorter> Sorter; +}; + +template <class TVal, class TCompare, typename TCompress> +class TDatSorterImpl<TVal, TCompare, TCompress, TFakeSieve<TVal>, TOutputPageFile, TDefInterFileTypes> + : public TDatSorterImplBase<TVal, TCompare, TCompress, TFakeSieve<TVal>, TOutputPageFile, TDefInterFileTypes> { + typedef TDatSorterImplBase<TVal, TCompare, TCompress, TFakeSieve<TVal>, TOutputPageFile, TDefInterFileTypes> TBase; + +public: + int SortToFile(const char* name, size_t memory, int maxportions = 1000) { + int ret = TBase::Sort(memory, maxportions); + if (ret) + return ret; + typename TBase::TOut out; + if ((ret = out.Open(name, TBase::Pagesize, TBase::HPages))) + return ret; + const TVal* rec; + while ((rec = Next())) + out.PushWithExtInfo(rec); + if ((ret = out.GetError())) + return ret; + if ((ret = out.Close())) + return ret; + if ((ret = TBase::Close())) + return ret; + return 0; + } + + int SortToStream(TAutoPtr<IOutputStream> output, size_t memory, int maxportions = 1000) { + int ret = TBase::Sort(memory, maxportions); + if (ret) + return ret; + typename TBase::TOut out; + if ((ret = out.Open(output, TBase::Pagesize, TBase::HPages))) + return ret; + const TVal* rec; + while ((rec = Next())) + out.PushWithExtInfo(rec); + if ((ret = out.GetError())) + return ret; + if ((ret = out.Close())) + return ret; + if ((ret = TBase::Close())) + return ret; + return 0; + } + + const TVal* Next() { + return TBase::TMyHeap::Next(); + } + + const TVal* Current() const { + return TBase::TMyHeap::Current(); + } + + bool GetExtInfo(typename TExtInfoType<TVal>::TResult* extInfo) const { + return TBase::TMyHeap::GetExtInfo(extInfo); + } + + const ui8* GetExtInfoRaw(size_t* len) const { + return TBase::TMyHeap::GetExtInfoRaw(len); + } +}; + +template <class TVal, class TCompare, typename TCompress, typename TSieve, + typename TOutPageFile = TOutputPageFile, typename TFileTypes = TDefInterFileTypes> +class TDatSorterImpl: public TDatSorterImplBase<TVal, TCompare, TCompress, TSieve, TOutPageFile, TFileTypes> { + typedef TDatSorterImplBase<TVal, TCompare, TCompress, TSieve, TOutPageFile, TFileTypes> TBase; + +public: + TDatSorterImpl() + : Cur(nullptr) + , Prev(nullptr) + { + } + + int SortToFile(const char* name, size_t memory, int maxportions = 1000) { + int ret = Sort(memory, maxportions); + if (ret) + return ret; + typename TBase::TOut out; + if ((ret = out.Open(name, TBase::Pagesize, TBase::HPages))) + return ret; + const TVal* rec; + while ((rec = Next())) + out.PushWithExtInfo(rec); + if ((ret = out.GetError())) + return ret; + if ((ret = out.Close())) + return ret; + if ((ret = TBase::Close())) + return ret; + return 0; + } + + int SortToStream(TAutoPtr<IOutputStream> output, size_t memory, int maxportions = 1000) { + int ret = Sort(memory, maxportions); + if (ret) + return ret; + typename TBase::TOut out; + if ((ret = out.Open(output, TBase::Pagesize, TBase::HPages))) + return ret; + const TVal* rec; + while ((rec = Next())) + out.PushWithExtInfo(rec); + if ((ret = out.GetError())) + return ret; + if ((ret = out.Close())) + return ret; + if ((ret = 
TBase::Close())) + return ret; + return 0; + } + + int Open(const char* templ, size_t pagesize, size_t pages, int pagesOrBytes = 1) { + int res = TBase::Open(templ, pagesize, pages, pagesOrBytes); + Prev = nullptr; + Cur = nullptr; + return res; + } + + int Sort(size_t memory, int maxportions = 1000, bool direct = false) { + int res = TBase::Sort(memory, maxportions, direct); + if (!res) { + const TVal* rec = TBase::TMyHeap::Next(); + if (rec) { + size_t els, es; + size_t sz = NMicroBDB::SizeOfExt(rec, &els, &es); + sz += els + es; + if (!TExtInfoType<TVal>::Exists) + Cur = (TVal*)malloc(sizeof(TVal)); + else + Cur = (TVal*)malloc(TBase::Pagesize); + memcpy(Cur, rec, sz); + } + } + return res; + } + + // Prev = last returned + // Cur = current accumlating with TSieve + + const TVal* Next() { + if (!Cur) { + if (Prev) { + free(Prev); + Prev = nullptr; + } + return nullptr; + } + const TVal* rec; + + if (TIsSieveFake<TSieve>::Result) + rec = TBase::TMyHeap::Next(); + else { + do { + rec = TBase::TMyHeap::Next(); + } while (rec && TSieve::Sieve((TVal*)Cur, rec)); + } + + if (!Prev) { + if (!TExtInfoType<TVal>::Exists) + Prev = (TVal*)malloc(sizeof(TVal)); + else + Prev = (TVal*)malloc(TBase::Pagesize); + } + size_t els, es; + size_t sz = NMicroBDB::SizeOfExt(Cur, &els, &es); + sz += els + es; + memcpy(Prev, Cur, sz); + + if (rec) { + sz = NMicroBDB::SizeOfExt(rec, &els, &es); + sz += els + es; + memcpy(Cur, rec, sz); + } else { + TSieve::Sieve((TVal*)Cur, Cur); + free(Cur); + Cur = nullptr; + } + return Prev; + } + + const TVal* Current() const { + return Prev; + } + + int Close() { + int res = TBase::Close(); + if (Prev) { + free(Prev); + Prev = nullptr; + } + if (Cur) { + free(Cur); + Cur = nullptr; + } + return res; + } + +protected: + TVal* Cur; + TVal* Prev; +}; diff --git a/library/cpp/microbdb/sorterdef.h b/library/cpp/microbdb/sorterdef.h new file mode 100644 index 0000000000..8834b5fff8 --- /dev/null +++ b/library/cpp/microbdb/sorterdef.h @@ -0,0 +1,19 @@ +#pragma once + +#define MAKESORTERTMPL(TRecord, MemberFunc) \ + template <typename T> \ + struct MemberFunc; \ + template <> \ + struct MemberFunc<TRecord> { \ + bool operator()(const TRecord* l, const TRecord* r) { \ + return TRecord ::MemberFunc(l, r) < 0; \ + } \ + int operator()(const TRecord* l, const TRecord* r, int) { \ + return TRecord ::MemberFunc(l, r); \ + } \ + } + +template <typename T> +static inline int compare(const T& a, const T& b) { + return (a < b) ? 
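+    // three-way compare: -1 if a < b, otherwise (a > b) yields 1 or 0 (equal)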
-1 : (a > b); +} diff --git a/library/cpp/microbdb/utility.h b/library/cpp/microbdb/utility.h new file mode 100644 index 0000000000..5c86061bca --- /dev/null +++ b/library/cpp/microbdb/utility.h @@ -0,0 +1,75 @@ +#pragma once + +#include "microbdb.h" + +template <class TRecord, template <class T> class TCompare> +int SortData(const TFile& ifile, const TFile& ofile, const TDatMetaPage* meta, size_t memory, const char* tmpDir = nullptr) { + char templ[FILENAME_MAX]; + TInDatFileImpl<TRecord> datin; + TOutDatFileImpl<TRecord> datout; + TDatSorterImpl<TRecord, TCompare<TRecord>, TFakeCompression, TFakeSieve<TRecord>> sorter; + const TRecord* u; + int ret; + + const size_t minMemory = (2u << 20); + memory = Max(memory, minMemory + minMemory / 2); + if (datin.Open(ifile, meta, memory - minMemory, 0)) + err(1, "can't read input file"); + + size_t outpages = Max((size_t)2u, minMemory / datin.GetPageSize()); + memory -= outpages * datin.GetPageSize(); + + if (ret = MakeSorterTempl(templ, tmpDir)) + err(1, "can't create tempdir in \"%s\"; error: %d\n", templ, ret); + + if (sorter.Open(templ, datin.GetPageSize(), outpages)) { + *strrchr(templ, LOCSLASH_C) = 0; + RemoveDirWithContents(templ); + err(1, "can't open sorter"); + } + + while (1) { + datin.Freeze(); + while ((u = datin.Next())) + sorter.PushWithExtInfo(u); + sorter.NextPortion(); + if (datin.GetError() || datin.IsEof()) + break; + } + + if (datin.GetError()) { + *strrchr(templ, LOCSLASH_C) = 0; + RemoveDirWithContents(templ); + err(1, "in data file error %d", datin.GetError()); + } + if (datin.Close()) { + *strrchr(templ, LOCSLASH_C) = 0; + RemoveDirWithContents(templ); + err(1, "can't close in data file"); + } + + sorter.Sort(memory); + + if (datout.Open(ofile, datin.GetPageSize(), outpages)) { + *strrchr(templ, LOCSLASH_C) = 0; + RemoveDirWithContents(templ); + err(1, "can't write out file"); + } + + while ((u = sorter.Next())) + datout.PushWithExtInfo(u); + + if (sorter.GetError()) + err(1, "sorter error %d", sorter.GetError()); + if (sorter.Close()) + err(1, "can't close sorter"); + + *strrchr(templ, LOCSLASH_C) = 0; + RemoveDirWithContents(templ); + + if (datout.GetError()) + err(1, "out data file error %d", datout.GetError()); + if (datout.Close()) + err(1, "can't close out data file"); + return 0; +} diff --git a/library/cpp/microbdb/wrappers.h b/library/cpp/microbdb/wrappers.h new file mode 100644 index 0000000000..38eb8edebc --- /dev/null +++ b/library/cpp/microbdb/wrappers.h @@ -0,0 +1,637 @@ +#pragma once + +#include "microbdb.h" + +#define MAKEFILTERTMPL(TRecord, MemberFunc, NS) \ + template <typename T> \ + struct MemberFunc; \ + template <> \ + struct MemberFunc<TRecord> { \ + bool operator()(const TRecord* r) { \ + return NS::MemberFunc(r); \ + } \ + } + +#define MAKEJOINTMPL(TRecordA, TRecordB, MemberFunc, NS, TMergeType) \ + template <typename A, typename B> \ + struct MemberFunc; \ + template <> \ + struct MemberFunc<TRecordA, TRecordB> { \ + int operator()(const TRecordA* l, const TRecordB* r) { \ + return NS::MemberFunc(l, r); \ + } \ + }; \ + typedef TMergeRec<TRecordA, TRecordB> TMergeType + +#define MAKEJOINTMPL2(TRecordA, TRecordB, MemberFunc, StructName, TMergeType) \ + template <typename A, typename B> \ + struct StructName; \ + template <> \ + struct StructName<TRecordA, TRecordB> { \ + int operator()(const TRecordA* l, const TRecordB* r) { \ + return MemberFunc(l, r); \ + } \ + }; \ + typedef TMergeRec<TRecordA, TRecordB> TMergeType + +#define MAKEJOINTMPLLEFT(TRecordA, TRecordB, MemberFunc, NS, TMergeType) \ + 
template <typename A, typename B> \ + struct MemberFunc; \ + template <> \ + struct MemberFunc<TRecordA, TRecordB> { \ + int operator()(const TRecordA* l, const TRecordB* r) { \ + return NS::MemberFunc(l->RecA, r); \ + } \ + }; \ + typedef TMergeRec<TRecordA, TRecordB> TMergeType + +template <class TRec> +class IDatNextSource { +public: + virtual const TRec* Next() = 0; + virtual void Work() { + } +}; + +template <class TRec> +class IDatNextReceiver { +public: + IDatNextReceiver(IDatNextSource<TRec>& source) + : Source(source) + { + } + + virtual void Work() { + Source.Work(); + } + +protected: + IDatNextSource<TRec>& Source; +}; + +template <class TInRec, class TOutRec> +class IDatNextChannel: public IDatNextReceiver<TInRec>, public IDatNextSource<TOutRec> { +public: + IDatNextChannel(IDatNextSource<TInRec>& source) + : IDatNextReceiver<TInRec>(source) + { + } + + virtual void Work() { + IDatNextReceiver<TInRec>::Work(); + } +}; + +class IDatWorker { +public: + virtual void Work() = 0; +}; + +template <class TRec> +class IDatPushReceiver { +public: + virtual void Push(const TRec* rec) = 0; + virtual void Work() = 0; +}; + +template <class TRec> +class IDatPushSource { +public: + IDatPushSource(IDatPushReceiver<TRec>& receiver) + : Receiver(receiver) + { + } + + virtual void Work() { + Receiver.Work(); + } + +protected: + IDatPushReceiver<TRec>& Receiver; +}; + +template <class TInRec, class TOutRec> +class IDatPushChannel: public IDatPushReceiver<TInRec>, public IDatPushSource<TOutRec> { +public: + IDatPushChannel(IDatPushReceiver<TOutRec>& receiver) + : IDatPushSource<TOutRec>(receiver) + { + } + + virtual void Work() { + IDatPushSource<TOutRec>::Work(); + } +}; + +template <class TRec> +class IDatNextToPush: public IDatNextReceiver<TRec>, public IDatPushSource<TRec> { + typedef IDatNextReceiver<TRec> TNextReceiver; + typedef IDatPushSource<TRec> TPushSource; + +public: + IDatNextToPush(IDatNextSource<TRec>& source, IDatPushReceiver<TRec>& receiver) + : TNextReceiver(source) + , TPushSource(receiver) + { + } + + virtual void Work() { + const TRec* rec; + while (rec = TNextReceiver::Source.Next()) + TPushSource::Receiver.Push(rec); + TPushSource::Work(); + TNextReceiver::Work(); + } +}; + +template <class TRec> +class TDatNextPNSplitter: public IDatNextReceiver<TRec>, public IDatNextSource<TRec>, public IDatPushSource<TRec> { +public: + TDatNextPNSplitter(IDatNextSource<TRec>& source, IDatPushReceiver<TRec>& receiver) + : IDatNextReceiver<TRec>(source) + , IDatNextSource<TRec>() + , IDatPushSource<TRec>(receiver) + { + } + + const TRec* Next() { + const TRec* rec = IDatNextReceiver<TRec>::Source.Next(); + if (rec) { + IDatPushSource<TRec>::Receiver.Push(rec); + return rec; + } else { + return 0; + } + } + + virtual void Work() { + IDatNextReceiver<TRec>::Work(); + IDatPushSource<TRec>::Work(); + } +}; + +template <class TRec, class TOutRecA = TRec, class TOutRecB = TRec> +class TDatPushPPSplitter: public IDatPushReceiver<TRec>, public IDatPushSource<TOutRecA>, public IDatPushSource<TOutRecB> { +public: + TDatPushPPSplitter(IDatPushReceiver<TOutRecA>& receiverA, IDatPushReceiver<TOutRecB>& receiverB) + : IDatPushSource<TOutRecA>(receiverA) + , IDatPushSource<TOutRecB>(receiverB) + { + } + + void Push(const TRec* rec) { + IDatPushSource<TOutRecA>::Receiver.Push(rec); + IDatPushSource<TOutRecB>::Receiver.Push(rec); + } + + void Work() { + IDatPushSource<TOutRecA>::Work(); + IDatPushSource<TOutRecB>::Work(); + } +}; + +template <class TRec> +class TFastInDatFile: public TInDatFile<TRec>, 
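+                         // adapter: lets the file act as a pull-style IDatNextSource record stream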
public IDatNextSource<TRec> { +public: + typedef TInDatFile<TRec> Base; + + TFastInDatFile(const char* name, bool open = true, size_t pages = dbcfg::fbufsize, int pagesOrBytes = 0) + : TInDatFile<TRec>(name, pages, pagesOrBytes) + , FileName(name) + { + if (open) + Base::Open(name); + } + + void Open() { + Base::Open(FileName); + } + + template <class TPassRec> + bool PassToUid(const TRec* inrec, const TPassRec* torec) { + inrec = Base::Current(); + while (inrec && CompareUids(inrec, torec) < 0) + inrec = Base::Next(); + return (inrec && CompareUids(inrec, torec) == 0); + } + + void Work() { + Base::Close(); + } + + const TRec* Next() { + return Base::Next(); + } + +private: + TString FileName; +}; + +template <class TRec> +class TPushOutDatFile: public TOutDatFile<TRec>, public IDatPushReceiver<TRec> { +public: + typedef TOutDatFile<TRec> Base; + + TPushOutDatFile(const char* name, bool open = true) + : Base(name, dbcfg::pg_docuid, dbcfg::fbufsize, 0) + , FileName(name) + { + if (open) + Base::Open(name); + } + + void Open() { + Base::Open(~FileName); + } + + void Push(const TRec* rec) { + Base::Push(rec); + } + + void Work() { + Base::Close(); + } + +private: + TString FileName; +}; + +template <class TRec> +class TNextOutDatFile: public IDatNextToPush<TRec> { +public: + typedef IDatNextToPush<TRec> TBase; + + TNextOutDatFile(const char* name, IDatNextSource<TRec>& source, bool open = true) + : TBase(source, File) + , File(name, open) + { + } + + void Open() { + File.Open(); + } + +private: + TPushOutDatFile<TRec> File; +}; + +template <class TVal, template <typename T> class TCompare> +class TNextDatSorterMemo: public TDatSorterMemo<TVal, TCompare>, public IDatNextChannel<TVal, TVal> { + typedef TDatSorterMemo<TVal, TCompare> TImpl; + +public: + TNextDatSorterMemo(IDatNextSource<TVal>& source, const char* dir = dbcfg::fname_temp, const char* name = "yet another sorter", size_t memory = dbcfg::small_sorter_size, size_t pagesize = dbcfg::pg_docuid, size_t pages = dbcfg::fbufsize, int pagesOrBytes = 0) + : TImpl(name, memory, pagesize, pages, pagesOrBytes) + , IDatNextChannel<TVal, TVal>(source) + , Sorted(false) + { + TImpl::Open(dir); + } + + void Sort() { + const TVal* rec; + while (rec = IDatNextChannel<TVal, TVal>::Source.Next()) { + TImpl::Push(rec); + } + TImpl::Sort(); + Sorted = true; + } + + const TVal* Next() { + if (!Sorted) + Sort(); + return TImpl::Next(); + } + +private: + bool Sorted; + TString Dir; +}; + +template <class TInRec, class TOutRec> +class TDatConverter: public IDatNextChannel<TInRec, TOutRec> { +public: + TDatConverter(IDatNextSource<TInRec>& source) + : IDatNextChannel<TInRec, TOutRec>(source) + { + } + + virtual void Convert(const TInRec& inrec, TOutRec& outrec) { + outrec(inrec); + } + + const TOutRec* Next() { + const TInRec* rec = IDatNextChannel<TInRec, TOutRec>::Source.Next(); + if (!rec) + return 0; + Convert(*rec, CurrentRec); + return &CurrentRec; + } + +private: + TOutRec CurrentRec; +}; + +template <class TRecA, class TRecB> +class TMergeRec { +public: + const TRecA* RecA; + const TRecB* RecB; +}; + +enum NMergeTypes { + MT_JOIN = 0, + MT_ADD = 1, + MT_OVERWRITE = 2, + MT_TYPENUM +}; + +template <class TRecA, class TRecB, template <typename TA, typename TB> class TCompare> +class TNextDatMerger: public IDatNextReceiver<TRecA>, public IDatNextReceiver<TRecB>, public IDatNextSource<TMergeRec<TRecA, TRecB>> { +public: + TNextDatMerger(IDatNextSource<TRecA>& sourceA, IDatNextSource<TRecB>& sourceB, ui8 mergeType) + : IDatNextReceiver<TRecA>(sourceA) + , 
IDatNextReceiver<TRecB>(sourceB) + , MergeType(mergeType) + , MoveA(false) + , MoveB(false) + , NotInit(true) + { + } + + const TMergeRec<TRecA, TRecB>* Next() { + if (MoveA || NotInit) + SourceARec = IDatNextReceiver<TRecA>::Source.Next(); + if (MoveB || NotInit) + SourceBRec = IDatNextReceiver<TRecB>::Source.Next(); + NotInit = false; + + // Cout << "Next " << SourceARec->HostId << "\t" << SourceBRec->HostId << "\t" << TCompare<TRecA, TRecB>()(SourceARec, SourceBRec) << "\t" << ::compare(SourceARec->HostId, SourceBRec->HostId) << "\t" << ::compare(1, 2) << "\t" << ::compare(2,1) << Endl; + if (MergeType == MT_ADD && SourceARec && (!SourceBRec || TCompare<TRecA, TRecB>()(SourceARec, SourceBRec) < 0)) { + MergeRec.RecA = SourceARec; + MergeRec.RecB = 0; + MoveA = true; + MoveB = false; + return &MergeRec; + } + + if (MergeType == MT_ADD && SourceBRec && (!SourceARec || TCompare<TRecA, TRecB>()(SourceARec, SourceBRec) < 0)) { + MergeRec.RecA = 0; + MergeRec.RecB = SourceBRec; + MoveA = false; + MoveB = true; + return &MergeRec; + } + + if (MergeType == MT_ADD && SourceARec && SourceBRec && TCompare<TRecA, TRecB>()(SourceARec, SourceBRec) == 0) { + MergeRec.RecA = SourceARec; + MergeRec.RecB = SourceBRec; + MoveA = true; + MoveB = true; + return &MergeRec; + } + + while (MergeType == MT_JOIN && SourceARec && SourceBRec && TCompare<TRecA, TRecB>()(SourceARec, SourceBRec) != 0) { + while (SourceARec && TCompare<TRecA, TRecB>()(SourceARec, SourceBRec) < 0) { + SourceARec = IDatNextReceiver<TRecA>::Source.Next(); + } + while (SourceARec && SourceBRec && TCompare<TRecA, TRecB>()(SourceARec, SourceBRec) > 0) { + SourceBRec = IDatNextReceiver<TRecB>::Source.Next(); + } + } + + if (MergeType == MT_JOIN && SourceARec && SourceBRec) { + MergeRec.RecA = SourceARec; + MergeRec.RecB = SourceBRec; + MoveA = true; + MoveB = true; + return &MergeRec; + } + + MergeRec.RecA = 0; + MergeRec.RecB = 0; + return 0; + } + + void Work() { + IDatNextReceiver<TRecA>::Source.Work(); + IDatNextReceiver<TRecB>::Source.Work(); + } + +private: + TMergeRec<TRecA, TRecB> MergeRec; + const TRecA* SourceARec; + const TRecB* SourceBRec; + ui8 MergeType; + bool MoveA; + bool MoveB; + bool NotInit; +}; + +/*template<class TRec, class TSource, template <typename T> class TCompare, class TReceiver = TPushOutDatFile<TRec> > +class TPushDatMerger { +public: + TPushDatMerger(TSource& source, TReceiver& receiver, ui8 mergeType) + : Source(source) + , Receiver(receiver) + , MergeType(mergeType) + { + } + + virtual void Init() { + SourceRec = Source.Next(); + } + + virtual void Push(const TRec* rec) { + while (SourceRec && TCompare<TRec>()(SourceRec, rec, 0) < 0) { + if (MergeType == MT_OVERWRITE || MergeType == MT_ADD) + Receiver.Push(SourceRec); + SourceRec = Source.Next(); + } + + bool intersected = false; + while (SourceRec && TCompare<TRec>()(SourceRec, rec, 0) == 0) { + intersected = true; + if (MergeType == MT_ADD) + Receiver.Push(SourceRec); + SourceRec = Source.Next(); + } + + if (intersected && MergeType == MT_JOIN) + Receiver.Push(rec); + + if (MergeType == MT_OVERWRITE || MergeType == MT_ADD) + Receiver.Push(rec); + } + + virtual void Term() { + if (MergeType == MT_OVERWRITE || MergeType == MT_ADD) { + while (SourceRec) { + Receiver.Push(SourceRec); + SourceRec = Source.Next(); + } + } + } + +private: + TSource& Source; + const TRec* SourceRec; + TReceiver& Receiver; + ui8 MergeType; +};*/ + +/*template <class TRec, class TSourceA, class TSourceB, template <typename T> class TCompare, class TReceiver = TPushOutDatFile<TRec> 
> +class TNextDatMerger: public TPushDatMerger<TRec, TSourceA, TCompare, TReceiver> { + typedef TPushDatMerger<TRec, TSourceA, TCompare, TReceiver> TImpl; +public: + TNextDatMerger(TSourceA& sourceA, TSourceB& sourceB, TReceiver& receiver, ui8 mergeType) + : TImpl(sourceA, receiver, mergeType) + , SourceB(sourceB) + { + } + + virtual void Work() { + TImpl::Init(); + while (SourceBRec = SourceB.Next()) { + TImpl::Push(SourceBRec); + } + TImpl::Term(); + } +private: + TSourceB& SourceB; + const TRec* SourceBRec; +};*/ + +/*template <class TRec, template <typename T> class TCompare, class TReceiver = TPushOutDatFile<TRec> > +class TFilePushDatMerger: public TPushDatMerger<TRec, TFastInDatFile<TRec>, TCompare, TReceiver> { + typedef TPushDatMerger<TRec, TFastInDatFile<TRec>, TCompare, TReceiver> TImpl; +public: + TFilePushDatMerger(const char* name, TReceiver& receiver, ui8 mergeType) + : TImpl(SourceFile, receiver, mergeType) + , SourceFile(name) + { + } + + virtual void Push(const TRec* rec) { + TImpl::Push(rec); + } + + virtual void Term() { + TImpl::Term(); + } +private: + TFastInDatFile<TRec> SourceFile; +};*/ + +/*template <class TRec, template <typename T> class TCompare, class TReceiver = TPushOutDatFile<TRec> > +class TFileNextDatMerger: public TNextDatMerger<TRec, TFastInDatFile<TRec>, TFastInDatFile<TRec>, TCompare, TReceiver> { + typedef TNextDatMerger<TRec, TFastInDatFile<TRec>, TFastInDatFile<TRec>, TCompare, TReceiver> TImpl; +public: + TFileNextDatMerger(const char* sourceAname, const char* sourceBname, TReceiver& receiver, ui8 mergeType) + : TImpl(FileA, FileB, receiver, mergeType) + , FileA(sourceAname) + , FileB(sourceBname) + { + } + + virtual void Work() { + TImpl::Work(); + } +private: + TFastInDatFile<TRec> FileA; + TFastInDatFile<TRec> FileB; +};*/ + +template <class TRec, template <typename T> class TPredicate> +class TDatNextFilter: public IDatNextChannel<TRec, TRec> { +public: + TDatNextFilter(IDatNextSource<TRec>& source) + : IDatNextChannel<TRec, TRec>(source) + { + } + + virtual const TRec* Next() { + const TRec* rec; + while ((rec = IDatNextChannel<TRec, TRec>::Source.Next()) != 0 && !Check(rec)) { + } + if (!rec) + return 0; + return rec; + } + +protected: + virtual bool Check(const TRec* rec) { + return TPredicate<TRec>()(rec); + } +}; + +template <class TRec, template <typename T> class TPredicate> +class TDatPushFilter: public IDatPushChannel<TRec, TRec> { +public: + TDatPushFilter(IDatPushReceiver<TRec>& receiver) + : IDatPushChannel<TRec, TRec>(receiver) + { + } + + virtual void Push(const TRec* rec) { + if (Check(rec)) + IDatPushChannel<TRec, TRec>::Receiver.Push(rec); + } + +private: + virtual bool Check(const TRec* rec) { + return TPredicate<TRec>()(rec); + } +}; + +template <class TInRec, class TOutRec, template <typename T> class TCompare> +class TDatGrouper: public IDatNextChannel<TInRec, TOutRec> { +public: + TDatGrouper(IDatNextSource<TInRec>& source) + : IDatNextChannel<TInRec, TOutRec>(source) + , Begin(true) + , Finish(false) + , HasOutput(false) + { + } + + const TOutRec* Next() { + while (CurrentRec = IDatNextChannel<TInRec, TOutRec>::Source.Next()) { + int cmp = 0; + if (Begin) { + Begin = false; + OnStart(); + } else if ((cmp = TCompare<TInRec>()(CurrentRec, LastRec, 0)) != 0) { + OnFinish(); + OnStart(); + } + OnRecord(); + LastRec = CurrentRec; + if (HasOutput) { + HasOutput = false; + return &OutRec; + } + } + if (!Finish) + OnFinish(); + Finish = true; + if (HasOutput) { + HasOutput = false; + return &OutRec; + } + return 0; + } + 
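+    // group hooks: OnStart()/OnRecord()/OnFinish() fire per key group (as defined by
+    // TCompare); an implementation fills OutRec and sets HasOutput to emit a record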
+protected: + virtual void OnStart() = 0; + virtual void OnRecord() = 0; + virtual void OnFinish() = 0; + + const TInRec* CurrentRec; + const TInRec* LastRec; + TOutRec OutRec; + + bool Begin; + bool Finish; + bool HasOutput; +}; diff --git a/library/cpp/microbdb/ya.make b/library/cpp/microbdb/ya.make new file mode 100644 index 0000000000..3e553f8535 --- /dev/null +++ b/library/cpp/microbdb/ya.make @@ -0,0 +1,36 @@ +LIBRARY() + +SRCS( + align.h + compressed.h + extinfo.h + file.cpp + hashes.h + header.h + header.cpp + heap.h + input.h + microbdb.cpp + noextinfo.proto + output.h + powersorter.h + reader.h + safeopen.h + sorter.h + sorterdef.h + utility.h + wrappers.h +) + +PEERDIR( + contrib/libs/fastlz + contrib/libs/libc_compat + contrib/libs/protobuf + contrib/libs/snappy + contrib/libs/zlib + library/cpp/deprecated/fgood + library/cpp/on_disk/st_hash + library/cpp/packedtypes +) + +END() diff --git a/library/cpp/on_disk/CMakeLists.txt b/library/cpp/on_disk/CMakeLists.txt index 4202947169..ade3b33c9a 100644 --- a/library/cpp/on_disk/CMakeLists.txt +++ b/library/cpp/on_disk/CMakeLists.txt @@ -7,3 +7,4 @@ add_subdirectory(chunks) +add_subdirectory(st_hash) diff --git a/library/cpp/on_disk/st_hash/CMakeLists.darwin-x86_64.txt b/library/cpp/on_disk/st_hash/CMakeLists.darwin-x86_64.txt new file mode 100644 index 0000000000..ad332fef62 --- /dev/null +++ b/library/cpp/on_disk/st_hash/CMakeLists.darwin-x86_64.txt @@ -0,0 +1,18 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-on_disk-st_hash) +target_link_libraries(cpp-on_disk-st_hash PUBLIC + contrib-libs-cxxsupp + yutil + cpp-deprecated-mapped_file +) +target_sources(cpp-on_disk-st_hash PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/on_disk/st_hash/fake.cpp +) diff --git a/library/cpp/on_disk/st_hash/CMakeLists.linux-aarch64.txt b/library/cpp/on_disk/st_hash/CMakeLists.linux-aarch64.txt new file mode 100644 index 0000000000..737875ca6c --- /dev/null +++ b/library/cpp/on_disk/st_hash/CMakeLists.linux-aarch64.txt @@ -0,0 +1,19 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-on_disk-st_hash) +target_link_libraries(cpp-on_disk-st_hash PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil + cpp-deprecated-mapped_file +) +target_sources(cpp-on_disk-st_hash PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/on_disk/st_hash/fake.cpp +) diff --git a/library/cpp/on_disk/st_hash/CMakeLists.linux-x86_64.txt b/library/cpp/on_disk/st_hash/CMakeLists.linux-x86_64.txt new file mode 100644 index 0000000000..737875ca6c --- /dev/null +++ b/library/cpp/on_disk/st_hash/CMakeLists.linux-x86_64.txt @@ -0,0 +1,19 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. 
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-on_disk-st_hash) +target_link_libraries(cpp-on_disk-st_hash PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil + cpp-deprecated-mapped_file +) +target_sources(cpp-on_disk-st_hash PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/on_disk/st_hash/fake.cpp +) diff --git a/library/cpp/on_disk/st_hash/CMakeLists.txt b/library/cpp/on_disk/st_hash/CMakeLists.txt new file mode 100644 index 0000000000..f8b31df0c1 --- /dev/null +++ b/library/cpp/on_disk/st_hash/CMakeLists.txt @@ -0,0 +1,17 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-aarch64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + include(CMakeLists.darwin-x86_64.txt) +elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA) + include(CMakeLists.windows-x86_64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-x86_64.txt) +endif() diff --git a/library/cpp/on_disk/st_hash/CMakeLists.windows-x86_64.txt b/library/cpp/on_disk/st_hash/CMakeLists.windows-x86_64.txt new file mode 100644 index 0000000000..ad332fef62 --- /dev/null +++ b/library/cpp/on_disk/st_hash/CMakeLists.windows-x86_64.txt @@ -0,0 +1,18 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. 
+ + + +add_library(cpp-on_disk-st_hash) +target_link_libraries(cpp-on_disk-st_hash PUBLIC + contrib-libs-cxxsupp + yutil + cpp-deprecated-mapped_file +) +target_sources(cpp-on_disk-st_hash PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/on_disk/st_hash/fake.cpp +) diff --git a/library/cpp/on_disk/st_hash/fake.cpp b/library/cpp/on_disk/st_hash/fake.cpp new file mode 100644 index 0000000000..ef5af4d432 --- /dev/null +++ b/library/cpp/on_disk/st_hash/fake.cpp @@ -0,0 +1,4 @@ +#include "save_stl.h" +#include "static_hash.h" +#include "static_hash_map.h" +#include "sthash_iterators.h" diff --git a/library/cpp/on_disk/st_hash/save_stl.h b/library/cpp/on_disk/st_hash/save_stl.h new file mode 100644 index 0000000000..00f8f0e20d --- /dev/null +++ b/library/cpp/on_disk/st_hash/save_stl.h @@ -0,0 +1,84 @@ +#pragma once + +#include <util/generic/hash.h> +#include <util/system/yassert.h> +#include <util/stream/output.h> + +// this structure might be replaced with sthashtable class +template <class HF, class Eq, class size_type> +struct sthashtable_nvm_sv { + sthashtable_nvm_sv() { + if (sizeof(sthashtable_nvm_sv) != sizeof(HF) + sizeof(Eq) + 3 * sizeof(size_type)) { + memset(this, 0, sizeof(sthashtable_nvm_sv)); + } + } + + sthashtable_nvm_sv(const HF& phf, const Eq& peq, const size_type& pnb, const size_type& pne, const size_type& pnd) + : sthashtable_nvm_sv() + { + hf = phf; + eq = peq; + num_buckets = pnb; + num_elements = pne; + data_end_off = pnd; + } + + HF hf; + Eq eq; + size_type num_buckets; + size_type num_elements; + size_type data_end_off; +}; + +/** + * Some hack to save both THashMap and sthash. + * Working with stHash does not depend on the template parameters, because the content of stHash is not used inside this method. + */ +template <class V, class K, class HF, class Ex, class Eq, class A> +template <class KeySaver> +inline int THashTable<V, K, HF, Ex, Eq, A>::save_for_st(IOutputStream* stream, KeySaver& ks, sthash<int, int, THash<int>, TEqualTo<int>, typename KeySaver::TSizeType>* stHash) const { + Y_ASSERT(!stHash || stHash->bucket_count() == bucket_count()); + typedef sthashtable_nvm_sv<HF, Eq, typename KeySaver::TSizeType> sv_type; + sv_type sv = {this->_get_hash_fun(), this->_get_key_eq(), static_cast<typename KeySaver::TSizeType>(buckets.size()), static_cast<typename KeySaver::TSizeType>(num_elements), 0}; + // to do: m.b. use just the size of corresponding object? + typename KeySaver::TSizeType cur_off = sizeof(sv_type) + + (sv.num_buckets + 1) * sizeof(typename KeySaver::TSizeType); + sv.data_end_off = cur_off; + const_iterator n; + for (n = begin(); n != end(); ++n) { + sv.data_end_off += static_cast<typename KeySaver::TSizeType>(ks.GetRecordSize(*n)); + } + typename KeySaver::TSizeType* sb = stHash ? 
(typename KeySaver::TSizeType*)(stHash->buckets()) : nullptr; + if (stHash) + sv.data_end_off += static_cast<typename KeySaver::TSizeType>(sb[buckets.size()] - sb[0]); + //saver.Align(sizeof(char*)); + stream->Write(&sv, sizeof(sv)); + + size_type i; + //save vector + for (i = 0; i < buckets.size(); ++i) { + node* cur = buckets[i]; + stream->Write(&cur_off, sizeof(cur_off)); + if (cur) { + while (!((uintptr_t)cur & 1)) { + cur_off += static_cast<typename KeySaver::TSizeType>(ks.GetRecordSize(cur->val)); + cur = cur->next; + } + } + if (stHash) + cur_off += static_cast<typename KeySaver::TSizeType>(sb[i + 1] - sb[i]); + } + stream->Write(&cur_off, sizeof(cur_off)); // end mark + for (i = 0; i < buckets.size(); ++i) { + node* cur = buckets[i]; + if (cur) { + while (!((uintptr_t)cur & 1)) { + ks.SaveRecord(stream, cur->val); + cur = cur->next; + } + } + if (stHash) + stream->Write((const char*)stHash + sb[i], sb[i + 1] - sb[i]); + } + return 0; +} diff --git a/library/cpp/on_disk/st_hash/static_hash.h b/library/cpp/on_disk/st_hash/static_hash.h new file mode 100644 index 0000000000..ca7a6ccd36 --- /dev/null +++ b/library/cpp/on_disk/st_hash/static_hash.h @@ -0,0 +1,420 @@ +#pragma once + +#include "save_stl.h" +#include "sthash_iterators.h" + +#include <util/generic/hash.h> +#include <util/generic/vector.h> +#include <util/generic/buffer.h> +#include <util/generic/cast.h> +#include <util/generic/yexception.h> // for save/load only +#include <util/stream/file.h> +#include <util/stream/buffer.h> +#include <utility> + +#include <memory> +#include <algorithm> +#include <functional> + +#include <cstdlib> +#include <cstddef> + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4624) // 'destructor could not be generated because a base class destructor is inaccessible' +#endif + +template <class HashType, class KeySaver> +inline void SaveHashToStreamEx(HashType& hash, IOutputStream* stream) { + KeySaver ks; + if (hash.save_for_st(stream, ks)) + ythrow yexception() << "Could not save hash to stream"; +} + +template <class HashType> +inline void SaveHashToStream(HashType& hash, IOutputStream* stream) { + typedef TSthashWriter<typename HashType::key_type, typename HashType::mapped_type, ui64> KeySaver; + return SaveHashToStreamEx<HashType, KeySaver>(hash, stream); +} + +template <class HashType, class KeySaver> +inline void SaveHashToFileEx(HashType& hash, const char* fileName) { + TFileOutput output(fileName); + SaveHashToStreamEx<HashType, KeySaver>(hash, &output); +} + +template <class HashType> +inline void SaveHashToFile(HashType& hash, const char* fileName) { + typedef TSthashWriter<typename HashType::key_type, typename HashType::mapped_type, ui64> KeySaver; + return SaveHashToFileEx<HashType, KeySaver>(hash, fileName); +} + +template <class HashType> +inline void SaveHashSetToFile(HashType& hash, const char* fileName) { + typedef TSthashSetWriter<typename HashType::key_type, ui64> KeySaver; + return SaveHashToFileEx<HashType, KeySaver>(hash, fileName); +} + +template <class HashType> +inline void SaveHashToFile32(HashType& hash, const char* fileName) { + typedef TSthashWriter<typename HashType::key_type, typename HashType::mapped_type, ui32> KeySaver; + return SaveHashToFileEx<HashType, KeySaver>(hash, fileName); +} + +template <class HashType, class KeySaver> +inline void SaveHashToBufferEx(HashType& hash, TBuffer& buffer, sthash<int, int, THash<int>, TEqualTo<int>, typename KeySaver::TSizeType>* stHash = nullptr) { + TBufferOutput stream(buffer); + KeySaver ks; + if 
(hash.save_for_st(&stream, ks, stHash)) + ythrow yexception() << "Could not save hash to memory"; +} + +template <class HashType> +inline void SaveHashToBuffer(HashType& hash, TBuffer& buffer) { + typedef TSthashWriter<typename HashType::key_type, typename HashType::mapped_type, ui64> KeySaver; + SaveHashToBufferEx<HashType, KeySaver>(hash, buffer); +} + +/** + * Some hack to save both THashMap and sthash. + * THashMap and sthash must have same bucket_count(). + */ +template <class HashType, class StHashType> +inline void SaveHashToBuffer(HashType& hash, TBuffer& buffer, StHashType* stHash) { + typedef TSthashWriter<typename HashType::key_type, typename HashType::mapped_type, ui64> KeySaver; + typedef sthash<int, int, THash<int>, TEqualTo<int>, typename KeySaver::TSizeType>* SH; + + SH sh = reinterpret_cast<SH>(stHash); + SaveHashToBufferEx<HashType, KeySaver>(hash, buffer, sh); +} + +template <class HashType> +inline void SaveHashToBuffer32(HashType& hash, TBuffer& buffer) { + typedef TSthashWriter<typename HashType::key_type, typename HashType::mapped_type, ui32> KeySaver; + SaveHashToBufferEx<HashType, KeySaver>(hash, buffer); +} + +template <class Iter, typename size_type_f = ui64> +class sthashtable { +public: + typedef typename Iter::TKeyType key_type; + typedef typename Iter::TValueType value_type; + typedef typename Iter::THasherType hasher; + typedef typename Iter::TKeyEqualType key_equal; + + typedef size_type_f size_type; + typedef ptrdiff_t difference_type; + typedef const value_type* const_pointer; + typedef const value_type& const_reference; + + typedef Iter const_iterator; + + const hasher hash_funct() const { + return hash; + } + const key_equal key_eq() const { + return equals; + } + +private: + const hasher hash; + const key_equal equals; + +private: + const_iterator iter_at_bucket(size_type bucket) const { + return (const_iterator)(((char*)this + buckets()[bucket])); + } + + const_iterator iter_at_bucket_or_end(size_type bucket) const { + if (bucket < num_buckets) + return (const_iterator)(((char*)this + buckets()[bucket])); + else + return end(); + } + + const size_type num_buckets; + const size_type num_elements; + const size_type data_end_off; + +protected: //shut up gcc warning + // we can't construct/destroy this object at all! + sthashtable(); + sthashtable(const sthashtable& ht); + ~sthashtable(); + +public: + // const size_type *buckets; + const size_type* buckets() const { + return (size_type*)((char*)this + sizeof(*this)); + } + const size_type buckets(size_type n) const { + return buckets()[n]; + } + + size_type size() const { + return num_elements; + } + size_type max_size() const { + return size_type(-1); + } + bool empty() const { + return size() == 0; + } + + const_iterator begin() const { + return num_buckets ? 
iter_at_bucket(0) : end(); + } + + const_iterator end() const { + return (const_iterator)(((char*)this + data_end_off)); + } + +public: + size_type size_in_bytes() const { + return data_end_off; + } + + size_type bucket_count() const { + return num_buckets; + } + + size_type elems_in_bucket(size_type bucket) const { + size_type result = 0; + const_iterator first = iter_at_bucket(bucket); + const_iterator last = iter_at_bucket_or_end(bucket + 1); + + for (; first != last; ++first) + ++result; + return result; + } + + template <class TheKey> + const_iterator find(const TheKey& key) const { + size_type n = bkt_num_key(key); + const_iterator first(iter_at_bucket(n)), last(iter_at_bucket_or_end(n + 1)); + for (; + first != last && !first.KeyEquals(equals, key); + ++first) { + } + if (first != last) + return first; + return end(); + } + + size_type count(const key_type& key) const { + const size_type n = bkt_num_key(key); + size_type result = 0; + const_iterator first = iter_at_bucket(n); + const_iterator last = iter_at_bucket_or_end(n + 1); + + for (; first != last; ++first) + if (first.KeyEquals(equals, key)) + ++result; + return result; + } + + std::pair<const_iterator, const_iterator> equal_range(const key_type& key) const; + +private: + template <class TheKey> + size_type bkt_num_key(const TheKey& key) const { + return hash(key) % num_buckets; + } +}; + +template <class I, class size_type_f> +std::pair<I, I> sthashtable<I, size_type_f>::equal_range(const key_type& key) const { + typedef std::pair<const_iterator, const_iterator> pii; + const size_type n = bkt_num_key(key); + const_iterator first = iter_at_bucket(n); + const_iterator last = iter_at_bucket_or_end(n + 1); + + for (; first != last; ++first) { + if (first.KeyEquals(equals, key)) { + const_iterator cur = first; + ++cur; + for (; cur != last; ++cur) + if (!cur.KeyEquals(equals, key)) + return pii(const_iterator(first), + const_iterator(cur)); + return pii(const_iterator(first), + const_iterator(last)); + } + } + return pii(end(), end()); +} + +/* end __SGI_STL_HASHTABLE_H */ + +template <class Key, class T, class HashFcn /*= hash<Key>*/, + class EqualKey = TEqualTo<Key>, typename size_type_f = ui64> +class sthash { +private: + typedef sthashtable<TSthashIterator<const Key, const T, HashFcn, EqualKey>, size_type_f> ht; + ht rep; + +public: + typedef typename ht::key_type key_type; + typedef typename ht::value_type value_type; + typedef typename ht::hasher hasher; + typedef typename ht::key_equal key_equal; + typedef T mapped_type; + + typedef typename ht::size_type size_type; + typedef typename ht::difference_type difference_type; + typedef typename ht::const_pointer const_pointer; + typedef typename ht::const_reference const_reference; + + typedef typename ht::const_iterator const_iterator; + + const hasher hash_funct() const { + return rep.hash_funct(); + } + const key_equal key_eq() const { + return rep.key_eq(); + } + +public: + size_type size() const { + return rep.size(); + } + size_type max_size() const { + return rep.max_size(); + } + bool empty() const { + return rep.empty(); + } + + const_iterator begin() const { + return rep.begin(); + } + const_iterator end() const { + return rep.end(); + } + +public: + template <class TheKey> + const_iterator find(const TheKey& key) const { + return rep.find(key); + } + template <class TheKey> + bool has(const TheKey& key) const { + return rep.find(key) != rep.end(); + } + + size_type count(const key_type& key) const { + return rep.count(key); + } + + std::pair<const_iterator, 
const_iterator> equal_range(const key_type& key) const { + return rep.equal_range(key); + } + + size_type size_in_bytes() const { + return rep.size_in_bytes(); + } + + size_type bucket_count() const { + return rep.bucket_count(); + } + size_type max_bucket_count() const { + return rep.max_bucket_count(); + } + size_type elems_in_bucket(size_type n) const { + return rep.elems_in_bucket(n); + } + + const size_type* buckets() const { + return rep.buckets(); + } + const size_type buckets(size_type n) const { + return rep.buckets()[n]; + } +}; + +template <class Key, class HashFcn, + class EqualKey = TEqualTo<Key>, typename size_type_f = ui64> +class sthash_set: public sthash<Key, TEmptyValue, HashFcn, EqualKey, size_type_f> { + typedef sthash<Key, TEmptyValue, HashFcn, EqualKey, size_type_f> Base; + +public: + using Base::const_iterator; + using Base::hasher; + using Base::key_equal; + using Base::key_type; + using Base::size_type; + using Base::value_type; +}; + +template <class Key, class T, class HashFcn /*= hash<Key>*/, + class EqualKey = TEqualTo<Key>, typename size_type_f = ui64> +class sthash_mm { +private: + typedef sthashtable<TSthashIterator<const Key, T, HashFcn, EqualKey>, size_type_f> ht; + ht rep; + +public: + typedef typename ht::key_type key_type; + typedef typename ht::value_type value_type; + typedef typename ht::hasher hasher; + typedef typename ht::key_equal key_equal; + typedef T mapped_type; + + typedef typename ht::size_type size_type; + typedef typename ht::difference_type difference_type; + typedef typename ht::const_pointer const_pointer; + typedef typename ht::const_reference const_reference; + + typedef typename ht::const_iterator const_iterator; + + const hasher hash_funct() const { + return rep.hash_funct(); + } + const key_equal key_eq() const { + return rep.key_eq(); + } + +public: + size_type size() const { + return rep.size(); + } + size_type max_size() const { + return rep.max_size(); + } + bool empty() const { + return rep.empty(); + } + + const_iterator begin() const { + return rep.begin(); + } + const_iterator end() const { + return rep.end(); + } + + const_iterator find(const key_type& key) const { + return rep.find(key); + } + + size_type count(const key_type& key) const { + return rep.count(key); + } + + std::pair<const_iterator, const_iterator> equal_range(const key_type& key) const { + return rep.equal_range(key); + } + + size_type bucket_count() const { + return rep.bucket_count(); + } + size_type max_bucket_count() const { + return rep.max_bucket_count(); + } + size_type elems_in_bucket(size_type n) const { + return rep.elems_in_bucket(n); + } +}; + +#ifdef _MSC_VER +#pragma warning(pop) +#endif diff --git a/library/cpp/on_disk/st_hash/static_hash_map.h b/library/cpp/on_disk/st_hash/static_hash_map.h new file mode 100644 index 0000000000..5dc50abd39 --- /dev/null +++ b/library/cpp/on_disk/st_hash/static_hash_map.h @@ -0,0 +1,59 @@ +#pragma once + +#include "static_hash.h" + +#include <library/cpp/deprecated/mapped_file/mapped_file.h> + +#include <util/system/filemap.h> + +template <class SH> +struct sthash_mapped_c { + typedef SH H; + typedef typename H::const_iterator const_iterator; + TMappedFile M; + H* hsh; + sthash_mapped_c() + : M() + , hsh(nullptr) + { + } + sthash_mapped_c(const char* fname, bool precharge) + : M() + , hsh(nullptr) + { + Open(fname, precharge); + } + void Open(const char* fname, bool precharge) { + M.init(fname); + if (precharge) + M.precharge(); + hsh = (H*)M.getData(); + if (M.getSize() < sizeof(H) || (ssize_t)M.getSize() 
!= hsh->end().Data - (char*)hsh) + ythrow yexception() << "Could not map hash: " << fname << " is damaged"; + } + H* operator->() { + return hsh; + } + const H* operator->() const { + return hsh; + } + H* GetSthash() { + return hsh; + } + const H* GetSthash() const { + return hsh; + } +}; + +template <class Key, class T, class Hash> +struct sthash_mapped: public sthash_mapped_c<sthash<Key, T, Hash>> { + typedef sthash<Key, T, Hash> H; + sthash_mapped(const char* fname, bool precharge) + : sthash_mapped_c<H>(fname, precharge) + { + } + sthash_mapped() + : sthash_mapped_c<H>() + { + } +}; diff --git a/library/cpp/on_disk/st_hash/sthash_iterators.h b/library/cpp/on_disk/st_hash/sthash_iterators.h new file mode 100644 index 0000000000..6a9ebdd6c3 --- /dev/null +++ b/library/cpp/on_disk/st_hash/sthash_iterators.h @@ -0,0 +1,334 @@ +#pragma once + +#include "save_stl.h" + +#include <util/system/align.h> + +/** + This file provides functionality for saving some relatively simple THashMap object + to disk in a form that can be mapped read-only (via mmap) at any address. + That saved object is accessed via pointer to sthash object (that must have + the same parameters as original THashMap object) + + If either key or value are variable-sized (i.e. contain pointers), user must + write his own instantiation of TSthashIterator (read iterator for sthash) and + TSthashWriter (write iterator for THashMap). + An example for <const char *, B> pair is in here. +**/ + +// TEmptyValue and SizeOfEx are helpers for sthash_set +struct TEmptyValue { + TEmptyValue() = default; +}; + +template <class T> +inline size_t SizeOfEx() { + return sizeof(T); +} + +template <> +inline size_t SizeOfEx<TEmptyValue>() { + return 0; +} +template <> +inline size_t SizeOfEx<const TEmptyValue>() { + return 0; +} + +template <class TKey, class TValue, class HashFcn, class EqualKey> +struct TSthashIterator { + // Implementation for simple types + typedef const TKey TKeyType; + typedef const TValue TValueType; + typedef EqualKey TKeyEqualType; + typedef HashFcn THasherType; + + const char* Data; + TSthashIterator() + : Data(nullptr) + { + } + explicit TSthashIterator(const char* data) + : Data(data) + { + } + void operator++() { + Data += GetLength(); + } + + bool operator!=(const TSthashIterator& that) const { + return Data != that.Data; + } + bool operator==(const TSthashIterator& that) const { + return Data == that.Data; + } + TKey& Key() const { + return *(TKey*)Data; + } + TValue& Value() { + return *(TValue*)(Data + sizeof(TKey)); + } + const TValue& Value() const { + return *(const TValue*)(Data + sizeof(TKey)); + } + + template <class AnotherKeyType> + bool KeyEquals(const EqualKey& eq, const AnotherKeyType& key) const { + return eq(*(TKey*)Data, key); + } + + size_t GetLength() const { + return sizeof(TKey) + SizeOfEx<TValue>(); + } +}; + +template <class Key, class Value, typename size_type_o = ui64> +struct TSthashWriter { + typedef size_type_o TSizeType; + size_t GetRecordSize(const std::pair<const Key, const Value>&) const { + return sizeof(Key) + SizeOfEx<Value>(); + } + int SaveRecord(IOutputStream* stream, const std::pair<const Key, const Value>& record) const { + stream->Write(&record.first, sizeof(Key)); + stream->Write(&record.second, SizeOfEx<Value>()); + return 0; + } +}; + +// Remember that this simplified implementation makes a copy of `key' in std::make_pair. +// It can also waste some memory on undesired alignment. 
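+// Illustrative usage sketch (commented out; "data.sthash", "hash", "key" and the
+// TKey/TValue types are placeholders, not names introduced by this change). A hash
+// container that provides save_for_st() (see save_stl.h) can be written out with
+// SaveHashToFile() from static_hash.h and then mapped back read-only through
+// static_hash_map.h:
+//
+//   SaveHashToFile(hash, "data.sthash");
+//   sthash_mapped<TKey, TValue, THash<TKey>> st("data.sthash", /*precharge=*/ false);
+//   if (st->has(key)) { /* ... use st->find(key) ... */ }
+//
+// Keys or values that contain pointers (e.g. const char*) need the specialized
+// TSthashIterator/TSthashWriter implementations below instead of the generic
+// fixed-size ones above.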
+template <class Key, typename size_type_o = ui64> +struct TSthashSetWriter: public TSthashWriter<Key, TEmptyValue, size_type_o> { + typedef TSthashWriter<Key, TEmptyValue, size_type_o> MapWriter; + size_t GetRecordSize(const Key& key) const { + return MapWriter::GetRecordSize(std::make_pair(key, TEmptyValue())); + } + int SaveRecord(IOutputStream* stream, const Key& key) const { + return MapWriter::SaveRecord(stream, std::make_pair(key, TEmptyValue())); + } +}; + +// we can't save something with pointers without additional tricks + +template <class A, class B, class HashFcn, class EqualKey> +struct TSthashIterator<A*, B, HashFcn, EqualKey> {}; + +template <class A, class B, class HashFcn, class EqualKey> +struct TSthashIterator<A, B*, HashFcn, EqualKey> {}; + +template <class A, class B, typename size_type_o> +struct TSthashWriter<A*, B*, size_type_o> {}; + +template <class A, class B, typename size_type_o> +struct TSthashWriter<A*, B, size_type_o> {}; + +template <class A, class B, typename size_type_o> +struct TSthashWriter<A, B*, size_type_o> {}; + +template <class T> +inline size_t AlignForChrKey() { + return 4; // TODO: change this (requeres rebuilt of a few existing files) +} + +template <> +inline size_t AlignForChrKey<TEmptyValue>() { + return 1; +} + +template <> +inline size_t AlignForChrKey<const TEmptyValue>() { + return AlignForChrKey<TEmptyValue>(); +} + +// !! note that for char*, physical placement of key and value is swapped +template <class TValue, class HashFcn, class EqualKey> +struct TSthashIterator<const char* const, TValue, HashFcn, EqualKey> { + typedef const TValue TValueType; + typedef const char* TKeyType; + typedef EqualKey TKeyEqualType; + typedef HashFcn THasherType; + + const char* Data; + TSthashIterator() + : Data(nullptr) + { + } + TSthashIterator(const char* data) + : Data(data) + { + } + void operator++() { + Data += GetLength(); + } + + bool operator!=(const TSthashIterator& that) const { + return Data != that.Data; + } + bool operator==(const TSthashIterator& that) const { + return Data == that.Data; + } + const char* Key() const { + return Data + SizeOfEx<TValue>(); + } + TValue& Value() { + return *(TValue*)Data; + } + const TValue& Value() const { + return *(const TValue*)Data; + } + + template <class K> + bool KeyEquals(const EqualKey& eq, const K& k) const { + return eq(Data + SizeOfEx<TValue>(), k); + } + + size_t GetLength() const { + size_t length = strlen(Data + SizeOfEx<TValue>()) + 1 + SizeOfEx<TValue>(); + length = AlignUp(length, AlignForChrKey<TValue>()); + return length; + } +}; + +template <class Value, typename size_type_o> +struct TSthashWriter<const char*, Value, size_type_o> { + typedef size_type_o TSizeType; + size_t GetRecordSize(const std::pair<const char*, const Value>& record) const { + size_t length = strlen(record.first) + 1 + SizeOfEx<Value>(); + length = AlignUp(length, AlignForChrKey<Value>()); + return length; + } + int SaveRecord(IOutputStream* stream, const std::pair<const char*, const Value>& record) const { + const char* alignBuffer = "qqqq"; + stream->Write(&record.second, SizeOfEx<Value>()); + size_t length = strlen(record.first) + 1; + stream->Write(record.first, length); + length = AlignUpSpace(length, AlignForChrKey<Value>()); + if (length) + stream->Write(alignBuffer, length); + return 0; + } +}; + +template <class TKey, class HashFcn, class EqualKey> +struct TSthashIterator<TKey, const char* const, HashFcn, EqualKey> { + typedef const TKey TKeyType; + typedef const char* TValueType; + typedef EqualKey 
TKeyEqualType; + typedef HashFcn THasherType; + + const char* Data; + TSthashIterator() + : Data(nullptr) + { + } + TSthashIterator(const char* data) + : Data(data) + { + } + void operator++() { + Data += GetLength(); + } + + bool operator!=(const TSthashIterator& that) const { + return Data != that.Data; + } + bool operator==(const TSthashIterator& that) const { + return Data == that.Data; + } + TKey& Key() { + return *(TKey*)Data; + } + const char* Value() const { + return Data + sizeof(TKey); + } + + template <class K> + bool KeyEquals(const EqualKey& eq, const K& k) const { + return eq(*(TKey*)Data, k); + } + + size_t GetLength() const { + size_t length = strlen(Data + sizeof(TKey)) + 1 + sizeof(TKey); + length = AlignUp(length, (size_t)4); + return length; + } +}; + +template <class Key, typename size_type_o> +struct TSthashWriter<Key, const char*, size_type_o> { + typedef size_type_o TSizeType; + size_t GetRecordSize(const std::pair<const Key, const char*>& record) const { + size_t length = strlen(record.second) + 1 + sizeof(Key); + length = AlignUp(length, (size_t)4); + return length; + } + int SaveRecord(IOutputStream* stream, const std::pair<const Key, const char*>& record) const { + const char* alignBuffer = "qqqq"; + stream->Write(&record.first, sizeof(Key)); + size_t length = strlen(record.second) + 1; + stream->Write(record.second, length); + length = AlignUpSpace(length, (size_t)4); + if (length) + stream->Write(alignBuffer, length); + return 0; + } +}; + +template <class HashFcn, class EqualKey> +struct TSthashIterator<const char* const, const char* const, HashFcn, EqualKey> { + typedef const char* TKeyType; + typedef const char* TValueType; + typedef EqualKey TKeyEqualType; + typedef HashFcn THasherType; + + const char* Data; + TSthashIterator() + : Data(nullptr) + { + } + TSthashIterator(const char* data) + : Data(data) + { + } + void operator++() { + Data += GetLength(); + } + + bool operator!=(const TSthashIterator& that) const { + return Data != that.Data; + } + bool operator==(const TSthashIterator& that) const { + return Data == that.Data; + } + const char* Key() const { + return Data; + } + const char* Value() const { + return Data + strlen(Data) + 1; + } + + template <class K> + bool KeyEquals(const EqualKey& eq, const K& k) const { + return eq(Data, k); + } + + size_t GetLength() const { + size_t length = strlen(Data) + 1; + length += strlen(Data + length) + 1; + return length; + } +}; + +template <typename size_type_o> +struct TSthashWriter<const char*, const char*, size_type_o> { + typedef size_type_o TSizeType; + size_t GetRecordSize(const std::pair<const char*, const char*>& record) const { + size_t size = strlen(record.first) + strlen(record.second) + 2; + return size; + } + int SaveRecord(IOutputStream* stream, const std::pair<const char*, const char*>& record) const { + stream->Write(record.first, strlen(record.first) + 1); + stream->Write(record.second, strlen(record.second) + 1); + return 0; + } +}; diff --git a/library/cpp/on_disk/st_hash/ya.make b/library/cpp/on_disk/st_hash/ya.make new file mode 100644 index 0000000000..8c6d05711c --- /dev/null +++ b/library/cpp/on_disk/st_hash/ya.make @@ -0,0 +1,15 @@ +LIBRARY() + +SRCS( + fake.cpp + save_stl.h + static_hash.h + static_hash_map.h + sthash_iterators.h +) + +PEERDIR( + library/cpp/deprecated/mapped_file +) + +END() diff --git a/library/cpp/regex/CMakeLists.darwin-x86_64.txt b/library/cpp/regex/CMakeLists.darwin-x86_64.txt index 6e2a4fabcd..877d40538b 100644 --- 
a/library/cpp/regex/CMakeLists.darwin-x86_64.txt +++ b/library/cpp/regex/CMakeLists.darwin-x86_64.txt @@ -6,6 +6,7 @@ # original buildsystem will not be accepted. +add_subdirectory(glob) add_subdirectory(hyperscan) add_subdirectory(pcre) add_subdirectory(pire) diff --git a/library/cpp/regex/CMakeLists.linux-aarch64.txt b/library/cpp/regex/CMakeLists.linux-aarch64.txt index 279390306b..84c257a819 100644 --- a/library/cpp/regex/CMakeLists.linux-aarch64.txt +++ b/library/cpp/regex/CMakeLists.linux-aarch64.txt @@ -6,5 +6,6 @@ # original buildsystem will not be accepted. +add_subdirectory(glob) add_subdirectory(pcre) add_subdirectory(pire) diff --git a/library/cpp/regex/CMakeLists.linux-x86_64.txt b/library/cpp/regex/CMakeLists.linux-x86_64.txt index 6e2a4fabcd..877d40538b 100644 --- a/library/cpp/regex/CMakeLists.linux-x86_64.txt +++ b/library/cpp/regex/CMakeLists.linux-x86_64.txt @@ -6,6 +6,7 @@ # original buildsystem will not be accepted. +add_subdirectory(glob) add_subdirectory(hyperscan) add_subdirectory(pcre) add_subdirectory(pire) diff --git a/library/cpp/regex/CMakeLists.windows-x86_64.txt b/library/cpp/regex/CMakeLists.windows-x86_64.txt index 6e2a4fabcd..877d40538b 100644 --- a/library/cpp/regex/CMakeLists.windows-x86_64.txt +++ b/library/cpp/regex/CMakeLists.windows-x86_64.txt @@ -6,6 +6,7 @@ # original buildsystem will not be accepted. +add_subdirectory(glob) add_subdirectory(hyperscan) add_subdirectory(pcre) add_subdirectory(pire) diff --git a/library/cpp/regex/glob/CMakeLists.darwin-x86_64.txt b/library/cpp/regex/glob/CMakeLists.darwin-x86_64.txt new file mode 100644 index 0000000000..ca8383e355 --- /dev/null +++ b/library/cpp/regex/glob/CMakeLists.darwin-x86_64.txt @@ -0,0 +1,19 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-regex-glob) +target_link_libraries(cpp-regex-glob PUBLIC + contrib-libs-cxxsupp + yutil + library-cpp-charset +) +target_sources(cpp-regex-glob PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/regex/glob/glob.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/regex/glob/glob_iterator.cpp +) diff --git a/library/cpp/regex/glob/CMakeLists.linux-aarch64.txt b/library/cpp/regex/glob/CMakeLists.linux-aarch64.txt new file mode 100644 index 0000000000..3953937c6d --- /dev/null +++ b/library/cpp/regex/glob/CMakeLists.linux-aarch64.txt @@ -0,0 +1,20 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. 
+ + + +add_library(cpp-regex-glob) +target_link_libraries(cpp-regex-glob PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil + library-cpp-charset +) +target_sources(cpp-regex-glob PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/regex/glob/glob.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/regex/glob/glob_iterator.cpp +) diff --git a/library/cpp/regex/glob/CMakeLists.linux-x86_64.txt b/library/cpp/regex/glob/CMakeLists.linux-x86_64.txt new file mode 100644 index 0000000000..3953937c6d --- /dev/null +++ b/library/cpp/regex/glob/CMakeLists.linux-x86_64.txt @@ -0,0 +1,20 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-regex-glob) +target_link_libraries(cpp-regex-glob PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil + library-cpp-charset +) +target_sources(cpp-regex-glob PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/regex/glob/glob.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/regex/glob/glob_iterator.cpp +) diff --git a/library/cpp/regex/glob/CMakeLists.txt b/library/cpp/regex/glob/CMakeLists.txt new file mode 100644 index 0000000000..f8b31df0c1 --- /dev/null +++ b/library/cpp/regex/glob/CMakeLists.txt @@ -0,0 +1,17 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-aarch64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + include(CMakeLists.darwin-x86_64.txt) +elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA) + include(CMakeLists.windows-x86_64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-x86_64.txt) +endif() diff --git a/library/cpp/regex/glob/CMakeLists.windows-x86_64.txt b/library/cpp/regex/glob/CMakeLists.windows-x86_64.txt new file mode 100644 index 0000000000..ca8383e355 --- /dev/null +++ b/library/cpp/regex/glob/CMakeLists.windows-x86_64.txt @@ -0,0 +1,19 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. 
+ + + +add_library(cpp-regex-glob) +target_link_libraries(cpp-regex-glob PUBLIC + contrib-libs-cxxsupp + yutil + library-cpp-charset +) +target_sources(cpp-regex-glob PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/regex/glob/glob.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/regex/glob/glob_iterator.cpp +) diff --git a/library/cpp/regex/glob/glob.cpp b/library/cpp/regex/glob/glob.cpp new file mode 100644 index 0000000000..9da058122a --- /dev/null +++ b/library/cpp/regex/glob/glob.cpp @@ -0,0 +1,921 @@ +#define FROM_IMPLEMENTATION +#include "glob_compat.h" + +#if defined(USE_INTERNAL_GLOB) +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Guido van Rossum. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <library/cpp/charset/ci_string.h> +#include <util/system/compat.h> +#include <util/folder/dirut.h> + +/* + * glob(3) -- a superset of the one defined in POSIX 1003.2. + * + * The [!...] convention to negate a range is supported (SysV, Posix, ksh). + * + * Optional extra services, controlled by flags not defined by POSIX: + * + * GLOB_QUOTE: + * Escaping convention: \ inhibits any special meaning the following + * character might have (except \ at end of string is retained). + * GLOB_MAGCHAR: + * Set in gl_flags if pattern contained a globbing character. + * GLOB_NOMAGIC: + * Same as GLOB_NOCHECK, but it will only append pattern if it did + * not contain any magic characters. [Used in csh style globbing] + * GLOB_ALTDIRFUNC: + * Use alternately specified directory access functions. + * GLOB_TILDE: + * expand ~user/foo to the /home/dir/of/user/foo + * GLOB_BRACE: + * expand {1,2}{a,b} to 1a 1b 2a 2b + * gl_matchc: + * Number of matches in the current invocation of glob. 
+ */ + +/* + * Some notes on multibyte character support: + * 1. Patterns with illegal byte sequences match nothing - even if + * GLOB_NOCHECK is specified. + * 2. Illegal byte sequences in filenames are handled by treating them as + * single-byte characters with a value of the first byte of the sequence + * cast to wchar_t. + * 3. State-dependent encodings are not currently supported. + */ + +//#include <sys/param.h> +#include <sys/stat.h> + +#include <ctype.h> +//#include <dirent.h> +#include <errno.h> +#include <limits.h> +//#include <pwd.h> +//#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#if defined(_unix_) +#include <unistd.h> +#endif +#include <wchar.h> + +#if !defined(_unix_) +// silly replacement for compilation +using uint_fast64_t = ui64; +using u_int = unsigned int; +using u_char = unsigned char; +#define ARG_MAX 256 +#define S_ISDIR(x) ((x) & _S_IFDIR) +#define S_ISLNK(x) 0 +#define lstat stat +inline bool issetugid() { return false; } +inline char *getlogin() { return 0; } +inline int getuid() { return 0; } +struct passwd { + char *pw_dir; +}; +inline passwd *getpwuid(int) { return 0; } +inline passwd *getpwnam(char *) { return 0; } +#endif + +#define __collate_load_error 1 +inline int __collate_range_cmp(int, int) { return 0; } +#undef COMMA // was defined in stroka.h +// end silly replacement + +//#include "collate.h" + +#define DOLLAR '$' +#define DOT '.' +#define EOS '\0' +#define LBRACKET '[' +#define NOT '!' +#define QUESTION '?' +#define QUOTE '\\' +#define RANGE '-' +#define RBRACKET ']' +#define SEP '/' +#define STAR '*' +#define TILDE '~' +#define UNDERSCORE '_' +#define LBRACE '{' +#define RBRACE '}' +#define SLASH '/' +#define COMMA ',' + +#ifndef DEBUG + +#define M_QUOTE 0x8000000000ULL +#define M_PROTECT 0x4000000000ULL +#define M_MASK 0xffffffffffULL +#define M_CHAR 0x00ffffffffULL + +using Char = uint_fast64_t; + +#else + +#define M_QUOTE 0x80 +#define M_PROTECT 0x40 +#define M_MASK 0xff +#define M_CHAR 0x7f + +using Char = char; + +#endif + + +#define CHAR(c) ((Char)((c)&M_CHAR)) +#define META(c) ((Char)((c)|M_QUOTE)) +#define M_ALL META('*') +#define M_END META(']') +#define M_NOT META('!') +#define M_ONE META('?') +#define M_RNG META('-') +#define M_SET META('[') +#define ismeta(c) (((c)&M_QUOTE) != 0) + + +static int compare(const void *, const void *); +static int g_Ctoc(const Char *, char *, u_int); +static int g_lstat(Char *, struct stat *, glob_t *); +static DIR *g_opendir(Char *, glob_t *); +static Char *g_strchr(Char *, wchar_t); +#ifdef notdef +static Char *g_strcat(Char *, const Char *); +#endif +static int glob0(const Char *, glob_t *, int *); +static int glob1(Char *, glob_t *, int *); +static int glob2(Char *, Char *, Char *, Char *, glob_t *, int *); +static int glob3(Char *, Char *, Char *, Char *, Char *, glob_t *, int *); +static int globextend(const Char *, glob_t *, int *); +static const Char * + globtilde(const Char *, Char *, size_t, glob_t *); +static int globexp1(const Char *, glob_t *, int *); +static int globexp2(const Char *, const Char *, glob_t *, int *, int *); +static int match(Char *, Char *, Char *); +#ifdef DEBUG +static void qprintf(const char *, Char *); +#endif + +int +glob(const char *pattern, int flags, int (*errfunc)(const char *, int), glob_t *pglob) +{ + const u_char *patnext; + int limit; + Char *bufnext, *bufend, patbuf[MAXPATHLEN], prot; + mbstate_t mbs; + wchar_t wc; + size_t clen; + + patnext = (u_char *) pattern; + if (!(flags & GLOB_APPEND)) { + pglob->gl_pathc = 0; + 
pglob->gl_pathv = NULL; + if (!(flags & GLOB_DOOFFS)) + pglob->gl_offs = 0; + } + if (flags & GLOB_LIMIT) { + limit = pglob->gl_matchc; + if (limit == 0) + limit = ARG_MAX; + } else + limit = 0; + pglob->gl_flags = flags & ~GLOB_MAGCHAR; + pglob->gl_errfunc = errfunc; + pglob->gl_matchc = 0; + + bufnext = patbuf; + bufend = bufnext + MAXPATHLEN - 1; + if (flags & GLOB_NOESCAPE) { + memset(&mbs, 0, sizeof(mbs)); + while (bufend - bufnext >= MB_CUR_MAX) { + clen = mbrtowc(&wc, (const char*)patnext, MB_LEN_MAX, &mbs); + if (clen == (size_t)-1 || clen == (size_t)-2) + return (GLOB_NOMATCH); + else if (clen == 0) + break; + *bufnext++ = wc; + patnext += clen; + } + } else { + /* Protect the quoted characters. */ + memset(&mbs, 0, sizeof(mbs)); + while (bufend - bufnext >= MB_CUR_MAX) { + if (*patnext == QUOTE) { + if (*++patnext == EOS) { + *bufnext++ = QUOTE | M_PROTECT; + continue; + } + prot = M_PROTECT; + } else + prot = 0; + clen = mbrtowc(&wc, (const char*)patnext, MB_LEN_MAX, &mbs); + if (clen == (size_t)-1 || clen == (size_t)-2) + return (GLOB_NOMATCH); + else if (clen == 0) + break; + *bufnext++ = wc | prot; + patnext += clen; + } + } + *bufnext = EOS; + + if (flags & GLOB_BRACE) + return globexp1(patbuf, pglob, &limit); + else + return glob0(patbuf, pglob, &limit); +} + +/* + * Expand recursively a glob {} pattern. When there is no more expansion + * invoke the standard globbing routine to glob the rest of the magic + * characters + */ +static int +globexp1(const Char *pattern, glob_t *pglob, int *limit) +{ + const Char* ptr = pattern; + int rv; + + /* Protect a single {}, for find(1), like csh */ + if (pattern[0] == LBRACE && pattern[1] == RBRACE && pattern[2] == EOS) + return glob0(pattern, pglob, limit); + + while ((ptr = (const Char *) g_strchr((Char *) ptr, LBRACE)) != NULL) + if (!globexp2(ptr, pattern, pglob, &rv, limit)) + return rv; + + return glob0(pattern, pglob, limit); +} + + +/* + * Recursive brace globbing helper. Tries to expand a single brace. + * If it succeeds then it invokes globexp1 with the new pattern. + * If it fails then it tries to glob the rest of the pattern and returns. + */ +static int +globexp2(const Char *ptr, const Char *pattern, glob_t *pglob, int *rv, int *limit) +{ + int i; + Char *lm, *ls; + const Char *pe, *pm, *pm1, *pl; + Char patbuf[MAXPATHLEN]; + + /* copy part up to the brace */ + for (lm = patbuf, pm = pattern; pm != ptr; *lm++ = *pm++) + continue; + *lm = EOS; + ls = lm; + + /* Find the balanced brace */ + for (i = 0, pe = ++ptr; *pe; pe++) + if (*pe == LBRACKET) { + /* Ignore everything between [] */ + for (pm = pe++; *pe != RBRACKET && *pe != EOS; pe++) + continue; + if (*pe == EOS) { + /* + * We could not find a matching RBRACKET. + * Ignore and just look for RBRACE + */ + pe = pm; + } + } + else if (*pe == LBRACE) + i++; + else if (*pe == RBRACE) { + if (i == 0) + break; + i--; + } + + /* Non matching braces; just glob the pattern */ + if (i != 0 || *pe == EOS) { + *rv = glob0(patbuf, pglob, limit); + return 0; + } + + for (i = 0, pl = pm = ptr; pm <= pe; pm++) + switch (*pm) { + case LBRACKET: + /* Ignore everything between [] */ + for (pm1 = pm++; *pm != RBRACKET && *pm != EOS; pm++) + continue; + if (*pm == EOS) { + /* + * We could not find a matching RBRACKET. 
+ * Ignore and just look for RBRACE + */ + pm = pm1; + } + break; + + case LBRACE: + i++; + break; + + case RBRACE: + if (i) { + i--; + break; + } + [[fallthrough]]; + case COMMA: + if (i && *pm == COMMA) + break; + else { + /* Append the current string */ + for (lm = ls; (pl < pm); *lm++ = *pl++) + continue; + /* + * Append the rest of the pattern after the + * closing brace + */ + for (pl = pe + 1; (*lm++ = *pl++) != EOS;) + continue; + + /* Expand the current pattern */ +#ifdef DEBUG + qprintf("globexp2:", patbuf); +#endif + *rv = globexp1(patbuf, pglob, limit); + + /* move after the comma, to the next string */ + pl = pm + 1; + } + break; + + default: + break; + } + *rv = 0; + return 0; +} + + + +/* + * expand tilde from the passwd file. + */ +static const Char * +globtilde(const Char *pattern, Char *patbuf, size_t patbuf_len, glob_t *pglob) +{ + struct passwd *pwd; + char *h; + const Char *p; + Char *b, *eb; + + if (*pattern != TILDE || !(pglob->gl_flags & GLOB_TILDE)) + return pattern; + + /* + * Copy up to the end of the string or / + */ + eb = &patbuf[patbuf_len - 1]; + for (p = pattern + 1, h = (char *) patbuf; + h < (char *)eb && *p && *p != SLASH; *h++ = (char)*p++) + continue; + + *h = EOS; + + if (((char *) patbuf)[0] == EOS) { + /* + * handle a plain ~ or ~/ by expanding $HOME first (iff + * we're not running setuid or setgid) and then trying + * the password file + */ + if (issetugid() != 0 || + (h = ::getenv("HOME")) == NULL) { + if (((h = getlogin()) != NULL && + (pwd = getpwnam(h)) != NULL) || + (pwd = getpwuid(getuid())) != NULL) + h = pwd->pw_dir; + else + return pattern; + } + } + else { + /* + * Expand a ~user + */ + if ((pwd = getpwnam((char*) patbuf)) == NULL) + return pattern; + else + h = pwd->pw_dir; + } + + /* Copy the home directory */ + for (b = patbuf; b < eb && *h; *b++ = *h++) + continue; + + /* Append the rest of the pattern */ + while (b < eb && (*b++ = *p++) != EOS) + continue; + *b = EOS; + + return patbuf; +} + + +/* + * The main glob() routine: compiles the pattern (optionally processing + * quotes), calls glob1() to do the real pattern matching, and finally + * sorts the list (unless unsorted operation is requested). Returns 0 + * if things went well, nonzero if errors occurred. + */ +static int +glob0(const Char *pattern, glob_t *pglob, int *limit) +{ + const Char *qpatnext; + int c, err, oldpathc; + Char *bufnext, patbuf[MAXPATHLEN]; + + qpatnext = globtilde(pattern, patbuf, MAXPATHLEN, pglob); + oldpathc = pglob->gl_pathc; + bufnext = patbuf; + + /* We don't need to check for buffer overflow any more. 
*/ + while ((c = (char)*qpatnext++) != EOS) { + switch (c) { + case LBRACKET: + c = (char)*qpatnext; + if (c == NOT) + ++qpatnext; + if (*qpatnext == EOS || + g_strchr((Char *) qpatnext+1, RBRACKET) == NULL) { + *bufnext++ = LBRACKET; + if (c == NOT) + --qpatnext; + break; + } + *bufnext++ = M_SET; + if (c == NOT) + *bufnext++ = M_NOT; + c = (char)*qpatnext++; + do { + *bufnext++ = CHAR(c); + if (*qpatnext == RANGE && + (c = (char)qpatnext[1]) != RBRACKET) { + *bufnext++ = M_RNG; + *bufnext++ = CHAR(c); + qpatnext += 2; + } + } while ((c = (char)*qpatnext++) != RBRACKET); + pglob->gl_flags |= GLOB_MAGCHAR; + *bufnext++ = M_END; + break; + case QUESTION: + pglob->gl_flags |= GLOB_MAGCHAR; + *bufnext++ = M_ONE; + break; + case STAR: + pglob->gl_flags |= GLOB_MAGCHAR; + /* collapse adjacent stars to one, + * to avoid exponential behavior + */ + if (bufnext == patbuf || bufnext[-1] != M_ALL) + *bufnext++ = M_ALL; + break; + default: + *bufnext++ = CHAR(c); + break; + } + } + *bufnext = EOS; +#ifdef DEBUG + qprintf("glob0:", patbuf); +#endif + + if ((err = glob1(patbuf, pglob, limit)) != 0) + return(err); + + /* + * If there was no match we are going to append the pattern + * if GLOB_NOCHECK was specified or if GLOB_NOMAGIC was specified + * and the pattern did not contain any magic characters + * GLOB_NOMAGIC is there just for compatibility with csh. + */ + if (pglob->gl_pathc == oldpathc) { + if (((pglob->gl_flags & GLOB_NOCHECK) || + ((pglob->gl_flags & GLOB_NOMAGIC) && + !(pglob->gl_flags & GLOB_MAGCHAR)))) + return(globextend(pattern, pglob, limit)); + else + return(GLOB_NOMATCH); + } + if (!(pglob->gl_flags & GLOB_NOSORT)) + qsort(pglob->gl_pathv + pglob->gl_offs + oldpathc, + pglob->gl_pathc - oldpathc, sizeof(char *), compare); + return(0); +} + +static int +compare(const void *p, const void *q) +{ + return(strcmp(*(char **)p, *(char **)q)); +} + +static int +glob1(Char *pattern, glob_t *pglob, int *limit) +{ + Char pathbuf[MAXPATHLEN]; + + /* A null pathname is invalid -- POSIX 1003.1 sect. 2.4. */ + if (*pattern == EOS) + return(0); + return(glob2(pathbuf, pathbuf, pathbuf + MAXPATHLEN - 1, + pattern, pglob, limit)); +} + +/* + * The functions glob2 and glob3 are mutually recursive; there is one level + * of recursion for each segment in the pattern that contains one or more + * meta characters. + */ +static int +glob2(Char *pathbuf, Char *pathend, Char *pathend_last, Char *pattern, glob_t *pglob, int *limit) +{ + struct stat sb; + Char *p, *q; + int anymeta; + + /* + * Loop over pattern segments until end of pattern or until + * segment with meta character found. + */ + for (anymeta = 0;;) { + if (*pattern == EOS) { /* End of pattern? */ + *pathend = EOS; + if (g_lstat(pathbuf, &sb, pglob)) + return(0); + + if (((pglob->gl_flags & GLOB_MARK) && + pathend[-1] != SEP) && (S_ISDIR(sb.st_mode))) { + if (pathend + 1 > pathend_last) + return (GLOB_ABORTED); + *pathend++ = SEP; + *pathend = EOS; + } + ++pglob->gl_matchc; + return(globextend(pathbuf, pglob, limit)); + } + + /* Find end of next segment, copy tentatively to pathend. */ + q = pathend; + p = pattern; + while (*p != EOS && *p != SEP) { + if (ismeta(*p)) + anymeta = 1; + if (q + 1 > pathend_last) + return (GLOB_ABORTED); + *q++ = *p++; + } + + if (!anymeta) { /* No expansion, do next segment. */ + pathend = q; + pattern = p; + while (*pattern == SEP) { + if (pathend + 1 > pathend_last) + return (GLOB_ABORTED); + *pathend++ = *pattern++; + } + } else /* Need expansion, recurse. 
*/ + return(glob3(pathbuf, pathend, pathend_last, pattern, p, + pglob, limit)); + } + /* NOTREACHED */ +} + +static int +glob3(Char *pathbuf, Char *pathend, Char *pathend_last, Char *pattern, Char *restpattern, glob_t *pglob, int *limit) +{ + struct dirent *dp; + DIR *dirp; + int err; + char buf[MAXPATHLEN]; + + /* + * The readdirfunc declaration can't be prototyped, because it is + * assigned, below, to two functions which are prototyped in glob.h + * and dirent.h as taking pointers to differently typed opaque + * structures. + */ + typedef struct dirent *(*readdirfunc_t)(void*); + readdirfunc_t readdirfunc; + + if (pathend > pathend_last) + return (GLOB_ABORTED); + *pathend = EOS; + errno = 0; + + if ((dirp = g_opendir(pathbuf, pglob)) == NULL) { + /* TODO: don't call for ENOENT or ENOTDIR? */ + if (pglob->gl_errfunc) { + if (g_Ctoc(pathbuf, buf, sizeof(buf))) + return (GLOB_ABORTED); + if (pglob->gl_errfunc(buf, errno) || + pglob->gl_flags & GLOB_ERR) + return (GLOB_ABORTED); + } + return(0); + } + + err = 0; + + /* Search directory for matching names. */ + if (pglob->gl_flags & GLOB_ALTDIRFUNC) + readdirfunc = pglob->gl_readdir; + else + readdirfunc = (readdirfunc_t)readdir; + while ((dp = (*readdirfunc)(dirp))) { + u_char *sc; + Char *dc; + wchar_t wc; + size_t clen; + mbstate_t mbs; + + /* Initial DOT must be matched literally. */ + if (dp->d_name[0] == DOT && *pattern != DOT) + continue; + memset(&mbs, 0, sizeof(mbs)); + dc = pathend; + sc = (u_char *) dp->d_name; + while (dc < pathend_last) { + clen = mbrtowc(&wc, (const char*)sc, MB_LEN_MAX, &mbs); + if (clen == (size_t)-1 || clen == (size_t)-2) { + wc = *sc; + clen = 1; + memset(&mbs, 0, sizeof(mbs)); + } + if ((*dc++ = wc) == EOS) + break; + sc += clen; + } + if (!match(pathend, pattern, restpattern)) { + *pathend = EOS; + continue; + } + err = glob2(pathbuf, --dc, pathend_last, restpattern, + pglob, limit); + if (err) + break; + } + + if (pglob->gl_flags & GLOB_ALTDIRFUNC) + (*pglob->gl_closedir)(dirp); + else + closedir(dirp); + return(err); +} + + +/* + * Extend the gl_pathv member of a glob_t structure to accomodate a new item, + * add the new item, and update gl_pathc. + * + * This assumes the BSD realloc, which only copies the block when its size + * crosses a power-of-two boundary; for v7 realloc, this would cause quadratic + * behavior. + * + * Return 0 if new item added, error code if memory couldn't be allocated. + * + * Invariant of the glob_t structure: + * Either gl_pathc is zero and gl_pathv is NULL; or gl_pathc > 0 and + * gl_pathv points to (gl_offs + gl_pathc + 1) items. + */ +static int +globextend(const Char *path, glob_t *pglob, int *limit) +{ + char **pathv; + int i; + size_t newsize, len; + char *copy; + const Char *p; + + if (*limit && pglob->gl_pathc > *limit) { + errno = 0; + return (GLOB_NOSPACE); + } + + newsize = sizeof(*pathv) * (2 + pglob->gl_pathc + pglob->gl_offs); + pathv = pglob->gl_pathv ? 
+ (char**)realloc((char *)pglob->gl_pathv, newsize) : + (char**)malloc(newsize); + if (pathv == NULL) { + if (pglob->gl_pathv) { + free(pglob->gl_pathv); + pglob->gl_pathv = NULL; + } + return(GLOB_NOSPACE); + } + + if (pglob->gl_pathv == NULL && pglob->gl_offs > 0) { + /* first time around -- clear initial gl_offs items */ + pathv += pglob->gl_offs; + for (i = pglob->gl_offs; --i >= 0; ) + *--pathv = NULL; + } + pglob->gl_pathv = pathv; + + for (p = path; *p++;) + continue; + len = MB_CUR_MAX * (size_t)(p - path); /* XXX overallocation */ + if ((copy = (char*)malloc(len)) != NULL) { + if (g_Ctoc(path, copy, (u_int)len)) { + free(copy); + return (GLOB_NOSPACE); + } + pathv[pglob->gl_offs + pglob->gl_pathc++] = copy; + } + pathv[pglob->gl_offs + pglob->gl_pathc] = NULL; + return(copy == NULL ? GLOB_NOSPACE : 0); +} + +/* + * pattern matching function for filenames. Each occurrence of the * + * pattern causes a recursion level. + */ +static int +match(Char *name, Char *pat, Char *patend) +{ + int ok, negate_range; + Char c, k; + + while (pat < patend) { + c = *pat++; + switch (c & M_MASK) { + case M_ALL: + if (pat == patend) + return(1); + do + if (match(name, pat, patend)) + return(1); + while (*name++ != EOS); + return(0); + case M_ONE: + if (*name++ == EOS) + return(0); + break; + case M_SET: + ok = 0; + if ((k = *name++) == EOS) + return(0); + if ((negate_range = ((*pat & M_MASK) == M_NOT)) != EOS) + ++pat; + while (((c = *pat++) & M_MASK) != M_END) + if ((*pat & M_MASK) == M_RNG) { + if (__collate_load_error ? + CHAR(c) <= CHAR(k) && CHAR(k) <= CHAR(pat[1]) : + __collate_range_cmp((int)CHAR(c), (int)CHAR(k)) <= 0 + && __collate_range_cmp((int)CHAR(k), (int)CHAR(pat[1])) <= 0 + ) + ok = 1; + pat += 2; + } else if (c == k) + ok = 1; + if (ok == negate_range) + return(0); + break; + default: + if (*name++ != c) + return(0); + break; + } + } + return(*name == EOS); +} + +/* Free allocated data belonging to a glob_t structure. */ +void +globfree(glob_t *pglob) +{ + int i; + char **pp; + + if (pglob->gl_pathv != NULL) { + pp = pglob->gl_pathv + pglob->gl_offs; + for (i = pglob->gl_pathc; i--; ++pp) + if (*pp) + free(*pp); + free(pglob->gl_pathv); + pglob->gl_pathv = NULL; + } +} + +static DIR * +g_opendir(Char *str, glob_t *pglob) +{ + char buf[MAXPATHLEN]; + + if (!*str) + strcpy(buf, "."); + else { + if (g_Ctoc(str, buf, sizeof(buf))) + return (NULL); + } + + if (pglob->gl_flags & GLOB_ALTDIRFUNC) + return (DIR*)((*pglob->gl_opendir)(buf)); + + return(opendir(buf)); +} + +static int +g_lstat(Char *fn, struct stat *sb, glob_t *pglob) +{ + char buf[MAXPATHLEN]; + + if (g_Ctoc(fn, buf, sizeof(buf))) { + errno = ENAMETOOLONG; + return (-1); + } + if (pglob->gl_flags & GLOB_ALTDIRFUNC) + return((*pglob->gl_lstat)(buf, sb)); + return(lstat(buf, sb)); +} + +static Char * +g_strchr(Char *str, wchar_t ch) +{ + do { + if (*str == ch) + return (str); + } while (*str++); + return (NULL); +} + +static int +g_Ctoc(const Char *str, char *buf, u_int len) +{ + mbstate_t mbs; + size_t clen; + + memset(&mbs, 0, sizeof(mbs)); + while ((int)len >= MB_CUR_MAX) { + clen = wcrtomb(buf, (wchar_t)*str, &mbs); + if (clen == (size_t)-1) + return (1); + if (*str == L'\0') + return (0); + str++; + buf += clen; + len -= (u_int)clen; + } + return (1); +} + +#ifdef DEBUG +static void +qprintf(const char *str, Char *s) +{ + Char *p; + + (void)printf("%s:\n", str); + for (p = s; *p; p++) + (void)printf("%c", CHAR(*p)); + (void)printf("\n"); + for (p = s; *p; p++) + (void)printf("%c", *p & M_PROTECT ? 
'"' : ' '); + (void)printf("\n"); + for (p = s; *p; p++) + (void)printf("%c", ismeta(*p) ? '_' : ' '); + (void)printf("\n"); +} +#endif +#endif diff --git a/library/cpp/regex/glob/glob_compat.h b/library/cpp/regex/glob/glob_compat.h new file mode 100644 index 0000000000..0dc518d51b --- /dev/null +++ b/library/cpp/regex/glob/glob_compat.h @@ -0,0 +1,73 @@ +#pragma once + +#include <util/system/defaults.h> + +#if defined(_MSC_VER) || defined(_bionic_) +#define USE_INTERNAL_GLOB +#endif + +#if !defined(USE_INTERNAL_GLOB) +#include <glob.h> +#else + +struct stat; +typedef struct { + int gl_pathc; /* Count of total paths so far. */ + int gl_matchc; /* Count of paths matching pattern. */ + int gl_offs; /* Reserved at beginning of gl_pathv. */ + int gl_flags; /* Copy of flags parameter to glob. */ + char** gl_pathv; /* List of paths matching pattern. */ + /* Copy of errfunc parameter to glob. */ + int (*gl_errfunc)(const char*, int); + + /* + * Alternate filesystem access methods for glob; replacement + * versions of closedir(3), readdir(3), opendir(3), stat(2) + * and lstat(2). + */ + void (*gl_closedir)(void*); + struct dirent* (*gl_readdir)(void*); + void* (*gl_opendir)(const char*); + int (*gl_lstat)(const char*, struct stat*); + int (*gl_stat)(const char*, struct stat*); +} glob_t; + +//#if __POSIX_VISIBLE >= 199209 +/* Believed to have been introduced in 1003.2-1992 */ +#define GLOB_APPEND 0x0001 /* Append to output from previous call. */ +#define GLOB_DOOFFS 0x0002 /* Use gl_offs. */ +#define GLOB_ERR 0x0004 /* Return on error. */ +#define GLOB_MARK 0x0008 /* Append / to matching directories. */ +#define GLOB_NOCHECK 0x0010 /* Return pattern itself if nothing matches. */ +#define GLOB_NOSORT 0x0020 /* Don't sort. */ +#define GLOB_NOESCAPE 0x2000 /* Disable backslash escaping. */ + +/* Error values returned by glob(3) */ +#define GLOB_NOSPACE (-1) /* Malloc call failed. */ +#define GLOB_ABORTED (-2) /* Unignored error. */ +#define GLOB_NOMATCH (-3) /* No match and GLOB_NOCHECK was not set. */ +#define GLOB_NOSYS (-4) /* Obsolete: source comptability only. */ +//#endif /* __POSIX_VISIBLE >= 199209 */ + +//#if __BSD_VISIBLE +#define GLOB_ALTDIRFUNC 0x0040 /* Use alternately specified directory funcs. */ +#define GLOB_BRACE 0x0080 /* Expand braces ala csh. */ +#define GLOB_MAGCHAR 0x0100 /* Pattern had globbing characters. */ +#define GLOB_NOMAGIC 0x0200 /* GLOB_NOCHECK without magic chars (csh). */ +#define GLOB_QUOTE 0x0400 /* Quote special chars with \. */ +#define GLOB_TILDE 0x0800 /* Expand tilde names from the passwd file. 
*/ +#define GLOB_LIMIT 0x1000 /* limit number of returned paths */ + +/* source compatibility, these are the old names */ +#define GLOB_MAXPATH GLOB_LIMIT +#define GLOB_ABEND GLOB_ABORTED +//#endif /* __BSD_VISIBLE */ + +int glob(const char*, int, int (*)(const char*, int), glob_t*); +void globfree(glob_t*); + +#endif /* _MSC_VER */ + +#if !defined(FROM_IMPLEMENTATION) +#undef USE_INTERNAL_GLOB +#endif diff --git a/library/cpp/regex/glob/glob_iterator.cpp b/library/cpp/regex/glob/glob_iterator.cpp new file mode 100644 index 0000000000..746b49f397 --- /dev/null +++ b/library/cpp/regex/glob/glob_iterator.cpp @@ -0,0 +1 @@ +#include "glob_iterator.h" diff --git a/library/cpp/regex/glob/glob_iterator.h b/library/cpp/regex/glob/glob_iterator.h new file mode 100644 index 0000000000..e25481e594 --- /dev/null +++ b/library/cpp/regex/glob/glob_iterator.h @@ -0,0 +1,36 @@ +#pragma once + +#include "glob_compat.h" + +#include <util/generic/noncopyable.h> +#include <util/generic/string.h> +#include <util/generic/yexception.h> + +class TGlobPaths : TNonCopyable { +public: + TGlobPaths(const char* pattern) { + Impl.gl_pathc = 0; + int result = glob(pattern, 0, nullptr, &Impl); + Y_ENSURE(result == 0 || result == GLOB_NOMATCH, "glob failed"); + } + + TGlobPaths(const TString& pattern) + : TGlobPaths(pattern.data()) + { + } + + ~TGlobPaths() { + globfree(&Impl); + } + + const char** begin() { + return const_cast<const char**>(Impl.gl_pathv); + } + + const char** end() { + return const_cast<const char**>(Impl.gl_pathv + Impl.gl_pathc); + } + +private: + glob_t Impl; +}; diff --git a/library/cpp/regex/glob/ya.make b/library/cpp/regex/glob/ya.make new file mode 100644 index 0000000000..9379742d99 --- /dev/null +++ b/library/cpp/regex/glob/ya.make @@ -0,0 +1,12 @@ +LIBRARY() + +SRCS( + glob.cpp + glob_iterator.cpp +) + +PEERDIR( + library/cpp/charset +) + +END() diff --git a/library/cpp/reverse_geocoder/CMakeLists.txt b/library/cpp/reverse_geocoder/CMakeLists.txt new file mode 100644 index 0000000000..621e95fdb2 --- /dev/null +++ b/library/cpp/reverse_geocoder/CMakeLists.txt @@ -0,0 +1,11 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +add_subdirectory(core) +add_subdirectory(library) +add_subdirectory(proto) diff --git a/library/cpp/reverse_geocoder/core/CMakeLists.darwin-x86_64.txt b/library/cpp/reverse_geocoder/core/CMakeLists.darwin-x86_64.txt new file mode 100644 index 0000000000..17f6e79c96 --- /dev/null +++ b/library/cpp/reverse_geocoder/core/CMakeLists.darwin-x86_64.txt @@ -0,0 +1,35 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. 
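A minimal usage sketch for the TGlobPaths wrapper declared in glob_iterator.h above; the pattern and the function name are hypothetical, and the snippet only illustrates the iteration contract (begin()/end() walking gl_pathv, with globfree() called from the destructor):

    #include <library/cpp/regex/glob/glob_iterator.h>

    #include <util/stream/output.h>

    // Print every path matching a shell-style pattern (hypothetical example).
    void PrintMatches() {
        TGlobPaths paths("/etc/*.conf"); // throws via Y_ENSURE on glob() errors other than GLOB_NOMATCH
        for (const char* path : paths) {
            Cout << path << Endl;
        }
    }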
+ + + +add_library(cpp-reverse_geocoder-core) +target_link_libraries(cpp-reverse_geocoder-core PUBLIC + contrib-libs-cxxsupp + yutil + cpp-reverse_geocoder-library + cpp-reverse_geocoder-proto + cpp-digest-crc32c +) +target_sources(cpp-reverse_geocoder-core PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/area_box.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/bbox.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/common.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/edge.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/reverse_geocoder.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/kv.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/location.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/part.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/point.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/polygon.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/region.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/geo_data/debug.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/geo_data/def.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/geo_data/geo_data.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/geo_data/map.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/geo_data/proxy.cpp +) diff --git a/library/cpp/reverse_geocoder/core/CMakeLists.linux-aarch64.txt b/library/cpp/reverse_geocoder/core/CMakeLists.linux-aarch64.txt new file mode 100644 index 0000000000..02361a0a1a --- /dev/null +++ b/library/cpp/reverse_geocoder/core/CMakeLists.linux-aarch64.txt @@ -0,0 +1,36 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. 
+ + + +add_library(cpp-reverse_geocoder-core) +target_link_libraries(cpp-reverse_geocoder-core PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil + cpp-reverse_geocoder-library + cpp-reverse_geocoder-proto + cpp-digest-crc32c +) +target_sources(cpp-reverse_geocoder-core PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/area_box.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/bbox.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/common.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/edge.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/reverse_geocoder.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/kv.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/location.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/part.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/point.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/polygon.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/region.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/geo_data/debug.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/geo_data/def.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/geo_data/geo_data.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/geo_data/map.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/geo_data/proxy.cpp +) diff --git a/library/cpp/reverse_geocoder/core/CMakeLists.linux-x86_64.txt b/library/cpp/reverse_geocoder/core/CMakeLists.linux-x86_64.txt new file mode 100644 index 0000000000..02361a0a1a --- /dev/null +++ b/library/cpp/reverse_geocoder/core/CMakeLists.linux-x86_64.txt @@ -0,0 +1,36 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. 
+ + + +add_library(cpp-reverse_geocoder-core) +target_link_libraries(cpp-reverse_geocoder-core PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil + cpp-reverse_geocoder-library + cpp-reverse_geocoder-proto + cpp-digest-crc32c +) +target_sources(cpp-reverse_geocoder-core PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/area_box.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/bbox.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/common.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/edge.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/reverse_geocoder.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/kv.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/location.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/part.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/point.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/polygon.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/region.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/geo_data/debug.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/geo_data/def.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/geo_data/geo_data.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/geo_data/map.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/geo_data/proxy.cpp +) diff --git a/library/cpp/reverse_geocoder/core/CMakeLists.txt b/library/cpp/reverse_geocoder/core/CMakeLists.txt new file mode 100644 index 0000000000..f8b31df0c1 --- /dev/null +++ b/library/cpp/reverse_geocoder/core/CMakeLists.txt @@ -0,0 +1,17 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-aarch64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + include(CMakeLists.darwin-x86_64.txt) +elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA) + include(CMakeLists.windows-x86_64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-x86_64.txt) +endif() diff --git a/library/cpp/reverse_geocoder/core/CMakeLists.windows-x86_64.txt b/library/cpp/reverse_geocoder/core/CMakeLists.windows-x86_64.txt new file mode 100644 index 0000000000..17f6e79c96 --- /dev/null +++ b/library/cpp/reverse_geocoder/core/CMakeLists.windows-x86_64.txt @@ -0,0 +1,35 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. 
+ + + +add_library(cpp-reverse_geocoder-core) +target_link_libraries(cpp-reverse_geocoder-core PUBLIC + contrib-libs-cxxsupp + yutil + cpp-reverse_geocoder-library + cpp-reverse_geocoder-proto + cpp-digest-crc32c +) +target_sources(cpp-reverse_geocoder-core PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/area_box.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/bbox.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/common.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/edge.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/reverse_geocoder.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/kv.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/location.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/part.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/point.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/polygon.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/region.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/geo_data/debug.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/geo_data/def.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/geo_data/geo_data.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/geo_data/map.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/geo_data/proxy.cpp +) diff --git a/library/cpp/reverse_geocoder/core/area_box.cpp b/library/cpp/reverse_geocoder/core/area_box.cpp new file mode 100644 index 0000000000..67038fe4f8 --- /dev/null +++ b/library/cpp/reverse_geocoder/core/area_box.cpp @@ -0,0 +1,9 @@ +#include "area_box.h" + +using namespace NReverseGeocoder; + +TRef NReverseGeocoder::LookupAreaBox(const TPoint& point) { + const TRef boxX = (point.X - NAreaBox::LowerX) / NAreaBox::DeltaX; + const TRef boxY = (point.Y - NAreaBox::LowerY) / NAreaBox::DeltaY; + return boxX * NAreaBox::NumberY + boxY; +} diff --git a/library/cpp/reverse_geocoder/core/area_box.h b/library/cpp/reverse_geocoder/core/area_box.h new file mode 100644 index 0000000000..1077a65fef --- /dev/null +++ b/library/cpp/reverse_geocoder/core/area_box.h @@ -0,0 +1,34 @@ +#pragma once + +#include "common.h" +#include "point.h" + +namespace NReverseGeocoder { + namespace NAreaBox { + const TCoordinate LowerX = ToCoordinate(-180.0); + const TCoordinate UpperX = ToCoordinate(180.0); + const TCoordinate LowerY = ToCoordinate(-90.0); + const TCoordinate UpperY = ToCoordinate(90.0); + const TCoordinate DeltaX = ToCoordinate(0.1); + const TCoordinate DeltaY = ToCoordinate(0.1); + const TCoordinate NumberX = (UpperX - LowerX) / DeltaX; + const TCoordinate NumberY = (UpperY - LowerY) / DeltaY; + const TCoordinate Number = NumberX * NumberY; + + } + + // Area of geo territory. Variable PolygonRefsOffset refers to the polygons lying inside this + // area. Geo map is divided into equal bounding boxes from (NAreaBox::LowerX, NAreaBox::LowerY) + // to (NAreaBox::UpperX, NAreaBox::UpperY) with DeltaX and DeltaY sizes. Logic of filling is in + // generator. + struct Y_PACKED TAreaBox { + TNumber PolygonRefsOffset; + TNumber PolygonRefsNumber; + }; + + static_assert(sizeof(TAreaBox) == 8, "NReverseGeocoder::TAreaBox size mismatch"); + + // Determine in wich area box in geoData is point. 
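// Illustrative worked example (hypothetical input): for a point built from lon=37.62, lat=55.75
// the fixed-point coordinates are X = 37620000, Y = 55750000, so with the constants above
//   boxX = (37620000 - (-180000000)) / 100000 = 2176
//   boxY = (55750000 - (-90000000))  / 100000 = 1457
//   ref  = boxX * NumberY + boxY = 2176 * 1800 + 1457 = 3918257
// NumberY = (UpperY - LowerY) / DeltaY = 1800 and Number = NumberX * NumberY = 6480000,
// so valid box refs lie in [0, 6480000).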
+ TRef LookupAreaBox(const TPoint& point); + +} diff --git a/library/cpp/reverse_geocoder/core/bbox.cpp b/library/cpp/reverse_geocoder/core/bbox.cpp new file mode 100644 index 0000000000..aa4258ac22 --- /dev/null +++ b/library/cpp/reverse_geocoder/core/bbox.cpp @@ -0,0 +1 @@ +#include "bbox.h" diff --git a/library/cpp/reverse_geocoder/core/bbox.h b/library/cpp/reverse_geocoder/core/bbox.h new file mode 100644 index 0000000000..e8b6e00aa3 --- /dev/null +++ b/library/cpp/reverse_geocoder/core/bbox.h @@ -0,0 +1,66 @@ +#pragma once + +#include "common.h" +#include "point.h" + +#include <util/generic/utility.h> + +namespace NReverseGeocoder { + struct Y_PACKED TBoundingBox { + TCoordinate X1; + TCoordinate Y1; + TCoordinate X2; + TCoordinate Y2; + + TBoundingBox() + : X1(0) + , Y1(0) + , X2(0) + , Y2(0) + { + } + + TBoundingBox(TCoordinate x1, TCoordinate y1, TCoordinate x2, TCoordinate y2) + : X1(x1) + , Y1(y1) + , X2(x2) + , Y2(y2) + { + } + + TBoundingBox(const TPoint* points, TNumber number) { + Init(); + for (TNumber i = 0; i < number; ++i) + Relax(points[i]); + } + + void Init() { + X1 = ToCoordinate(180.0); + Y1 = ToCoordinate(90.0); + X2 = ToCoordinate(-180.0); + Y2 = ToCoordinate(-90.0); + } + + void Relax(const TPoint& p) { + X1 = Min(X1, p.X); + Y1 = Min(Y1, p.Y); + X2 = Max(X2, p.X); + Y2 = Max(Y2, p.Y); + } + + bool HasIntersection(const TBoundingBox& r) const { + if (X1 > r.X2 || X2 < r.X1 || Y1 > r.Y2 || Y2 < r.Y1) + return false; + return true; + } + + bool Contains(const TPoint& p) const { + if (p.X < X1 || p.X > X2 || p.Y < Y1 || p.Y > Y2) + return false; + return true; + } + }; + + static_assert(sizeof(TBoundingBox) == 16, "NReverseGeocoder::TBoundingBox size mismatch"); + +} diff --git a/library/cpp/reverse_geocoder/core/common.cpp b/library/cpp/reverse_geocoder/core/common.cpp new file mode 100644 index 0000000000..67c02a20a0 --- /dev/null +++ b/library/cpp/reverse_geocoder/core/common.cpp @@ -0,0 +1 @@ +#include "common.h" diff --git a/library/cpp/reverse_geocoder/core/common.h b/library/cpp/reverse_geocoder/core/common.h new file mode 100644 index 0000000000..090407ffd9 --- /dev/null +++ b/library/cpp/reverse_geocoder/core/common.h @@ -0,0 +1,24 @@ +#pragma once + +#include <util/system/compiler.h> +#include <util/system/types.h> + +namespace NReverseGeocoder { + using TCoordinate = i32; + using TGeoId = ui64; + using TNumber = ui32; + using TRef = ui32; + using TSquare = i64; + using TVersion = ui64; + + const double EARTH_RADIUS = 6371000.0; + + inline TCoordinate ToCoordinate(double x) { + return x * 1e6; + } + + inline double ToDouble(TCoordinate x) { + return x / 1e6; + } + +} diff --git a/library/cpp/reverse_geocoder/core/edge.cpp b/library/cpp/reverse_geocoder/core/edge.cpp new file mode 100644 index 0000000000..86c6ab8535 --- /dev/null +++ b/library/cpp/reverse_geocoder/core/edge.cpp @@ -0,0 +1 @@ +#include "edge.h" diff --git a/library/cpp/reverse_geocoder/core/edge.h b/library/cpp/reverse_geocoder/core/edge.h new file mode 100644 index 0000000000..9d20928857 --- /dev/null +++ b/library/cpp/reverse_geocoder/core/edge.h @@ -0,0 +1,101 @@ +#pragma once + +#include "common.h" +#include "point.h" + +#include <util/generic/utility.h> +#include <util/system/yassert.h> + +namespace NReverseGeocoder { + // TEdge is a type, which represent polygon edge, Beg/End refers on begin/End edge points in + // geographical data. 
+ struct Y_PACKED TEdge { + TRef Beg; + TRef End; + + TEdge() + : Beg(0) + , End(0) + { + } + + TEdge(const TRef& a, const TRef& b) + : Beg(a) + , End(b) + { + } + + bool operator==(const TEdge& e) const { + return Beg == e.Beg && End == e.End; + } + + bool operator!=(const TEdge& e) const { + return Beg != e.Beg || End != e.End; + } + + bool operator<(const TEdge& e) const { + return Beg < e.Beg || (Beg == e.Beg && End < e.End); + } + + // Checks that current edge is lying lower then other edge. Both edges must have a common X + // values, otherwise the behavior is undefined. + bool Lower(const TEdge& e, const TPoint* points) const { + if (*this == e) + return false; + + const TPoint& a1 = points[Beg]; + const TPoint& a2 = points[End]; + const TPoint& b1 = points[e.Beg]; + const TPoint& b2 = points[e.End]; + + Y_ASSERT(a1.X <= a2.X && b1.X <= b2.X); + + if (a1 == b1) { + return (a2 - a1).Cross(b2 - a1) > 0; + } else if (a2 == b2) { + return (a1 - b1).Cross(b2 - b1) > 0; + } else if (b1.X >= a1.X && b1.X <= a2.X) { + return (a2 - a1).Cross(b1 - a1) > 0; + } else if (b2.X >= a1.X && b2.X <= a2.X) { + return (a2 - a1).Cross(b2 - a1) > 0; + } else if (a1.X >= b1.X && a1.X <= b2.X) { + return (a1 - b1).Cross(b2 - b1) > 0; + } else if (a2.X >= b1.X && a2.X <= b2.X) { + return (a2 - b1).Cross(b2 - b1) > 0; + } else { + return false; + } + } + + // Checks that current edge lying lower then given point. Edge and point must have a common X + // values, otherwise the behavior is undefined. + bool Lower(const TPoint& p, const TPoint* points) const { + if (Contains(p, points)) + return false; + + TPoint a = points[Beg]; + TPoint b = points[End]; + + if (a.X > b.X) + DoSwap(a, b); + + return (b - a).Cross(p - a) > 0; + } + + bool Contains(const TPoint& p, const TPoint* points) const { + TPoint a = points[Beg]; + TPoint b = points[End]; + + if (a.X > b.X) + DoSwap(a, b); + + if (p.X < a.X || p.X > b.X) + return false; + + return (b - a).Cross(p - a) == 0; + } + }; + + static_assert(sizeof(TEdge) == 8, "NReverseGeocoder::TEdge size mismatch"); + +} diff --git a/library/cpp/reverse_geocoder/core/geo_data/debug.cpp b/library/cpp/reverse_geocoder/core/geo_data/debug.cpp new file mode 100644 index 0000000000..4db0534b22 --- /dev/null +++ b/library/cpp/reverse_geocoder/core/geo_data/debug.cpp @@ -0,0 +1,74 @@ +#include "debug.h" + +#include <library/cpp/reverse_geocoder/library/log.h> +#include <library/cpp/reverse_geocoder/library/memory.h> + +using namespace NReverseGeocoder; +using namespace NGeoData; + +size_t NReverseGeocoder::NGeoData::Space(const IGeoData& g) { + size_t space = 0; + +#define GEO_BASE_DEF_VAR(TVar, Var) \ + space += sizeof(TVar); + +#define GEO_BASE_DEF_ARR(TArr, Arr) \ + space += sizeof(TNumber) + sizeof(TArr) * g.Arr##Number(); + + GEO_BASE_DEF_GEO_DATA + +#undef GEO_BASE_DEF_VAR +#undef GEO_BASE_DEF_ARR + + return space; +} + +template <typename TArr> +static float ArraySpace(TNumber number) { + return number * sizeof(TArr) * 1.0 / MB; +} + +void NReverseGeocoder::NGeoData::Show(IOutputStream& out, const IGeoData& g) { + out << "GeoData = " << NGeoData::Space(g) * 1.0 / GB << " GB" << '\n'; + +#define GEO_BASE_DEF_VAR(TVar, Var) \ + out << " GeoData." << #Var << " = " << (unsigned long long)g.Var() << '\n'; + +#define GEO_BASE_DEF_ARR(TArr, Arr) \ + out << " GeoData." 
<< #Arr << " = " \ + << g.Arr##Number() << " x " << sizeof(TArr) << " = " \ + << ArraySpace<TArr>(g.Arr##Number()) << " MB" \ + << '\n'; + + GEO_BASE_DEF_GEO_DATA + +#undef GEO_BASE_DEF_VAR +#undef GEO_BASE_DEF_ARR +} + +template <typename TArr> +static bool Equals(const TArr* a, const TArr* b, size_t count) { + return !memcmp(a, b, sizeof(TArr) * count); +} + +bool NReverseGeocoder::NGeoData::Equals(const IGeoData& a, const IGeoData& b) { +#define GEO_BASE_DEF_VAR(TVar, Var) \ + if (a.Var() != b.Var()) { \ + LogError(#Var " not equal"); \ + return false; \ + } + +#define GEO_BASE_DEF_ARR(TArr, Arr) \ + GEO_BASE_DEF_VAR(TNumber, Arr##Number); \ + if (!::Equals(a.Arr(), b.Arr(), a.Arr##Number())) { \ + LogError(#Arr " not equal"); \ + return false; \ + } + + GEO_BASE_DEF_GEO_DATA + +#undef GEO_BASE_DEF_VAR +#undef GEO_BASE_DEF_ARR + + return true; +} diff --git a/library/cpp/reverse_geocoder/core/geo_data/debug.h b/library/cpp/reverse_geocoder/core/geo_data/debug.h new file mode 100644 index 0000000000..e7a4d9029c --- /dev/null +++ b/library/cpp/reverse_geocoder/core/geo_data/debug.h @@ -0,0 +1,16 @@ +#pragma once + +#include "geo_data.h" + +#include <util/stream/output.h> + +namespace NReverseGeocoder { + namespace NGeoData { + size_t Space(const IGeoData& g); + + void Show(IOutputStream& out, const IGeoData& g); + + bool Equals(const IGeoData& a, const IGeoData& b); + + } +} diff --git a/library/cpp/reverse_geocoder/core/geo_data/def.cpp b/library/cpp/reverse_geocoder/core/geo_data/def.cpp new file mode 100644 index 0000000000..bb9f760d73 --- /dev/null +++ b/library/cpp/reverse_geocoder/core/geo_data/def.cpp @@ -0,0 +1 @@ +#include "def.h" diff --git a/library/cpp/reverse_geocoder/core/geo_data/def.h b/library/cpp/reverse_geocoder/core/geo_data/def.h new file mode 100644 index 0000000000..d3e331d873 --- /dev/null +++ b/library/cpp/reverse_geocoder/core/geo_data/def.h @@ -0,0 +1,35 @@ +#pragma once + +#include <library/cpp/reverse_geocoder/core/area_box.h> +#include <library/cpp/reverse_geocoder/core/common.h> +#include <library/cpp/reverse_geocoder/core/edge.h> +#include <library/cpp/reverse_geocoder/core/kv.h> +#include <library/cpp/reverse_geocoder/core/part.h> +#include <library/cpp/reverse_geocoder/core/point.h> +#include <library/cpp/reverse_geocoder/core/polygon.h> +#include <library/cpp/reverse_geocoder/core/region.h> + +namespace NReverseGeocoder { + const TVersion GEO_DATA_VERSION_0 = 0; + const TVersion GEO_DATA_VERSION_1 = 1; + + const TVersion GEO_DATA_CURRENT_VERSION = GEO_DATA_VERSION_1; + +// Geographical data definition. This define need for reflection in map/unmap, show, etc. 
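// A minimal sketch of how the X-macro below is consumed (TExample is a hypothetical name;
// the real consumers are IGeoData in geo_data.h and TGeoDataMap in map.h):
//   #define GEO_BASE_DEF_VAR(TVar, Var) TVar Var##_;
//   #define GEO_BASE_DEF_ARR(TArr, Arr) const TArr* Arr##_; TNumber Arr##Number_;
//   struct TExample { GEO_BASE_DEF_GEO_DATA };
//   #undef GEO_BASE_DEF_VAR
//   #undef GEO_BASE_DEF_ARR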
+#define GEO_BASE_DEF_GEO_DATA \ + GEO_BASE_DEF_VAR(TVersion, Version); \ + GEO_BASE_DEF_ARR(TPoint, Points); \ + GEO_BASE_DEF_ARR(TEdge, Edges); \ + GEO_BASE_DEF_ARR(TRef, EdgeRefs); \ + GEO_BASE_DEF_ARR(TPart, Parts); \ + GEO_BASE_DEF_ARR(TPolygon, Polygons); \ + GEO_BASE_DEF_ARR(TRef, PolygonRefs); \ + GEO_BASE_DEF_ARR(TAreaBox, Boxes); \ + GEO_BASE_DEF_ARR(char, Blobs); \ + GEO_BASE_DEF_ARR(TKv, Kvs); \ + GEO_BASE_DEF_ARR(TRegion, Regions); \ + GEO_BASE_DEF_ARR(TRawPolygon, RawPolygons); \ + GEO_BASE_DEF_ARR(TRef, RawEdgeRefs); \ + // #define GEO_BASE_DEF_GEO_DATA + +} diff --git a/library/cpp/reverse_geocoder/core/geo_data/geo_data.cpp b/library/cpp/reverse_geocoder/core/geo_data/geo_data.cpp new file mode 100644 index 0000000000..be3310b291 --- /dev/null +++ b/library/cpp/reverse_geocoder/core/geo_data/geo_data.cpp @@ -0,0 +1 @@ +#include "geo_data.h" diff --git a/library/cpp/reverse_geocoder/core/geo_data/geo_data.h b/library/cpp/reverse_geocoder/core/geo_data/geo_data.h new file mode 100644 index 0000000000..7cb76bcddc --- /dev/null +++ b/library/cpp/reverse_geocoder/core/geo_data/geo_data.h @@ -0,0 +1,24 @@ +#pragma once + +#include "def.h" + +namespace NReverseGeocoder { + class IGeoData { +#define GEO_BASE_DEF_VAR(TVar, Var) \ + virtual const TVar& Var() const = 0; + +#define GEO_BASE_DEF_ARR(TArr, Arr) \ + virtual const TArr* Arr() const = 0; \ + virtual TNumber Arr##Number() const = 0; + + public: + GEO_BASE_DEF_GEO_DATA + +#undef GEO_BASE_DEF_VAR +#undef GEO_BASE_DEF_ARR + + virtual ~IGeoData() { + } + }; + +} diff --git a/library/cpp/reverse_geocoder/core/geo_data/map.cpp b/library/cpp/reverse_geocoder/core/geo_data/map.cpp new file mode 100644 index 0000000000..312f7d7cb0 --- /dev/null +++ b/library/cpp/reverse_geocoder/core/geo_data/map.cpp @@ -0,0 +1,203 @@ +#include "map.h" + +#include <library/cpp/reverse_geocoder/library/log.h> +#include <library/cpp/reverse_geocoder/library/system.h> +#include <library/cpp/reverse_geocoder/proto/geo_data.pb.h> + +#include <library/cpp/digest/crc32c/crc32c.h> + +#include <util/generic/algorithm.h> +#include <util/generic/buffer.h> +#include <util/generic/vector.h> +#include <util/network/address.h> +#include <util/system/filemap.h> +#include <util/system/unaligned_mem.h> + +using namespace NReverseGeocoder; + +static const TNumber CRC_SIZE = 3; + +void NReverseGeocoder::TGeoDataMap::Init() { +#define GEO_BASE_DEF_VAR(TVar, Var) \ + Var##_ = TVar(); + +#define GEO_BASE_DEF_ARR(TArr, Arr) \ + Arr##_ = nullptr; \ + Arr##Number_ = 0; + + GEO_BASE_DEF_GEO_DATA + +#undef GEO_BASE_DEF_VAR +#undef GEO_BASE_DEF_ARR +} + +NReverseGeocoder::TGeoDataMap::TGeoDataMap() + : Data_(nullptr) + , Size_(0) +{ + Init(); +} + +static bool CheckMemoryConsistency(const NProto::TGeoData& g) { + TVector<std::pair<intptr_t, intptr_t>> segments; + +#define GEO_BASE_DEF_VAR(TVar, Var) \ + // undef + +#define GEO_BASE_DEF_ARR(TArr, Arr) \ + if (g.Get##Arr##Number() > 0) { \ + intptr_t const beg = g.Get##Arr(); \ + intptr_t const end = g.Get##Arr() + g.Get##Arr##Number() * sizeof(TArr); \ + segments.emplace_back(beg, end); \ + } + + GEO_BASE_DEF_GEO_DATA + +#undef GEO_BASE_DEF_VAR +#undef GEO_BASE_DEF_ARR + + Sort(segments.begin(), segments.end()); + + for (size_t i = 0; i + 1 < segments.size(); ++i) + if (segments[i].second > segments[i + 1].first) + return false; + + return true; +} + +void NReverseGeocoder::TGeoDataMap::Remap() { + Init(); + + if (!Data_) + return; + + const ui64 headerSize = ntohl(ReadUnaligned<ui64>(Data_)); + + NProto::TGeoData header; + if 
(!header.ParseFromArray(Data_ + sizeof(ui64), headerSize)) + ythrow yexception() << "Unable parse geoData header"; + + if (header.GetMagic() != SYSTEM_ENDIAN_FLAG) + ythrow yexception() << "Different endianness in geoData and host"; + + if (!CheckMemoryConsistency(header)) + ythrow yexception() << "Memory is not consistent!"; + +#define GEO_BASE_DEF_VAR(TVar, Var) \ + Var##_ = header.Get##Var(); + +#define GEO_BASE_DEF_ARR(TArr, Arr) \ + GEO_BASE_DEF_VAR(TNumber, Arr##Number); \ + if (Arr##Number() > 0) { \ + const intptr_t offset = header.Get##Arr(); \ + Arr##_ = (TArr*)(((intptr_t)Data_) + offset); \ + const ui32 hash = Crc32c(Arr##_, std::min(Arr##Number_, CRC_SIZE) * sizeof(TArr)); \ + if (hash != header.Get##Arr##Crc32()) \ + ythrow yexception() << "Wrong crc32 for " << #Arr; \ + } + + GEO_BASE_DEF_GEO_DATA + +#undef GEO_BASE_DEF_VAR +#undef GEO_BASE_DEF_ARR + + if (Version() != GEO_DATA_CURRENT_VERSION) + ythrow yexception() << "Unable use version " << Version() + << "(current version is " << GEO_DATA_CURRENT_VERSION << ")"; +} + +static size_t HeaderSize() { + NProto::TGeoData header; + header.SetMagic(std::numeric_limits<decltype(header.GetMagic())>::max()); + +#define GEO_BASE_DEF_VAR(TVar, Var) \ + header.Set##Var(std::numeric_limits<decltype(header.Get##Var())>::max()); + +#define GEO_BASE_DEF_ARR(TArr, Arr) \ + GEO_BASE_DEF_VAR(TNumber, Arr##Number); \ + header.Set##Arr(std::numeric_limits<decltype(header.Get##Arr())>::max()); \ + header.Set##Arr##Crc32(std::numeric_limits<decltype(header.Get##Arr##Crc32())>::max()); + + GEO_BASE_DEF_GEO_DATA + +#undef GEO_BASE_DEF_VAR +#undef GEO_BASE_DEF_ARR + + return header.ByteSize(); +} + +static const char* Serialize(const IGeoData& g, TBlockAllocator* allocator, size_t* size) { + size_t const preAllocatedSize = allocator->TotalAllocatedSize(); + char* data = (char*)allocator->Allocate(HeaderSize() + sizeof(ui64)); + + NProto::TGeoData header; + header.SetMagic(SYSTEM_ENDIAN_FLAG); + +#define GEO_BASE_DEF_VAR(TVar, Var) \ + header.Set##Var(g.Var()); + +#define GEO_BASE_DEF_ARR(TArr, Arr) \ + GEO_BASE_DEF_VAR(TNumber, Arr##Number); \ + if (g.Arr##Number() > 0) { \ + TArr* arr = (TArr*)allocator->Allocate(sizeof(TArr) * g.Arr##Number()); \ + memcpy(arr, g.Arr(), sizeof(TArr) * g.Arr##Number()); \ + header.Set##Arr((ui64)(((intptr_t)arr) - ((intptr_t)data))); \ + header.Set##Arr##Crc32(Crc32c(arr, std::min(g.Arr##Number(), CRC_SIZE) * sizeof(TArr))); \ + }; + + GEO_BASE_DEF_GEO_DATA + +#undef GEO_BASE_DEF_VAR +#undef GEO_BASE_DEF_ARR + + const auto str = header.SerializeAsString(); + WriteUnaligned<ui64>(data, (ui64)htonl(str.size())); + memcpy(data + sizeof(ui64), str.data(), str.size()); + + if (size) + *size = allocator->TotalAllocatedSize() - preAllocatedSize; + + return data; +} + +static size_t TotalByteSize(const IGeoData& g) { + size_t total_size = TBlockAllocator::AllocateSize(HeaderSize() + sizeof(ui64)); + +#define GEO_BASE_DEF_VAR(TVar, Var) \ + // undef + +#define GEO_BASE_DEF_ARR(TArr, Arr) \ + total_size += TBlockAllocator::AllocateSize(sizeof(TArr) * g.Arr##Number()); + + GEO_BASE_DEF_GEO_DATA + +#undef GEO_BASE_DEF_VAR +#undef GEO_BASE_DEF_ARR + + return total_size; +} + +NReverseGeocoder::TGeoDataMap::TGeoDataMap(const IGeoData& geoData, TBlockAllocator* allocator) + : TGeoDataMap() +{ + Data_ = Serialize(geoData, allocator, &Size_); + Remap(); +} + +void NReverseGeocoder::TGeoDataMap::SerializeToFile(const TString& path, const IGeoData& data) { + TBlob data_blob = SerializeToBlob(data); + + TFile file(path, CreateAlways | 
RdWr); + file.Write(data_blob.Data(), data_blob.Length()); +} + +TBlob NReverseGeocoder::TGeoDataMap::SerializeToBlob(const IGeoData& data) { + TBuffer buf; + buf.Resize(TotalByteSize(data)); + memset(buf.data(), 0, buf.size()); + + TBlockAllocator allocator(buf.Data(), buf.Size()); + TGeoDataMap(data, &allocator); + + return TBlob::FromBuffer(buf); +} diff --git a/library/cpp/reverse_geocoder/core/geo_data/map.h b/library/cpp/reverse_geocoder/core/geo_data/map.h new file mode 100644 index 0000000000..e466bd912e --- /dev/null +++ b/library/cpp/reverse_geocoder/core/geo_data/map.h @@ -0,0 +1,89 @@ +#pragma once + +#include "geo_data.h" + +#include <library/cpp/reverse_geocoder/library/block_allocator.h> + +#include <util/memory/blob.h> + +namespace NReverseGeocoder { + class TGeoDataMap: public IGeoData, public TNonCopyable { +#define GEO_BASE_DEF_VAR(TVar, Var) \ +public: \ + const TVar& Var() const override { \ + return Var##_; \ + } \ + \ +private: \ + TVar Var##_; + +#define GEO_BASE_DEF_ARR(TArr, Arr) \ +public: \ + const TArr* Arr() const override { \ + return Arr##_; \ + } \ + TNumber Arr##Number() const override { \ + return Arr##Number_; \ + } \ + \ +private: \ + TNumber Arr##Number_; \ + const TArr* Arr##_; + + GEO_BASE_DEF_GEO_DATA + +#undef GEO_BASE_DEF_VAR +#undef GEO_BASE_DEF_ARR + + public: + TGeoDataMap(); + + static void SerializeToFile(const TString& path, const IGeoData& data); + + static TBlob SerializeToBlob(const IGeoData& data); + + TGeoDataMap(const IGeoData& data, TBlockAllocator* allocator); + + TGeoDataMap(const char* data, size_t size) + : TGeoDataMap() + { + Data_ = data; + Size_ = size; + Remap(); + } + + TGeoDataMap(TGeoDataMap&& dat) + : TGeoDataMap() + { + DoSwap(Data_, dat.Data_); + DoSwap(Size_, dat.Size_); + Remap(); + dat.Remap(); + } + + TGeoDataMap& operator=(TGeoDataMap&& dat) { + DoSwap(Data_, dat.Data_); + DoSwap(Size_, dat.Size_); + Remap(); + dat.Remap(); + return *this; + } + + const char* Data() const { + return Data_; + } + + size_t Size() const { + return Size_; + } + + private: + void Init(); + + void Remap(); + + const char* Data_; + size_t Size_; + }; + +} diff --git a/library/cpp/reverse_geocoder/core/geo_data/proxy.cpp b/library/cpp/reverse_geocoder/core/geo_data/proxy.cpp new file mode 100644 index 0000000000..5ff2d13783 --- /dev/null +++ b/library/cpp/reverse_geocoder/core/geo_data/proxy.cpp @@ -0,0 +1 @@ +#include "proxy.h" diff --git a/library/cpp/reverse_geocoder/core/geo_data/proxy.h b/library/cpp/reverse_geocoder/core/geo_data/proxy.h new file mode 100644 index 0000000000..fecb9fc7cf --- /dev/null +++ b/library/cpp/reverse_geocoder/core/geo_data/proxy.h @@ -0,0 +1,68 @@ +#pragma once + +#include "geo_data.h" +#include "map.h" + +#include <util/generic/ptr.h> +#include <util/system/filemap.h> + +namespace NReverseGeocoder { + class IGeoDataProxy { + public: + virtual const IGeoData* GeoData() const = 0; + + virtual ~IGeoDataProxy() { + } + }; + + using TGeoDataProxyPtr = THolder<IGeoDataProxy>; + + class TGeoDataMapProxy: public IGeoDataProxy, public TNonCopyable { + public: + explicit TGeoDataMapProxy(const char* path) + : MemFile_(path) + { + MemFile_.Map(0, MemFile_.Length()); + GeoData_ = TGeoDataMap((const char*)MemFile_.Ptr(), MemFile_.MappedSize()); + } + + const IGeoData* GeoData() const override { + return &GeoData_; + } + + private: + TFileMap MemFile_; + TGeoDataMap GeoData_; + }; + + class TGeoDataWrapper: public IGeoDataProxy, public TNonCopyable { + public: + explicit TGeoDataWrapper(const IGeoData& g) + : GeoData_(&g) 
+ { + } + + const IGeoData* GeoData() const override { + return GeoData_; + } + + private: + const IGeoData* GeoData_; + }; + + class TGeoDataRawProxy: public IGeoDataProxy, public TNonCopyable { + public: + TGeoDataRawProxy(const char* data, size_t dataSize) + : GeoData_(data, dataSize) + { + } + + const IGeoData* GeoData() const override { + return &GeoData_; + } + + private: + TGeoDataMap GeoData_; + }; + +} diff --git a/library/cpp/reverse_geocoder/core/kv.cpp b/library/cpp/reverse_geocoder/core/kv.cpp new file mode 100644 index 0000000000..a48e9c947e --- /dev/null +++ b/library/cpp/reverse_geocoder/core/kv.cpp @@ -0,0 +1 @@ +#include "kv.h" diff --git a/library/cpp/reverse_geocoder/core/kv.h b/library/cpp/reverse_geocoder/core/kv.h new file mode 100644 index 0000000000..639c21de52 --- /dev/null +++ b/library/cpp/reverse_geocoder/core/kv.h @@ -0,0 +1,13 @@ +#pragma once + +#include "common.h" + +namespace NReverseGeocoder { + // k and v is offsets on blobs in geographical data blobs array. See geo_data.h + // for details. + struct TKv { + TNumber K; + TNumber V; + }; + +} diff --git a/library/cpp/reverse_geocoder/core/location.cpp b/library/cpp/reverse_geocoder/core/location.cpp new file mode 100644 index 0000000000..b2d2f54d12 --- /dev/null +++ b/library/cpp/reverse_geocoder/core/location.cpp @@ -0,0 +1 @@ +#include "location.h" diff --git a/library/cpp/reverse_geocoder/core/location.h b/library/cpp/reverse_geocoder/core/location.h new file mode 100644 index 0000000000..5aa3198684 --- /dev/null +++ b/library/cpp/reverse_geocoder/core/location.h @@ -0,0 +1,21 @@ +#pragma once + +namespace NReverseGeocoder { + struct TLocation { + double Lon; + double Lat; + + TLocation() + : Lon(0) + , Lat(0) + { + } + + TLocation(double lon, double lat) + : Lon(lon) + , Lat(lat) + { + } + }; + +} diff --git a/library/cpp/reverse_geocoder/core/part.cpp b/library/cpp/reverse_geocoder/core/part.cpp new file mode 100644 index 0000000000..c973d2171a --- /dev/null +++ b/library/cpp/reverse_geocoder/core/part.cpp @@ -0,0 +1,29 @@ +#include "part.h" + +#include <library/cpp/reverse_geocoder/library/unaligned_iter.h> + +#include <util/generic/algorithm.h> + +using namespace NReverseGeocoder; + +bool NReverseGeocoder::TPart::Contains(const TPoint& point, TNumber edgeRefsNumber, const TRef* edgeRefs, + const TEdge* edges, const TPoint* points) const { + auto edgeRefsBegin = UnalignedIter(edgeRefs) + EdgeRefsOffset; + auto edgeRefsEnd = edgeRefsBegin + edgeRefsNumber; + + // Find lower bound edge, which lying below given point. + auto cmp = [&](const TRef& e, const TPoint& p) { + return edges[e].Lower(p, points); + }; + + auto edgeRef = LowerBound(edgeRefsBegin, edgeRefsEnd, point, cmp); + + if (edgeRef == edgeRefsEnd) + return false; + + if (edges[*edgeRef].Contains(point, points)) + return true; + + // If the point is inside of the polygon then it will intersect the edge an odd number of times. + return (edgeRef - edgeRefsBegin) % 2 == 1; +} diff --git a/library/cpp/reverse_geocoder/core/part.h b/library/cpp/reverse_geocoder/core/part.h new file mode 100644 index 0000000000..9b24fee96f --- /dev/null +++ b/library/cpp/reverse_geocoder/core/part.h @@ -0,0 +1,26 @@ +#pragma once + +#include "common.h" +#include "edge.h" +#include "point.h" + +namespace NReverseGeocoder { + // TPart contains version of persistent scanline. Parts lying in geofraphical data parts array, + // ordered by Coordinate for each polygon. Variable EdgeRefsOffset refers on EdgeRefs array for + // this part. 
For optimal usage of memory, part does not contain "EdgeRefsNumber" variable, because + // it's can be computed as parts[i + 1].EdgeRefsOffset - parts[i].EdgeRefsOffset for every part + // in geographical data. Especially for this, added fake part into IGeoData with correct + // EdgeRefsOffset. Refs in EdgeRefs are in increasing order for each part. It is necessary to + // quickly determine how many edges is under the point. See generator/ for details. + struct Y_PACKED TPart { + TCoordinate Coordinate; + TNumber EdgeRefsOffset; + + // Checks point lying under odd numbers of edges or on edge. + bool Contains(const TPoint& point, TNumber edgeRefsNumber, const TRef* edgeRefs, + const TEdge* edges, const TPoint* points) const; + }; + + static_assert(sizeof(TPart) == 8, "NReverseGeocoder::TPart size mismatch"); + +} diff --git a/library/cpp/reverse_geocoder/core/point.cpp b/library/cpp/reverse_geocoder/core/point.cpp new file mode 100644 index 0000000000..396e27e596 --- /dev/null +++ b/library/cpp/reverse_geocoder/core/point.cpp @@ -0,0 +1 @@ +#include "point.h" diff --git a/library/cpp/reverse_geocoder/core/point.h b/library/cpp/reverse_geocoder/core/point.h new file mode 100644 index 0000000000..75f1dfc1b4 --- /dev/null +++ b/library/cpp/reverse_geocoder/core/point.h @@ -0,0 +1,52 @@ +#pragma once + +#include "common.h" +#include "location.h" + +namespace NReverseGeocoder { + struct Y_PACKED TPoint { + TCoordinate X; + TCoordinate Y; + + TPoint() + : X(0) + , Y(0) + { + } + + TPoint(const TCoordinate& x1, const TCoordinate& y1) + : X(x1) + , Y(y1) + { + } + + explicit TPoint(const TLocation& l) + : X(ToCoordinate(l.Lon)) + , Y(ToCoordinate(l.Lat)) + { + } + + TPoint operator-(const TPoint& p) const { + return TPoint(X - p.X, Y - p.Y); + } + + bool operator==(const TPoint& b) const { + return X == b.X && Y == b.Y; + } + + bool operator!=(const TPoint& b) const { + return X != b.X || Y != b.Y; + } + + bool operator<(const TPoint& b) const { + return X < b.X || (X == b.X && Y < b.Y); + } + + TSquare Cross(const TPoint& p) const { + return 1ll * X * p.Y - 1ll * Y * p.X; + } + }; + + static_assert(sizeof(TPoint) == 8, "NReverseGeocoder::TPoint size mismatch"); + +} diff --git a/library/cpp/reverse_geocoder/core/polygon.cpp b/library/cpp/reverse_geocoder/core/polygon.cpp new file mode 100644 index 0000000000..2baac2d229 --- /dev/null +++ b/library/cpp/reverse_geocoder/core/polygon.cpp @@ -0,0 +1,91 @@ +#include "polygon.h" + +#include <util/generic/algorithm.h> + +using namespace NReverseGeocoder; + +static bool Check(const TPart* part, const TPoint& point, const TRef* edgeRefs, + const TEdge* edges, const TPoint* points) { + const TNumber edgeRefsNumber = (part + 1)->EdgeRefsOffset - part->EdgeRefsOffset; + return part->Contains(point, edgeRefsNumber, edgeRefs, edges, points); +} + +bool NReverseGeocoder::TPolygon::Contains(const TPoint& point, const TPart* parts, const TRef* edgeRefs, + const TEdge* edges, const TPoint* points) const { + if (!Bbox.Contains(point)) + return false; + + parts += PartsOffset; + const TPart* partsEnd = parts + PartsNumber; + + // Find lower bound part, which can contains given point. 
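// (Parts of a polygon are ordered by Coordinate, so this LowerBound selects the scanline
// "slab" whose X range covers point.X; TPart::Contains then binary-searches that slab's
// edge refs to count how many edges lie below the point.)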
+ const TPart* part = LowerBound(parts, partsEnd, point, [&](const TPart& a, const TPoint& b) { + return a.Coordinate < b.X; + }); + + if (part->Coordinate > point.X) { + if (part == parts) + return false; + --part; + } + + if (point.X < part->Coordinate || point.X > (part + 1)->Coordinate) + return false; + + if (point.X == part->Coordinate) + if (part != parts && Check(part - 1, point, edgeRefs, edges, points)) + return true; + + return Check(part, point, edgeRefs, edges, points); +} + +bool NReverseGeocoder::TPolygonBase::Better(const TPolygonBase& p, const TRegion* regions, + TNumber regionsNumber) const { + if (Square < p.Square) + return true; + + if (Square == p.Square) { + const TRegion* begin = regions; + const TRegion* end = regions + regionsNumber; + + const TRegion* r1 = LowerBound(begin, end, TGeoId(RegionId)); + const TRegion* r2 = LowerBound(begin, end, TGeoId(p.RegionId)); + + if (r1 == end || r1->RegionId != RegionId) + return false; + + if (r2 == end || r2->RegionId != p.RegionId) + return false; + + return r1->Better(*r2); + } + + return false; +} + +bool NReverseGeocoder::TRawPolygon::Contains(const TPoint& point, const TRef* edgeRefs, const TEdge* edges, + const TPoint* points) const { + if (!Bbox.Contains(point)) + return false; + + edgeRefs += EdgeRefsOffset; + + TNumber intersections = 0; + for (TNumber i = 0; i < EdgeRefsNumber; ++i) { + const TEdge& e = edges[edgeRefs[i]]; + + if (e.Contains(point, points)) + return true; + + TPoint a = points[e.Beg]; + TPoint b = points[e.End]; + + if (a.X > b.X) + DoSwap(a, b); + + if (a.X < point.X && b.X >= point.X && e.Lower(point, points)) + ++intersections; + } + + return intersections % 2 == 1; +} diff --git a/library/cpp/reverse_geocoder/core/polygon.h b/library/cpp/reverse_geocoder/core/polygon.h new file mode 100644 index 0000000000..065bba1e38 --- /dev/null +++ b/library/cpp/reverse_geocoder/core/polygon.h @@ -0,0 +1,73 @@ +#pragma once + +#include "bbox.h" +#include "common.h" +#include "edge.h" +#include "part.h" +#include "point.h" +#include "region.h" + +namespace NReverseGeocoder { +#pragma pack(push, 1) + + struct TPolygonBase { + enum EType { + TYPE_UNKNOWN = 0, + TYPE_INNER = 1, + TYPE_OUTER = 2, + }; + + // If TYPE_INNER and polygon contains given point, this means that region with RegionId + // does not contains point. + EType Type; + + ui32 Unused1; + + // Geographical data indetifiers. + TGeoId RegionId; + TGeoId PolygonId; + + // Rectangle in which lies that polygon. + TBoundingBox Bbox; + + // Square of polygon. Need for determine which polygon is better. See better member function. + TSquare Square; + + // Total points number of given polygon. + TNumber PointsNumber; + + // Check that this polygon better then given polygon, which means that this polygons lying + // deeper then given in polygons hierarchy. + bool Better(const TPolygonBase& p, const TRegion* regions, TNumber regionsNumber) const; + }; + + // Polygon is a representation of persistent scanline data structure. + struct TPolygon: public TPolygonBase { + // Versions of persistent scanline. + TNumber PartsOffset; + TNumber PartsNumber; + ui32 Unused2; + + // Fast point in polygon test using persistent scanline. You can see how this data structure + // generated in generator/. 
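// (With this layout a single polygon test costs two binary searches: LowerBound over Parts
// by X in TPolygon::Contains, then LowerBound over that part's EdgeRefs in TPart::Contains,
// i.e. logarithmic in the number of parts and edges of the polygon.)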
+ bool Contains(const TPoint& point, const TPart* parts, const TRef* edgeRefs, + const TEdge* edges, const TPoint* points) const; + }; + + static_assert(sizeof(TPolygon) == 64, "NReverseGeocoder::TPolygon size mismatch"); + + // Raw polygon is a polygon representation for slow tests. + struct TRawPolygon: public TPolygonBase { + // Raw polygon edge refs. + TNumber EdgeRefsOffset; + TNumber EdgeRefsNumber; + ui32 Unused2; + + bool Contains(const TPoint& point, const TRef* edgeRefs, const TEdge* edges, + const TPoint* points) const; + }; + + static_assert(sizeof(TRawPolygon) == 64, "NReverseGeocoder::TRawPolygon size mismatch"); + +#pragma pack(pop) +} diff --git a/library/cpp/reverse_geocoder/core/region.cpp b/library/cpp/reverse_geocoder/core/region.cpp new file mode 100644 index 0000000000..62b4acd0a1 --- /dev/null +++ b/library/cpp/reverse_geocoder/core/region.cpp @@ -0,0 +1 @@ +#include "region.h" diff --git a/library/cpp/reverse_geocoder/core/region.h b/library/cpp/reverse_geocoder/core/region.h new file mode 100644 index 0000000000..4b010c7103 --- /dev/null +++ b/library/cpp/reverse_geocoder/core/region.h @@ -0,0 +1,37 @@ +#pragma once + +#include "common.h" + +namespace NReverseGeocoder { + struct Y_PACKED TRegion { + TGeoId RegionId; + TNumber KvsOffset; + TNumber KvsNumber; + TSquare Square; + TNumber PolygonsNumber; + ui32 Unused; + + bool operator==(const TRegion& r) const { + return RegionId == r.RegionId; + } + + bool operator<(const TRegion& r) const { + return RegionId < r.RegionId; + } + + bool operator<(const TGeoId& r) const { + return RegionId < r; + } + + friend bool operator<(const TGeoId& regionId, const TRegion& r) { + return regionId < r.RegionId; + } + + bool Better(const TRegion& r) const { + return Square < r.Square; + } + }; + + static_assert(sizeof(TRegion) == 32, "NReverseGeocoder::TRegion size mismatch"); + +} diff --git a/library/cpp/reverse_geocoder/core/reverse_geocoder.cpp b/library/cpp/reverse_geocoder/core/reverse_geocoder.cpp new file mode 100644 index 0000000000..d73e4f2648 --- /dev/null +++ b/library/cpp/reverse_geocoder/core/reverse_geocoder.cpp @@ -0,0 +1,182 @@ +#include "reverse_geocoder.h" +#include "geo_data/geo_data.h" + +#include <library/cpp/reverse_geocoder/library/unaligned_iter.h> + +#include <util/generic/algorithm.h> +#include <util/system/unaligned_mem.h> + +using namespace NReverseGeocoder; + +static bool PolygonContains(const TPolygon& p, const TPoint& point, const IGeoData& geoData) { + const TPart* parts = geoData.Parts(); + const TRef* edgeRefs = geoData.EdgeRefs(); + const TEdge* edges = geoData.Edges(); + const TPoint* points = geoData.Points(); + return p.Contains(point, parts, edgeRefs, edges, points); +} + +template <typename TAnswer> +static void UpdateAnswer(const TAnswer** answer, const TAnswer& polygon, + const IGeoData& geoData) { + if (!*answer) { + *answer = &polygon; + } else { + const TRegion* regions = geoData.Regions(); + const TNumber regionsNumber = geoData.RegionsNumber(); + if (!(*answer)->Better(polygon, regions, regionsNumber)) + *answer = &polygon; + } +} + +static void SortDebug(TReverseGeocoder::TDebug* debug, const IGeoData& geoData) { + const TRegion* regions = geoData.Regions(); + const TNumber regionsNumber = geoData.RegionsNumber(); + + auto cmp = [&](const TGeoId& a, const TGeoId& b) { + const TRegion* r1 = LowerBound(regions, regions + regionsNumber, a); + const TRegion* r2 = LowerBound(regions, regions + regionsNumber, b); + return r1->Better(*r2); + }; + + Sort(debug->begin(), debug->end(), cmp); 
+} + +TGeoId NReverseGeocoder::TReverseGeocoder::Lookup(const TLocation& location, TDebug* debug) const { + const IGeoData& geoData = *GeoDataProxy_->GeoData(); + + if (debug) + debug->clear(); + + const TPoint point(location); + const TRef boxRef = LookupAreaBox(point); + + if (boxRef >= geoData.BoxesNumber()) + return UNKNOWN_GEO_ID; + + const TNumber refsOffset = geoData.Boxes()[boxRef].PolygonRefsOffset; + const TNumber refsNumber = geoData.Boxes()[boxRef].PolygonRefsNumber; + + const TPolygon* answer = nullptr; + + const TPolygon* p = geoData.Polygons(); + const auto refsBegin = UnalignedIter(geoData.PolygonRefs()) + refsOffset; + const auto refsEnd = refsBegin + refsNumber; + + for (auto iterL = refsBegin, iterR = refsBegin; iterL < refsEnd; iterL = iterR) { + iterR = iterL + 1; + + if (PolygonContains(p[*iterL], point, geoData)) { + if (p[*iterL].Type == TPolygon::TYPE_INNER) { + // All polygons with same RegionId must be skipped if polygon is inner. + // In geoData small inner polygons stored before big outer polygons. + while (iterR < refsEnd && p[*iterL].RegionId == p[*iterR].RegionId) + ++iterR; + + } else { + UpdateAnswer(&answer, p[*iterL], geoData); + + if (debug) + debug->push_back(p[*iterL].RegionId); + + while (iterR < refsEnd && p[*iterL].RegionId == p[*iterR].RegionId) + ++iterR; + } + } + } + + if (debug) + SortDebug(debug, geoData); + + return answer ? answer->RegionId : UNKNOWN_GEO_ID; +} + +TGeoId NReverseGeocoder::TReverseGeocoder::RawLookup(const TLocation& location, TDebug* debug) const { + const IGeoData& geoData = *GeoDataProxy_->GeoData(); + + if (debug) + debug->clear(); + + const TPoint point(location); + + const TRawPolygon* borders = geoData.RawPolygons(); + const TNumber bordersNumber = geoData.RawPolygonsNumber(); + + const TRawPolygon* answer = nullptr; + + TNumber i = 0; + while (i < bordersNumber) { + if (borders[i].Contains(point, geoData.RawEdgeRefs(), geoData.Edges(), geoData.Points())) { + if (borders[i].Type == TRawPolygon::TYPE_INNER) { + TNumber j = i + 1; + while (j < bordersNumber && borders[i].RegionId == borders[j].RegionId) + ++j; + + i = j; + + } else { + UpdateAnswer(&answer, borders[i], geoData); + + if (debug) + debug->push_back(borders[i].RegionId); + + TNumber j = i + 1; + while (j < bordersNumber && borders[i].RegionId == borders[j].RegionId) + ++j; + + i = j; + } + } else { + ++i; + } + } + + if (debug) + SortDebug(debug, geoData); + + return answer ? 
answer->RegionId : UNKNOWN_GEO_ID; +} + +bool NReverseGeocoder::TReverseGeocoder::EachKv(TGeoId regionId, TKvCallback callback) const { + const IGeoData& g = *GeoDataProxy_->GeoData(); + + const TRegion* begin = g.Regions(); + const TRegion* end = begin + g.RegionsNumber(); + + const TRegion* region = LowerBound(begin, end, regionId); + + if (region == end || region->RegionId != regionId) + return false; + + const TKv* kvs = g.Kvs() + region->KvsOffset; + const char* blobs = g.Blobs(); + + for (TNumber i = 0; i < region->KvsNumber; ++i) { + const char* k = blobs + kvs[i].K; + const char* v = blobs + kvs[i].V; + callback(k, v); + } + + return true; +} + +void NReverseGeocoder::TReverseGeocoder::EachPolygon(TPolygonCallback callback) const { + const IGeoData& g = *GeoDataProxy_->GeoData(); + + for (TNumber i = 0; i < g.PolygonsNumber(); ++i) + callback(g.Polygons()[i]); +} + +void NReverseGeocoder::TReverseGeocoder::EachPart(const TPolygon& polygon, TPartCallback callback) const { + const IGeoData& g = *GeoDataProxy_->GeoData(); + + const TNumber partsOffset = polygon.PartsOffset; + const TNumber partsNumber = polygon.PartsNumber; + + for (TNumber i = partsOffset; i < partsOffset + partsNumber; ++i) { + const TPart& part = g.Parts()[i]; + const TPart& npart = g.Parts()[i + 1]; + const TNumber edgeRefsNumber = npart.EdgeRefsOffset - part.EdgeRefsOffset; + callback(part, edgeRefsNumber); + } +} diff --git a/library/cpp/reverse_geocoder/core/reverse_geocoder.h b/library/cpp/reverse_geocoder/core/reverse_geocoder.h new file mode 100644 index 0000000000..c74eddb40e --- /dev/null +++ b/library/cpp/reverse_geocoder/core/reverse_geocoder.h @@ -0,0 +1,73 @@ +#pragma once + +#include "common.h" +#include "geo_data/geo_data.h" +#include "geo_data/proxy.h" + +#include <util/generic/noncopyable.h> +#include <util/generic/vector.h> + +#include <functional> + +namespace NReverseGeocoder { + const TGeoId UNKNOWN_GEO_ID = static_cast<TGeoId>(-1); + + // NOTE: Be careful! It's work fine and fast on real world dataset. + // But in theory it's can spent O(n^2) memory (on real world dataset it's just 6n). + // Point in polygon test will be O(log n) always. Memory spent will be O(n) in future! 
+ class TReverseGeocoder: public TNonCopyable { + public: + using TDebug = TVector<TGeoId>; + using TKvCallback = std::function<void(const char*, const char*)>; + using TPolygonCallback = std::function<void(const TPolygon&)>; + using TPartCallback = std::function<void(const TPart&, TNumber)>; + + TReverseGeocoder() + : GeoDataProxy_() + { + } + + TReverseGeocoder(TReverseGeocoder&& g) + : GeoDataProxy_() + { + DoSwap(GeoDataProxy_, g.GeoDataProxy_); + } + + TReverseGeocoder& operator=(TReverseGeocoder&& g) { + DoSwap(GeoDataProxy_, g.GeoDataProxy_); + return *this; + } + + explicit TReverseGeocoder(const char* path) + : GeoDataProxy_(new TGeoDataMapProxy(path)) + { + } + + explicit TReverseGeocoder(const IGeoData& geoData) + : GeoDataProxy_(new TGeoDataWrapper(geoData)) + { + } + + TReverseGeocoder(const char* data, size_t dataSize) + : GeoDataProxy_(new TGeoDataRawProxy(data, dataSize)) + { + } + + TGeoId Lookup(const TLocation& location, TDebug* debug = nullptr) const; + + TGeoId RawLookup(const TLocation& location, TDebug* debug = nullptr) const; + + bool EachKv(TGeoId regionId, TKvCallback callback) const; + + void EachPolygon(TPolygonCallback callback) const; + + void EachPart(const TPolygon& polygon, TPartCallback callback) const; + + const IGeoData& GeoData() const { + return *GeoDataProxy_->GeoData(); + } + + private: + TGeoDataProxyPtr GeoDataProxy_; + }; +} diff --git a/library/cpp/reverse_geocoder/core/ya.make b/library/cpp/reverse_geocoder/core/ya.make new file mode 100644 index 0000000000..9f7dc67464 --- /dev/null +++ b/library/cpp/reverse_geocoder/core/ya.make @@ -0,0 +1,28 @@ +LIBRARY() + +PEERDIR( + library/cpp/reverse_geocoder/library + library/cpp/reverse_geocoder/proto + library/cpp/digest/crc32c +) + +SRCS( + area_box.cpp + bbox.cpp + common.cpp + edge.cpp + reverse_geocoder.cpp + kv.cpp + location.cpp + part.cpp + point.cpp + polygon.cpp + region.cpp + geo_data/debug.cpp + geo_data/def.cpp + geo_data/geo_data.cpp + geo_data/map.cpp + geo_data/proxy.cpp +) + +END() diff --git a/library/cpp/reverse_geocoder/library/CMakeLists.darwin-x86_64.txt b/library/cpp/reverse_geocoder/library/CMakeLists.darwin-x86_64.txt new file mode 100644 index 0000000000..f82b4b8cd1 --- /dev/null +++ b/library/cpp/reverse_geocoder/library/CMakeLists.darwin-x86_64.txt @@ -0,0 +1,21 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. 
+ + + +add_library(cpp-reverse_geocoder-library) +target_link_libraries(cpp-reverse_geocoder-library PUBLIC + contrib-libs-cxxsupp + yutil +) +target_sources(cpp-reverse_geocoder-library PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/library/block_allocator.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/library/fs.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/library/log.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/library/pool_allocator.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/library/unaligned_iter.cpp +) diff --git a/library/cpp/reverse_geocoder/library/CMakeLists.linux-aarch64.txt b/library/cpp/reverse_geocoder/library/CMakeLists.linux-aarch64.txt new file mode 100644 index 0000000000..4b45fce452 --- /dev/null +++ b/library/cpp/reverse_geocoder/library/CMakeLists.linux-aarch64.txt @@ -0,0 +1,22 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-reverse_geocoder-library) +target_link_libraries(cpp-reverse_geocoder-library PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil +) +target_sources(cpp-reverse_geocoder-library PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/library/block_allocator.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/library/fs.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/library/log.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/library/pool_allocator.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/library/unaligned_iter.cpp +) diff --git a/library/cpp/reverse_geocoder/library/CMakeLists.linux-x86_64.txt b/library/cpp/reverse_geocoder/library/CMakeLists.linux-x86_64.txt new file mode 100644 index 0000000000..4b45fce452 --- /dev/null +++ b/library/cpp/reverse_geocoder/library/CMakeLists.linux-x86_64.txt @@ -0,0 +1,22 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-reverse_geocoder-library) +target_link_libraries(cpp-reverse_geocoder-library PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil +) +target_sources(cpp-reverse_geocoder-library PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/library/block_allocator.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/library/fs.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/library/log.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/library/pool_allocator.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/library/unaligned_iter.cpp +) diff --git a/library/cpp/reverse_geocoder/library/CMakeLists.txt b/library/cpp/reverse_geocoder/library/CMakeLists.txt new file mode 100644 index 0000000000..f8b31df0c1 --- /dev/null +++ b/library/cpp/reverse_geocoder/library/CMakeLists.txt @@ -0,0 +1,17 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. 
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA)
+  include(CMakeLists.linux-aarch64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+  include(CMakeLists.darwin-x86_64.txt)
+elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA)
+  include(CMakeLists.windows-x86_64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA)
+  include(CMakeLists.linux-x86_64.txt)
+endif()
diff --git a/library/cpp/reverse_geocoder/library/CMakeLists.windows-x86_64.txt b/library/cpp/reverse_geocoder/library/CMakeLists.windows-x86_64.txt
new file mode 100644
index 0000000000..f82b4b8cd1
--- /dev/null
+++ b/library/cpp/reverse_geocoder/library/CMakeLists.windows-x86_64.txt
@@ -0,0 +1,21 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-reverse_geocoder-library)
+target_link_libraries(cpp-reverse_geocoder-library PUBLIC
+  contrib-libs-cxxsupp
+  yutil
+)
+target_sources(cpp-reverse_geocoder-library PRIVATE
+  ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/library/block_allocator.cpp
+  ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/library/fs.cpp
+  ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/library/log.cpp
+  ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/library/pool_allocator.cpp
+  ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/library/unaligned_iter.cpp
+)
diff --git a/library/cpp/reverse_geocoder/library/block_allocator.cpp b/library/cpp/reverse_geocoder/library/block_allocator.cpp
new file mode 100644
index 0000000000..56f61dc566
--- /dev/null
+++ b/library/cpp/reverse_geocoder/library/block_allocator.cpp
@@ -0,0 +1,40 @@
+#include "block_allocator.h"
+
+using namespace NReverseGeocoder;
+
+static size_t const MEMORY_IS_USED_FLAG = ~0ull;
+static size_t const SIZEOF_SIZE = AlignMemory(sizeof(size_t));
+
+void* NReverseGeocoder::TBlockAllocator::Allocate(size_t number) {
+    number = AlignMemory(number);
+    if (BytesAllocated_ + number + SIZEOF_SIZE > BytesLimit_)
+        ythrow yexception() << "Unable to allocate memory";
+    char* begin = ((char*)Data_) + BytesAllocated_;
+    char* end = begin + number;
+    *((size_t*)end) = MEMORY_IS_USED_FLAG;
+    BytesAllocated_ += number + SIZEOF_SIZE;
+    return begin;
+}
+
+size_t NReverseGeocoder::TBlockAllocator::AllocateSize(size_t number) {
+    return AlignMemory(number) + SIZEOF_SIZE;
+}
+
+static void RelaxBlock(char* begin, size_t* number) {
+    while (*number > 0) {
+        char* ptr = begin + *number - SIZEOF_SIZE;
+        if (*((size_t*)ptr) == MEMORY_IS_USED_FLAG)
+            return;
+        *number -= *((size_t*)ptr) + SIZEOF_SIZE;
+    }
+}
+
+void NReverseGeocoder::TBlockAllocator::Deallocate(void* ptr, size_t number) {
+    number = AlignMemory(number);
+    char* begin = (char*)ptr;
+    char* end = begin + number;
+    if
(*((size_t*)end) != MEMORY_IS_USED_FLAG) + ythrow yexception() << "Trying to deallocate not allocated pointer " << ptr; + *((size_t*)end) = number; + RelaxBlock((char*)Data_, &BytesAllocated_); +} diff --git a/library/cpp/reverse_geocoder/library/block_allocator.h b/library/cpp/reverse_geocoder/library/block_allocator.h new file mode 100644 index 0000000000..1189d6b25c --- /dev/null +++ b/library/cpp/reverse_geocoder/library/block_allocator.h @@ -0,0 +1,64 @@ +#pragma once + +#include "memory.h" + +#include <util/generic/yexception.h> + +namespace NReverseGeocoder { + class TBlockAllocator: public TNonCopyable { + public: + TBlockAllocator() + : Data_(nullptr) + , BytesAllocated_(0) + , BytesLimit_(0) + { + } + + TBlockAllocator(void* data, size_t bytesLimit) + : Data_(data) + , BytesAllocated_(0) + , BytesLimit_(bytesLimit) + { + } + + TBlockAllocator(TBlockAllocator&& a) + : TBlockAllocator() + { + DoSwap(Data_, a.Data_); + DoSwap(BytesAllocated_, a.BytesAllocated_); + DoSwap(BytesLimit_, a.BytesLimit_); + } + + TBlockAllocator& operator=(TBlockAllocator&& a) { + DoSwap(Data_, a.Data_); + DoSwap(BytesAllocated_, a.BytesAllocated_); + DoSwap(BytesLimit_, a.BytesLimit_); + return *this; + } + + virtual ~TBlockAllocator() { + } + + virtual void* Allocate(size_t number); + + static size_t AllocateSize(size_t number); + + virtual void Deallocate(void* ptr, size_t number); + + size_t TotalAllocatedSize() const { + return BytesAllocated_; + } + + void Setup(void* data, size_t bytesLimit) { + Data_ = data; + BytesLimit_ = bytesLimit; + BytesAllocated_ = 0; + } + + private: + void* Data_; + size_t BytesAllocated_; + size_t BytesLimit_; + }; + +} diff --git a/library/cpp/reverse_geocoder/library/fs.cpp b/library/cpp/reverse_geocoder/library/fs.cpp new file mode 100644 index 0000000000..98c3b9ef81 --- /dev/null +++ b/library/cpp/reverse_geocoder/library/fs.cpp @@ -0,0 +1,18 @@ +#include "fs.h" + +#include <util/folder/dirut.h> +#include <util/string/split.h> + +namespace NReverseGeocoder { + TVector<TString> GetDataFilesList(const char* input) { + if (IsDir(input)) { + return GetFileListInDirectory<TVector<TString>>(input); + } + + TVector<TString> result; + for (const auto& partIt : StringSplitter(input).Split(',')) { + result.push_back(TString(partIt.Token())); + } + return result; + } +} diff --git a/library/cpp/reverse_geocoder/library/fs.h b/library/cpp/reverse_geocoder/library/fs.h new file mode 100644 index 0000000000..4435f960c8 --- /dev/null +++ b/library/cpp/reverse_geocoder/library/fs.h @@ -0,0 +1,19 @@ +#pragma once + +#include <util/folder/iterator.h> +#include <util/string/vector.h> + +namespace NReverseGeocoder { + template <typename Cont> + Cont GetFileListInDirectory(const char* dirName) { + TDirIterator dirIt(dirName, TDirIterator::TOptions(FTS_LOGICAL)); + Cont dirContent; + for (auto file = dirIt.begin(); file != dirIt.end(); ++file) { + if (strcmp(file->fts_path, dirName)) + dirContent.push_back(file->fts_path); + } + return dirContent; + } + + TVector<TString> GetDataFilesList(const char* input); +} diff --git a/library/cpp/reverse_geocoder/library/log.cpp b/library/cpp/reverse_geocoder/library/log.cpp new file mode 100644 index 0000000000..44e6ddf287 --- /dev/null +++ b/library/cpp/reverse_geocoder/library/log.cpp @@ -0,0 +1,111 @@ +#include "log.h" + +#include <util/datetime/systime.h> +#include <util/generic/yexception.h> +#include <util/system/guard.h> +#include <util/system/mutex.h> + +using namespace NReverseGeocoder; + +static size_t const TIMESTAMP_LIMIT = 32; + +class 
TLogger { +public: + static TLogger& Inst() { + static TLogger logger; + return logger; + } + + void Setup(IOutputStream& out, ELogLevel level) { + Out_ = &out; + Level_ = level; + } + + void Write(ELogLevel level, const char* message) { + if (level <= Level_) { + TGuard<TMutex> Lock(Lock_); + Out_->Write(message, strlen(message)); + } + } + + IOutputStream& OutputStream() const { + return *Out_; + } + + ELogLevel Level() const { + return Level_; + } + +private: + TLogger() + : Out_() + , Level_(LOG_LEVEL_DISABLE) + { + } + + IOutputStream* Out_; + ELogLevel Level_; + TMutex Lock_; +}; + +ELogLevel NReverseGeocoder::LogLevel() { + return TLogger::Inst().Level(); +} + +void NReverseGeocoder::LogSetup(IOutputStream& out, ELogLevel level) { + TLogger::Inst().Setup(out, level); +} + +IOutputStream& NReverseGeocoder::LogOutputStream() { + return TLogger::Inst().OutputStream(); +} + +static const char* T(char* buffer) { + struct timeval timeVal; + gettimeofday(&timeVal, nullptr); + + struct tm timeInfo; + const time_t sec = timeVal.tv_sec; + localtime_r(&sec, &timeInfo); + + snprintf(buffer, TIMESTAMP_LIMIT, "%02d:%02d:%02d.%06d", + timeInfo.tm_hour, timeInfo.tm_min, timeInfo.tm_sec, (int)timeVal.tv_usec); + + return buffer; +} + +void NReverseGeocoder::LogWrite(ELogLevel level, const char* message) { + if (level > LogLevel()) + return; + + static const char* A[LOG_LEVEL_COUNT] = { + "", // LOG_LEVEL_DISABLE + "\033[90m", // LOG_LEVEL_ERROR + "\033[90m", // LOG_LEVEL_WARNING + "\033[90m", // LOG_LEVEL_INFO + "\033[90m", // LOG_LEVEL_DEBUG + }; + + static const char* B[LOG_LEVEL_COUNT] = { + "", // LOG_LEVEL_DISABLE + "\033[31;1mError\033[0m", // LOG_LEVEL_ERROR + "\033[33;1mWarn\033[0m", // LOG_LEVEL_WARNING + "\033[32;1mInfo\033[0m", // LOG_LEVEL_INFO + "Debug", // LOG_LEVEL_DEBUG + }; + + static const char* C[LOG_LEVEL_COUNT] = { + "", // LOG_LEVEL_DISABLE + "\n", // LOG_LEVEL_ERROR + "\n", // LOG_LEVEL_WARNING + "\n", // LOG_LEVEL_INFO + "\033[0m\n", // LOG_LEVEL_DEBUG + }; + + char buffer[LOG_MESSAGE_LIMIT], tbuffer[TIMESTAMP_LIMIT]; + // Ignore logger snprintf errors. + snprintf(buffer, LOG_MESSAGE_LIMIT, "%s(%s) %s: %s%s", + A[level], T(tbuffer), B[level], message, C[level]); + + TLogger::Inst().Write(level, buffer); +} diff --git a/library/cpp/reverse_geocoder/library/log.h b/library/cpp/reverse_geocoder/library/log.h new file mode 100644 index 0000000000..44cb0cefcf --- /dev/null +++ b/library/cpp/reverse_geocoder/library/log.h @@ -0,0 +1,65 @@ +#pragma once + +#include <util/generic/yexception.h> +#include <util/stream/output.h> + +#include <cstdio> + +namespace NReverseGeocoder { + size_t const LOG_MESSAGE_LIMIT = 1024; + + enum ELogLevel { + LOG_LEVEL_DISABLE = 0, + LOG_LEVEL_ERROR, + LOG_LEVEL_WARNING, + LOG_LEVEL_INFO, + LOG_LEVEL_DEBUG, + LOG_LEVEL_COUNT + }; + + // Init logger. Setup OutputStream and logger level. + void LogSetup(IOutputStream& out, ELogLevel level); + + // Write log message with colors, level and current time. + // Example: + // (13:24:11.123456) Info: Good job! + // (13:24:11.323456) Warn: Ooops :( + // (13:24:22.456789) Error: Hello, world! + void LogWrite(ELogLevel level, const char* message); + + // Log output file descriptor. + IOutputStream& LogOutputStream(); + + // Current log level. + ELogLevel LogLevel(); + + template <typename... TArgs> + void LogWrite(ELogLevel level, const char* fmt, TArgs... args) { + if (level <= LogLevel()) { + char buffer[LOG_MESSAGE_LIMIT]; + // Ignore logger snprintf errors. 
+ snprintf(buffer, LOG_MESSAGE_LIMIT, fmt, std::forward<TArgs>(args)...); + LogWrite(level, buffer); + } + } + + template <typename... TArgs> + void LogError(TArgs... args) { + LogWrite(LOG_LEVEL_ERROR, std::forward<TArgs>(args)...); + } + + template <typename... TArgs> + void LogWarning(TArgs... args) { + LogWrite(LOG_LEVEL_WARNING, std::forward<TArgs>(args)...); + } + + template <typename... TArgs> + void LogInfo(TArgs... args) { + LogWrite(LOG_LEVEL_INFO, std::forward<TArgs>(args)...); + } + + template <typename... TArgs> + void LogDebug(TArgs... args) { + LogWrite(LOG_LEVEL_DEBUG, std::forward<TArgs>(args)...); + } +} diff --git a/library/cpp/reverse_geocoder/library/memory.h b/library/cpp/reverse_geocoder/library/memory.h new file mode 100644 index 0000000000..ecbe8bcd66 --- /dev/null +++ b/library/cpp/reverse_geocoder/library/memory.h @@ -0,0 +1,23 @@ +#pragma once + +#include <util/system/types.h> + +namespace NReverseGeocoder { + constexpr ui64 B = 1ull; + constexpr ui64 KB = 1024 * B; + constexpr ui64 MB = 1024 * KB; + constexpr ui64 GB = 1024 * MB; + + constexpr size_t MEMORY_ALIGNMENT = 16ull; + + inline unsigned long long AlignMemory(unsigned long long x) { + if (x % MEMORY_ALIGNMENT == 0) + return x; + return x + MEMORY_ALIGNMENT - x % MEMORY_ALIGNMENT; + } + + inline bool IsAlignedMemory(void* ptr) { + return ((uintptr_t)ptr) % MEMORY_ALIGNMENT == 0; + } + +} diff --git a/library/cpp/reverse_geocoder/library/pool_allocator.cpp b/library/cpp/reverse_geocoder/library/pool_allocator.cpp new file mode 100644 index 0000000000..0d841f7db0 --- /dev/null +++ b/library/cpp/reverse_geocoder/library/pool_allocator.cpp @@ -0,0 +1,17 @@ +#include "memory.h" +#include "pool_allocator.h" + +#include <util/generic/yexception.h> + +using namespace NReverseGeocoder; + +NReverseGeocoder::TPoolAllocator::TPoolAllocator(size_t poolSize) { + Ptr_ = new char[poolSize]; + Size_ = poolSize; + Setup(Ptr_, Size_); +} + +NReverseGeocoder::TPoolAllocator::~TPoolAllocator() { + if (Ptr_) + delete[] Ptr_; +} diff --git a/library/cpp/reverse_geocoder/library/pool_allocator.h b/library/cpp/reverse_geocoder/library/pool_allocator.h new file mode 100644 index 0000000000..f98bbcd3c1 --- /dev/null +++ b/library/cpp/reverse_geocoder/library/pool_allocator.h @@ -0,0 +1,42 @@ +#pragma once + +#include "block_allocator.h" + +#include <util/generic/utility.h> +#include <util/generic/noncopyable.h> + +namespace NReverseGeocoder { + class TPoolAllocator: public TBlockAllocator { + public: + TPoolAllocator() + : Ptr_(nullptr) + , Size_(0) + { + } + + TPoolAllocator(TPoolAllocator&& a) + : TBlockAllocator(std::forward<TBlockAllocator>(a)) + , Ptr_(nullptr) + , Size_(0) + { + DoSwap(Ptr_, a.Ptr_); + DoSwap(Size_, a.Size_); + } + + TPoolAllocator& operator=(TPoolAllocator&& a) { + TBlockAllocator::operator=(std::forward<TBlockAllocator>(a)); + DoSwap(Ptr_, a.Ptr_); + DoSwap(Size_, a.Size_); + return *this; + } + + explicit TPoolAllocator(size_t poolSize); + + ~TPoolAllocator() override; + + private: + char* Ptr_; + size_t Size_; + }; + +} diff --git a/library/cpp/reverse_geocoder/library/system.h b/library/cpp/reverse_geocoder/library/system.h new file mode 100644 index 0000000000..499fb2bd91 --- /dev/null +++ b/library/cpp/reverse_geocoder/library/system.h @@ -0,0 +1,3 @@ +#pragma once + +#define SYSTEM_ENDIAN_FLAG (htonl(337)) diff --git a/library/cpp/reverse_geocoder/library/unaligned_iter.cpp b/library/cpp/reverse_geocoder/library/unaligned_iter.cpp new file mode 100644 index 0000000000..0322b677dc --- /dev/null +++ 
b/library/cpp/reverse_geocoder/library/unaligned_iter.cpp
@@ -0,0 +1 @@
+#include "unaligned_iter.h"
diff --git a/library/cpp/reverse_geocoder/library/unaligned_iter.h b/library/cpp/reverse_geocoder/library/unaligned_iter.h
new file mode 100644
index 0000000000..827a3e2fd2
--- /dev/null
+++ b/library/cpp/reverse_geocoder/library/unaligned_iter.h
@@ -0,0 +1,64 @@
+#pragma once
+
+#include <util/system/unaligned_mem.h>
+#include <iterator>
+
+namespace NReverseGeocoder {
+    /**
+     * Random-access iterator over a read-only memory range
+     * of trivially copyable items that may not be aligned properly.
+     *
+     * When dereferencing, a copy of the item is returned, not a reference.
+     * Be sure that sizeof(T) is small enough.
+     *
+     * The iterator is useful for LowerBound/UpperBound STL algorithms.
+     */
+    template <class T>
+    class TUnalignedIter: public std::iterator<std::random_access_iterator_tag, T> {
+    public:
+        using TSelf = TUnalignedIter<T>;
+
+        explicit TUnalignedIter(const T* ptr)
+            : Ptr(ptr)
+        {
+        }
+
+        T operator*() const {
+            return ReadUnaligned<T>(Ptr);
+        }
+
+        bool operator==(TSelf other) const {
+            return Ptr == other.Ptr;
+        }
+
+        bool operator<(TSelf other) const {
+            return Ptr < other.Ptr;
+        }
+
+        TSelf operator+(ptrdiff_t delta) const {
+            return TSelf{Ptr + delta};
+        }
+
+        ptrdiff_t operator-(TSelf other) const {
+            return Ptr - other.Ptr;
+        }
+
+        TSelf& operator+=(ptrdiff_t delta) {
+            Ptr += delta;
+            return *this;
+        }
+
+        TSelf& operator++() {
+            ++Ptr;
+            return *this;
+        }
+
+    private:
+        const T* Ptr;
+    };
+
+    template <class T>
+    TUnalignedIter<T> UnalignedIter(const T* ptr) {
+        return TUnalignedIter<T>(ptr);
+    }
+}
diff --git a/library/cpp/reverse_geocoder/library/ya.make b/library/cpp/reverse_geocoder/library/ya.make
new file mode 100644
index 0000000000..ec2eb205a8
--- /dev/null
+++ b/library/cpp/reverse_geocoder/library/ya.make
@@ -0,0 +1,11 @@
+LIBRARY()
+
+SRCS(
+    block_allocator.cpp
+    fs.cpp
+    log.cpp
+    pool_allocator.cpp
+    unaligned_iter.cpp
+)
+
+END()
diff --git a/library/cpp/reverse_geocoder/proto/CMakeLists.darwin-x86_64.txt b/library/cpp/reverse_geocoder/proto/CMakeLists.darwin-x86_64.txt
new file mode 100644
index 0000000000..8d1df0fdf8
--- /dev/null
+++ b/library/cpp/reverse_geocoder/proto/CMakeLists.darwin-x86_64.txt
@@ -0,0 +1,56 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
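Referring back to TUnalignedIter above: the iterator lets standard binary-search algorithms run directly over serialized, possibly misaligned arrays. A minimal, self-contained sketch, not part of the commit; the buffer layout is contrived for illustration:

    #include <library/cpp/reverse_geocoder/library/unaligned_iter.h>
    #include <algorithm>
    #include <cstdio>
    #include <cstring>

    int main() {
        // Sorted ui32 values copied into a byte buffer at offset 1,
        // so a plain ui32* into the buffer would be misaligned.
        const ui32 values[] = {10, 20, 30, 40};
        char buffer[1 + sizeof(values)];
        std::memcpy(buffer + 1, values, sizeof(values));

        const auto begin = NReverseGeocoder::UnalignedIter(
            reinterpret_cast<const ui32*>(buffer + 1));
        const auto end = begin + 4;

        // Dereferencing goes through ReadUnaligned, so the search is safe
        // even though the underlying pointer is not 4-byte aligned.
        const auto it = std::lower_bound(begin, end, ui32(25));
        std::printf("lower_bound(25) -> %u\n", static_cast<unsigned>(*it)); // prints 30
    }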
+ + +get_built_tool_path( + TOOL_protoc_bin + TOOL_protoc_dependency + contrib/tools/protoc/bin + protoc +) +get_built_tool_path( + TOOL_cpp_styleguide_bin + TOOL_cpp_styleguide_dependency + contrib/tools/protoc/plugins/cpp_styleguide + cpp_styleguide +) +get_built_tool_path( + TOOL_protoc_bin + TOOL_protoc_dependency + contrib/tools/protoc/bin + protoc +) +get_built_tool_path( + TOOL_cpp_styleguide_bin + TOOL_cpp_styleguide_dependency + contrib/tools/protoc/plugins/cpp_styleguide + cpp_styleguide +) + +add_library(cpp-reverse_geocoder-proto) +target_link_libraries(cpp-reverse_geocoder-proto PUBLIC + contrib-libs-cxxsupp + yutil + contrib-libs-protobuf +) +target_proto_messages(cpp-reverse_geocoder-proto PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/proto/geo_data.proto + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/proto/region.proto +) +target_proto_addincls(cpp-reverse_geocoder-proto + ./ + ${CMAKE_SOURCE_DIR}/ + ${CMAKE_BINARY_DIR} + ${CMAKE_SOURCE_DIR} + ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src + ${CMAKE_BINARY_DIR} + ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src +) +target_proto_outs(cpp-reverse_geocoder-proto + --cpp_out=${CMAKE_BINARY_DIR}/ + --cpp_styleguide_out=${CMAKE_BINARY_DIR}/ +) diff --git a/library/cpp/reverse_geocoder/proto/CMakeLists.linux-aarch64.txt b/library/cpp/reverse_geocoder/proto/CMakeLists.linux-aarch64.txt new file mode 100644 index 0000000000..b53c1692ee --- /dev/null +++ b/library/cpp/reverse_geocoder/proto/CMakeLists.linux-aarch64.txt @@ -0,0 +1,57 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. 
+ + +get_built_tool_path( + TOOL_protoc_bin + TOOL_protoc_dependency + contrib/tools/protoc/bin + protoc +) +get_built_tool_path( + TOOL_cpp_styleguide_bin + TOOL_cpp_styleguide_dependency + contrib/tools/protoc/plugins/cpp_styleguide + cpp_styleguide +) +get_built_tool_path( + TOOL_protoc_bin + TOOL_protoc_dependency + contrib/tools/protoc/bin + protoc +) +get_built_tool_path( + TOOL_cpp_styleguide_bin + TOOL_cpp_styleguide_dependency + contrib/tools/protoc/plugins/cpp_styleguide + cpp_styleguide +) + +add_library(cpp-reverse_geocoder-proto) +target_link_libraries(cpp-reverse_geocoder-proto PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil + contrib-libs-protobuf +) +target_proto_messages(cpp-reverse_geocoder-proto PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/proto/geo_data.proto + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/proto/region.proto +) +target_proto_addincls(cpp-reverse_geocoder-proto + ./ + ${CMAKE_SOURCE_DIR}/ + ${CMAKE_BINARY_DIR} + ${CMAKE_SOURCE_DIR} + ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src + ${CMAKE_BINARY_DIR} + ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src +) +target_proto_outs(cpp-reverse_geocoder-proto + --cpp_out=${CMAKE_BINARY_DIR}/ + --cpp_styleguide_out=${CMAKE_BINARY_DIR}/ +) diff --git a/library/cpp/reverse_geocoder/proto/CMakeLists.linux-x86_64.txt b/library/cpp/reverse_geocoder/proto/CMakeLists.linux-x86_64.txt new file mode 100644 index 0000000000..b53c1692ee --- /dev/null +++ b/library/cpp/reverse_geocoder/proto/CMakeLists.linux-x86_64.txt @@ -0,0 +1,57 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. 
+ + +get_built_tool_path( + TOOL_protoc_bin + TOOL_protoc_dependency + contrib/tools/protoc/bin + protoc +) +get_built_tool_path( + TOOL_cpp_styleguide_bin + TOOL_cpp_styleguide_dependency + contrib/tools/protoc/plugins/cpp_styleguide + cpp_styleguide +) +get_built_tool_path( + TOOL_protoc_bin + TOOL_protoc_dependency + contrib/tools/protoc/bin + protoc +) +get_built_tool_path( + TOOL_cpp_styleguide_bin + TOOL_cpp_styleguide_dependency + contrib/tools/protoc/plugins/cpp_styleguide + cpp_styleguide +) + +add_library(cpp-reverse_geocoder-proto) +target_link_libraries(cpp-reverse_geocoder-proto PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil + contrib-libs-protobuf +) +target_proto_messages(cpp-reverse_geocoder-proto PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/proto/geo_data.proto + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/proto/region.proto +) +target_proto_addincls(cpp-reverse_geocoder-proto + ./ + ${CMAKE_SOURCE_DIR}/ + ${CMAKE_BINARY_DIR} + ${CMAKE_SOURCE_DIR} + ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src + ${CMAKE_BINARY_DIR} + ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src +) +target_proto_outs(cpp-reverse_geocoder-proto + --cpp_out=${CMAKE_BINARY_DIR}/ + --cpp_styleguide_out=${CMAKE_BINARY_DIR}/ +) diff --git a/library/cpp/reverse_geocoder/proto/CMakeLists.txt b/library/cpp/reverse_geocoder/proto/CMakeLists.txt new file mode 100644 index 0000000000..f8b31df0c1 --- /dev/null +++ b/library/cpp/reverse_geocoder/proto/CMakeLists.txt @@ -0,0 +1,17 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-aarch64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + include(CMakeLists.darwin-x86_64.txt) +elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA) + include(CMakeLists.windows-x86_64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-x86_64.txt) +endif() diff --git a/library/cpp/reverse_geocoder/proto/CMakeLists.windows-x86_64.txt b/library/cpp/reverse_geocoder/proto/CMakeLists.windows-x86_64.txt new file mode 100644 index 0000000000..8d1df0fdf8 --- /dev/null +++ b/library/cpp/reverse_geocoder/proto/CMakeLists.windows-x86_64.txt @@ -0,0 +1,56 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. 
+ + +get_built_tool_path( + TOOL_protoc_bin + TOOL_protoc_dependency + contrib/tools/protoc/bin + protoc +) +get_built_tool_path( + TOOL_cpp_styleguide_bin + TOOL_cpp_styleguide_dependency + contrib/tools/protoc/plugins/cpp_styleguide + cpp_styleguide +) +get_built_tool_path( + TOOL_protoc_bin + TOOL_protoc_dependency + contrib/tools/protoc/bin + protoc +) +get_built_tool_path( + TOOL_cpp_styleguide_bin + TOOL_cpp_styleguide_dependency + contrib/tools/protoc/plugins/cpp_styleguide + cpp_styleguide +) + +add_library(cpp-reverse_geocoder-proto) +target_link_libraries(cpp-reverse_geocoder-proto PUBLIC + contrib-libs-cxxsupp + yutil + contrib-libs-protobuf +) +target_proto_messages(cpp-reverse_geocoder-proto PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/proto/geo_data.proto + ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/proto/region.proto +) +target_proto_addincls(cpp-reverse_geocoder-proto + ./ + ${CMAKE_SOURCE_DIR}/ + ${CMAKE_BINARY_DIR} + ${CMAKE_SOURCE_DIR} + ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src + ${CMAKE_BINARY_DIR} + ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src +) +target_proto_outs(cpp-reverse_geocoder-proto + --cpp_out=${CMAKE_BINARY_DIR}/ + --cpp_styleguide_out=${CMAKE_BINARY_DIR}/ +) diff --git a/library/cpp/reverse_geocoder/proto/geo_data.proto b/library/cpp/reverse_geocoder/proto/geo_data.proto new file mode 100644 index 0000000000..00ecb48bec --- /dev/null +++ b/library/cpp/reverse_geocoder/proto/geo_data.proto @@ -0,0 +1,42 @@ +package NReverseGeocoder.NProto; + +message TGeoData { + required uint64 Magic = 1; + required uint64 Version = 2; + optional uint64 Points = 3; + optional uint64 PointsNumber = 4; + optional uint64 PointsCrc32 = 5; + optional uint64 Edges = 6; + optional uint64 EdgesNumber = 7; + optional uint64 EdgesCrc32 = 8; + optional uint64 EdgeRefs = 9; + optional uint64 EdgeRefsNumber = 10; + optional uint64 EdgeRefsCrc32 = 11; + optional uint64 Parts = 12; + optional uint64 PartsNumber = 13; + optional uint64 PartsCrc32 = 14; + optional uint64 Polygons = 15; + optional uint64 PolygonsNumber = 16; + optional uint64 PolygonsCrc32 = 17; + optional uint64 PolygonRefs = 18; + optional uint64 PolygonRefsNumber = 19; + optional uint64 PolygonRefsCrc32 = 20; + optional uint64 Boxes = 21; + optional uint64 BoxesNumber = 22; + optional uint64 BoxesCrc32 = 23; + optional uint64 Blobs = 24; + optional uint64 BlobsNumber = 25; + optional uint64 BlobsCrc32 = 26; + optional uint64 Kvs = 27; + optional uint64 KvsNumber = 28; + optional uint64 KvsCrc32 = 29; + optional uint64 Regions = 30; + optional uint64 RegionsNumber = 31; + optional uint64 RegionsCrc32 = 32; + optional uint64 RawPolygons = 33; + optional uint64 RawPolygonsNumber = 34; + optional uint64 RawPolygonsCrc32 = 35; + optional uint64 RawEdgeRefs = 36; + optional uint64 RawEdgeRefsNumber = 37; + optional uint64 RawEdgeRefsCrc32 = 38; +}; diff --git a/library/cpp/reverse_geocoder/proto/region.proto b/library/cpp/reverse_geocoder/proto/region.proto new file mode 100644 index 0000000000..b782331628 --- /dev/null +++ b/library/cpp/reverse_geocoder/proto/region.proto @@ -0,0 +1,32 @@ +package NReverseGeocoder.NProto; + +message TLocation { + required double Lat = 1; + required double Lon = 2; +} + +message TPolygon { + required uint64 PolygonId = 1; + repeated TLocation Locations = 2; + + enum EType { + TYPE_UNKNOWN = 0; + TYPE_INNER = 1; + TYPE_OUTER = 2; + } + + required EType Type = 3; +} + +message TKv { + required string K = 1; + required string V = 2; +} + +message TRegion { + required 
uint64 RegionId = 1; + optional uint64 ParentId = 2; + repeated TPolygon Polygons = 3; + repeated TKv Kvs = 4; + repeated string Blobs = 5; +} diff --git a/library/cpp/reverse_geocoder/proto/ya.make b/library/cpp/reverse_geocoder/proto/ya.make new file mode 100644 index 0000000000..b6f7156210 --- /dev/null +++ b/library/cpp/reverse_geocoder/proto/ya.make @@ -0,0 +1,10 @@ +PROTO_LIBRARY() + +SRCS( + geo_data.proto + region.proto +) + +EXCLUDE_TAGS(GO_PROTO) + +END() diff --git a/library/cpp/robots_txt/CMakeLists.darwin-x86_64.txt b/library/cpp/robots_txt/CMakeLists.darwin-x86_64.txt new file mode 100644 index 0000000000..408bf12f04 --- /dev/null +++ b/library/cpp/robots_txt/CMakeLists.darwin-x86_64.txt @@ -0,0 +1,26 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +add_subdirectory(robotstxtcfg) + +add_library(library-cpp-robots_txt) +target_link_libraries(library-cpp-robots_txt PUBLIC + contrib-libs-cxxsupp + yutil + cpp-robots_txt-robotstxtcfg + library-cpp-case_insensitive_string + library-cpp-charset + cpp-string_utils-url + library-cpp-uri +) +target_sources(library-cpp-robots_txt PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/prefix_tree.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/prefix_tree_rules_handler.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/robots_txt_parser.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/rules_handler.cpp +) diff --git a/library/cpp/robots_txt/CMakeLists.linux-aarch64.txt b/library/cpp/robots_txt/CMakeLists.linux-aarch64.txt new file mode 100644 index 0000000000..73a209cbbe --- /dev/null +++ b/library/cpp/robots_txt/CMakeLists.linux-aarch64.txt @@ -0,0 +1,27 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +add_subdirectory(robotstxtcfg) + +add_library(library-cpp-robots_txt) +target_link_libraries(library-cpp-robots_txt PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil + cpp-robots_txt-robotstxtcfg + library-cpp-case_insensitive_string + library-cpp-charset + cpp-string_utils-url + library-cpp-uri +) +target_sources(library-cpp-robots_txt PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/prefix_tree.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/prefix_tree_rules_handler.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/robots_txt_parser.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/rules_handler.cpp +) diff --git a/library/cpp/robots_txt/CMakeLists.linux-x86_64.txt b/library/cpp/robots_txt/CMakeLists.linux-x86_64.txt new file mode 100644 index 0000000000..73a209cbbe --- /dev/null +++ b/library/cpp/robots_txt/CMakeLists.linux-x86_64.txt @@ -0,0 +1,27 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). 
These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +add_subdirectory(robotstxtcfg) + +add_library(library-cpp-robots_txt) +target_link_libraries(library-cpp-robots_txt PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil + cpp-robots_txt-robotstxtcfg + library-cpp-case_insensitive_string + library-cpp-charset + cpp-string_utils-url + library-cpp-uri +) +target_sources(library-cpp-robots_txt PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/prefix_tree.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/prefix_tree_rules_handler.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/robots_txt_parser.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/rules_handler.cpp +) diff --git a/library/cpp/robots_txt/CMakeLists.txt b/library/cpp/robots_txt/CMakeLists.txt new file mode 100644 index 0000000000..f8b31df0c1 --- /dev/null +++ b/library/cpp/robots_txt/CMakeLists.txt @@ -0,0 +1,17 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-aarch64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + include(CMakeLists.darwin-x86_64.txt) +elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA) + include(CMakeLists.windows-x86_64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-x86_64.txt) +endif() diff --git a/library/cpp/robots_txt/CMakeLists.windows-x86_64.txt b/library/cpp/robots_txt/CMakeLists.windows-x86_64.txt new file mode 100644 index 0000000000..408bf12f04 --- /dev/null +++ b/library/cpp/robots_txt/CMakeLists.windows-x86_64.txt @@ -0,0 +1,26 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. 
+ + +add_subdirectory(robotstxtcfg) + +add_library(library-cpp-robots_txt) +target_link_libraries(library-cpp-robots_txt PUBLIC + contrib-libs-cxxsupp + yutil + cpp-robots_txt-robotstxtcfg + library-cpp-case_insensitive_string + library-cpp-charset + cpp-string_utils-url + library-cpp-uri +) +target_sources(library-cpp-robots_txt PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/prefix_tree.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/prefix_tree_rules_handler.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/robots_txt_parser.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/rules_handler.cpp +) diff --git a/library/cpp/robots_txt/constants.h b/library/cpp/robots_txt/constants.h new file mode 100644 index 0000000000..e5e2a57e18 --- /dev/null +++ b/library/cpp/robots_txt/constants.h @@ -0,0 +1,9 @@ +#pragma once + +#include <util/generic/size_literals.h> +#include <util/system/defaults.h> + + +constexpr auto robots_max = 500_KB; +constexpr auto max_rules_count = 10'000; +constexpr auto max_rule_length = 10_KB; diff --git a/library/cpp/robots_txt/prefix_tree.cpp b/library/cpp/robots_txt/prefix_tree.cpp new file mode 100644 index 0000000000..f7b1848a43 --- /dev/null +++ b/library/cpp/robots_txt/prefix_tree.cpp @@ -0,0 +1,172 @@ +#include <cstring> +#include <algorithm> + +#include "prefix_tree.h" + +TPrefixTreeNodeElement::TPrefixTreeNodeElement() + : Key(nullptr) + , KeyLen(0) + , Val(-1) + , Index(-1) +{ +} + +TPrefixTreeNodeElement::TPrefixTreeNodeElement(const char* key, i32 keyLen = 0, i32 val = -1, i32 index = -1) + : Key(key) + , KeyLen(keyLen) + , Val(val) + , Index(index) +{ +} + +TPrefixTreeNode::TPrefixTreeNode() + : Elements() +{ +} + +int TPrefixTreeNode::Find(char ch) const { + for (size_t i = 0; i < Elements.size(); ++i) + if (ch == *(Elements[i].Key)) + return i; + return -1; +} + +void TPrefixTreeNode::Set(const char* key, i32 keyLen, i32 val, i32 index) { + TPrefixTreeNodeElement element(key, keyLen, val, index); + int i = Find(*key); + if (i < 0) + Elements.push_back(element); + else + Elements[i] = element; +} + +void TPrefixTreeNode::Dump(FILE* logFile) const { + if (!logFile) + logFile = stderr; + fprintf(logFile, "size=%" PRISZT "\n", Elements.size()); + static char b[1234]; + for (size_t i = 0; i < Elements.size(); ++i) { + strncpy(b, Elements[i].Key, Elements[i].KeyLen); + b[Elements[i].KeyLen] = 0; + fprintf(logFile, "{key=[%s]:%d, val=%d, index=%d}\n", b, Elements[i].KeyLen, Elements[i].Val, Elements[i].Index); + } +} + +void TPrefixTree::Dump(FILE* logFile) const { + if (!logFile) + logFile = stderr; + fprintf(logFile, "%" PRISZT " nodes\n", Nodes.size()); + for (size_t i = 0; i < Nodes.size(); ++i) { + fprintf(logFile, "%" PRISZT ": ", i); + Nodes[i].Dump(logFile); + fprintf(logFile, "\n"); + } +} + +TPrefixTree::TPrefixTree(int maxSize) { + Init(maxSize); +} + +void TPrefixTree::Init(int maxSize) { + Nodes.clear(); + Nodes.reserve(std::max(maxSize + 1, 1)); + Nodes.push_back(TPrefixTreeNode()); +} + +void TPrefixTree::Clear() { + Nodes.clear(); + Init(0); +} + +void TPrefixTree::Add(const char* s, i32 index) { + AddInternal(s, Nodes[0], index); +} + +void TPrefixTree::AddInternal(const char* s, TPrefixTreeNode& node, i32 index) { + if (!s || !*s) + return; + + int i = node.Find(*s); + if (i >= 0) { + TPrefixTreeNodeElement& d = node.Elements[i]; + const char* p = d.Key; + while (*s && (p - d.Key) < d.KeyLen && *s == *p) + ++s, ++p; + + if (*s) { + if ((p - d.Key) < d.KeyLen) { + Nodes.push_back(TPrefixTreeNode()); + Nodes.back().Set(p, d.KeyLen - (p - 
d.Key), d.Val, d.Index); + Nodes.back().Set(s, strlen(s), -1, index); + + d.Val = Nodes.size() - 1; + d.KeyLen = p - d.Key; + d.Index = INDEX_BOUND; + } else { + if (d.Val != -1 && index < d.Index) + AddInternal(s, Nodes[d.Val], index); + } + } else { + if ((p - d.Key) < d.KeyLen) { + Nodes.push_back(TPrefixTreeNode()); + Nodes.back().Set(p, d.KeyLen - (p - d.Key), d.Val, d.Index); + d.Val = Nodes.size() - 1; + d.KeyLen = p - d.Key; + d.Index = index; + } else { + d.Index = std::min(d.Index, index); + } + } + } else { + node.Set(s, strlen(s), -1, index); + } +} + +int TPrefixTree::GetMemorySize() const { + int res = Nodes.capacity() * sizeof(TPrefixTreeNode); + for (size_t i = 0; i < Nodes.size(); ++i) + res += Nodes[i].Elements.capacity() * sizeof(TPrefixTreeNodeElement); + return res; +} + +void TPrefixTree::Compress() { + Nodes.shrink_to_fit(); + for (size_t i = 0; i < Nodes.size(); ++i) + Nodes[i].Elements.shrink_to_fit(); +} + +i32 TPrefixTree::MinPrefixIndex(const char* s) const { + if (!*s) + return -1; + int i = Nodes[0].Find(*s); + if (i < 0) + return -1; + const TPrefixTreeNodeElement* d = &Nodes[0].Elements[i]; + + const char* p = d->Key; + if (!p || !*p) + return -1; + + i32 result = INDEX_BOUND; + i32 nodeIndex = 0; + while (*s == *p) { + if (++p - d->Key >= d->KeyLen) + result = std::min(result, d->Index); + if (!*++s) + break; + + if (p - d->Key >= d->KeyLen) { + nodeIndex = d->Val; + if (nodeIndex == -1) + break; + i = Nodes[nodeIndex].Find(*s); + if (i < 0) + break; + d = &Nodes[nodeIndex].Elements[i]; + p = d->Key; + if (!p || !*p) + break; + } + } + return result < INDEX_BOUND ? result : -1; +} diff --git a/library/cpp/robots_txt/prefix_tree.h b/library/cpp/robots_txt/prefix_tree.h new file mode 100644 index 0000000000..5feafcb74d --- /dev/null +++ b/library/cpp/robots_txt/prefix_tree.h @@ -0,0 +1,47 @@ +#pragma once + +#include <util/generic/ptr.h> +#include <util/generic/vector.h> +#include <cstdio> +#include <util/generic/noncopyable.h> + +struct TPrefixTreeNodeElement { + const char* Key; + i32 KeyLen; + i32 Val; + i32 Index; + + TPrefixTreeNodeElement(); + TPrefixTreeNodeElement(const char*, i32, i32, i32); +}; + +class TPrefixTreeNode { +public: + TVector<TPrefixTreeNodeElement> Elements; + TPrefixTreeNode(); + + int Find(char) const; + void Set(const char*, i32, i32, i32); + void Dump(FILE*) const; +}; + +class TPrefixTree : TNonCopyable { +private: + static const i32 INDEX_BOUND = 1 << 30; + + TVector<TPrefixTreeNode> Nodes; + +public: + void Init(int); + TPrefixTree(int); + + void Add(const char*, i32); + i32 MinPrefixIndex(const char*) const; + void Clear(); + void Dump(FILE*) const; + int GetMemorySize() const; + void Compress(); + +private: + void AddInternal(const char*, TPrefixTreeNode&, i32); +}; diff --git a/library/cpp/robots_txt/prefix_tree_rules_handler.cpp b/library/cpp/robots_txt/prefix_tree_rules_handler.cpp new file mode 100644 index 0000000000..8dd579d060 --- /dev/null +++ b/library/cpp/robots_txt/prefix_tree_rules_handler.cpp @@ -0,0 +1,706 @@ +#include "robots_txt.h" + +#include <util/digest/fnv.h> +#include <util/system/tls.h> +#include <util/generic/buffer.h> +#include <util/generic/yexception.h> + +namespace { + +TString NormalizeRule(TStringBuf rule) { + TString result; + result.reserve(rule.size() + 1); + + // remove consecutive '*' + for (auto c : rule) { + if (c != '*' || !result.EndsWith('*')) { + result.append(c); + } + } + + if (rule == "*") { + result = "/*"; + return result; + } + + // unify suffix + if (result.EndsWith('$')) { + 
result.pop_back(); + } else if (!result.EndsWith('*')) { + result.append('*'); + } + + return result; +} + +// Prefix rules +bool IsPrefixRule(TStringBuf rule) { + return rule.EndsWith('*') && !TStringBuf(rule.begin(), rule.end() - 1).Contains('*'); +} + +// Converts rule to internal representation, i.e. +// For prefix rules: "/foo", 'D' -> 'D', "/foo" +// For generic rules: "/*foo", 'D' -> ("/*/*foo*", 'd') or ("/*foo$", 'A') -> ("/*foo", 'a') +// The distinction is in uppercase/lowercase rule type +std::pair<TString, char> ConvertRule(TStringBuf rule, char type) { + switch (type) { + case 'H': + case 'S': + case 'C': + case 'P': + return {TString(rule), type}; + case 'A': + case 'D': + break; + default: + return {{}, type}; + } + + auto result = NormalizeRule(rule); + if (IsPrefixRule(result)) { + result.pop_back(); // remove extra '*' from the end + } else { + type = tolower(type); + } + + return {std::move(result), type}; +} + +} // namespace + +TPrefixTreeRobotsTxtRulesHandler::TPrefixTreeRobotsTxtRulesHandler( + TBotIdSet supportedBotIds, + int robotsMaxSize, + int maxRulesNumber, + bool saveDataForAnyBot) + : TRobotsTxtRulesHandlerBase(supportedBotIds, robotsMaxSize, maxRulesNumber, saveDataForAnyBot) +{} + +TPrefixTreeRobotsTxtRulesHandler::TPrefixTreeRobotsTxtRulesHandler( + std::initializer_list<ui32> supportedBotIds, + int robotsMaxSize, + int maxRulesNumber, + bool saveDataForAnyBot) + : TRobotsTxtRulesHandlerBase(TBotIdSet(supportedBotIds), robotsMaxSize, maxRulesNumber, saveDataForAnyBot) +{} + +TPrefixTreeRobotsTxtRulesHandler::TPrefixTreeRobotsTxtRulesHandler( + const TSet<ui32>& supportedBotIds, + int robotsMaxSize, + int maxRulesNumber, + bool saveDataForAnyBot) + : TRobotsTxtRulesHandlerBase(supportedBotIds, robotsMaxSize, maxRulesNumber, saveDataForAnyBot) +{} + +bool TPrefixTreeRobotsTxtRulesHandler::Empty(const ui32 botId) const { + const auto& botInfo = BotIdToPrefixTreeBotInfo[GetNotOptimizedBotId(botId)]; + return !botInfo || (botInfo->BufferPosition <= sizeof(botInfo->BufferPosition)); +} + +TRobotsTxtRulesIterator TPrefixTreeRobotsTxtRulesHandler::GetRulesIterator(const ui32 botId) const { + const auto& botInfo = BotIdToPrefixTreeBotInfo[GetNotOptimizedBotId(botId)]; + if (!botInfo) { + return {}; + } + return TRobotsTxtRulesIterator(botInfo->Buffer.Get() + sizeof(botInfo->BufferPosition), botInfo->Buffer.Get() + botInfo->BufferPosition); +} + +size_t TPrefixTreeRobotsTxtRulesHandler::GetMemorySize() { + size_t allBotsSize = 0; + for (const auto& botInfo : BotIdToPrefixTreeBotInfo) { + if (!botInfo) { + continue; + } + + allBotsSize += botInfo->PrefixRules.GetMemorySize() + + botInfo->BufferSize * sizeof(char) + + botInfo->ComplexRulesSize * sizeof(char**) + + botInfo->RulesSize * sizeof(char*) + (1 << 8); + } + return allBotsSize; +} + +void TPrefixTreeRobotsTxtRulesHandler::ClearInternal(const ui32 botId) { + if (botId >= BotIdToPrefixTreeBotInfo.size()) { + return; + } + BotIdToPrefixTreeBotInfo[botId].Reset(); + TRobotsTxtRulesHandlerBase::ClearInternal(botId); +} + +bool TPrefixTreeRobotsTxtRulesHandler::OptimizeSize() { + ResetOptimized(); + + TMap<ui64, ui32> hashToBotId; + for (auto botId : LoadedBotIds) { + auto& botInfo = BotIdToPrefixTreeBotInfo[botId]; + if (botInfo->BufferPosition <= sizeof(ui32)) { + botInfo.Reset(); + LoadedBotIds.remove(botId); + continue; + } + + ui64 hash = FnvHash<ui64>(botInfo->Buffer.Get(), botInfo->BufferPosition); + if (auto p = hashToBotId.FindPtr(hash)) { + OptimizedBotIdToStoredBotId[botId] = *p; + ClearInternal(botId); 
+ botInfo.Reset(); + } else { + hashToBotId[hash] = botId; + } + } + + if (IsFullTotal()) { + DoAllowAll(); + return false; + } + + return true; +} + +void TPrefixTreeRobotsTxtRulesHandler::Clear() { + for (size_t botId = 0; botId < robotstxtcfg::max_botid; ++botId) + if (IsBotIdSupported(botId)) + ClearInternal(botId); + TRobotsTxtRulesHandlerBase::Clear(); +} + +void TPrefixTreeRobotsTxtRulesHandler::ResizeBuffer(const ui32 botId, int newSize) { + auto& botInfo = GetInfo(botId); + TArrayHolder<char> newBuffer(new char[newSize]); + memcpy(newBuffer.Get(), botInfo.Buffer.Get(), std::min(botInfo.BufferSize, newSize)); + botInfo.Buffer.Swap(newBuffer); + botInfo.BufferSize = newSize; +} + +bool TPrefixTreeRobotsTxtRulesHandler::AddRule(const ui32 botId, TStringBuf rule, char type) { + if (rule.empty() || rule.Contains('\0')) { + return true; + } + + auto& botInfo = GetInfo(botId); + + if (IsFull(botId, rule.size())) { + DoAllowAll(); + return false; + } + + auto [convertedRule, convertedType] = ConvertRule(rule, type); + const auto len = convertedRule.size() + 2; // 1 byte for convertedType and another for '\0' + + if (auto newPos = botInfo.BufferPosition + len; newPos >= size_t(botInfo.BufferSize)) { + size_t newSize = botInfo.BufferSize; + while (newPos >= newSize) + newSize *= 2; + ResizeBuffer(botId, newSize); + } + + auto out = botInfo.Buffer.Get() + botInfo.BufferPosition; + *out++ = convertedType; + strcpy(out, convertedRule.data()); + botInfo.BufferPosition += len; + + if (type == 'A' || type == 'D') { + botInfo.RulesPosition++; + } + + return true; +} + +const char* TPrefixTreeRobotsTxtRulesHandler::GetRule(const ui32 botId, const char* s, char type) const { + const auto& botInfo = BotIdToPrefixTreeBotInfo[GetNotOptimizedBotId(botId)]; + if (!botInfo) { + return nullptr; + } + + int m = botInfo->RulesPosition + 1; + int k = botInfo->PrefixRules.MinPrefixIndex(s); + if (k >= 0) + m = k; + char* rule; + int j; + for (int i = 0; i < botInfo->ComplexRulesPosition; ++i) { + rule = *botInfo->ComplexRules.Get()[i]; + j = botInfo->ComplexRules.Get()[i] - botInfo->Rules.Get(); + if (j >= m) + break; + if (CheckRule(s, rule)) { + m = j; + break; + } + } + if (m >= botInfo->RulesPosition) + return nullptr; + return toupper(*(botInfo->Rules.Get()[m] - 1)) == type ? botInfo->Rules.Get()[m] : nullptr; +} + +inline bool TPrefixTreeRobotsTxtRulesHandler::IsAllowAll(const ui32 botId) const { + const auto id = GetMappedBotId(botId, false); + auto& botInfo = BotIdToPrefixTreeBotInfo[id ? 
*id : robotstxtcfg::id_anybot]; + return botInfo && botInfo->AllowAll; +} + +inline bool TPrefixTreeRobotsTxtRulesHandler::IsAllowAll() const { + for (ui32 botId = 0; botId < robotstxtcfg::max_botid; ++botId) + if (robotstxtcfg::IsYandexBotId(botId) && IsBotIdSupported(botId) && !IsAllowAll(botId)) { + return false; + } + + return true; +} + +inline bool TPrefixTreeRobotsTxtRulesHandler::IsDisallowAll(const ui32 botId, bool useAny) const { + const auto id = GetMappedBotId(botId, false); + if (id) { + const auto& botInfo = BotIdToPrefixTreeBotInfo[*id]; + return botInfo && botInfo->DisallowAll; + } + + auto& botInfo = BotIdToPrefixTreeBotInfo[robotstxtcfg::id_anybot]; + return useAny && botInfo && botInfo->DisallowAll; +} + +inline bool TPrefixTreeRobotsTxtRulesHandler::IsDisallowAll() const { + for (ui32 botId = 0; botId < robotstxtcfg::max_botid; ++botId) + if (robotstxtcfg::IsYandexBotId(botId) && IsBotIdSupported(botId) && !IsDisallowAll(botId)) + return false; + + return true; +} + +void TPrefixTreeRobotsTxtRulesHandler::DoAllowAll() { + using robotstxtcfg::id_anybot; + + // Drop all bots to default + SupportedBotIds.insert(id_anybot); + for (ui32 botId = 0; botId < robotstxtcfg::max_botid; ++botId) { + if (IsBotIdSupported(botId)) { + ClearInternal(botId); + OptimizedBotIdToStoredBotId[botId] = id_anybot; + LoadedBotIds.insert(botId); + } + } + + // Initialize anybot with "allow all" rule + AddRule(id_anybot, "/", 'A'); + GetInfo(id_anybot).AllowAll = true; + SaveRulesToBuffer(); +} + +void TPrefixTreeRobotsTxtRulesHandler::DoDisallowAll() { + for (ui32 botId = 0; botId < robotstxtcfg::max_botid; ++botId) { + if (!IsBotIdSupported(botId)) + continue; + ClearInternal(botId); + if (botId == robotstxtcfg::id_anybot) { + auto& botInfo = GetInfo(botId); + AddRule(botId, "/", 'D'); + botInfo.DisallowAll = true; + SaveRulesToBuffer(); + } else { + OptimizedBotIdToStoredBotId[botId] = robotstxtcfg::id_anybot; + } + LoadedBotIds.insert(botId); + } +} + +const char* TPrefixTreeRobotsTxtRulesHandler::IsDisallow(const ui32 botId, const char* s, bool useAny) const { + const auto id = GetMappedBotId(botId, useAny); + if (!id) + return nullptr; + + const auto& botInfo = BotIdToPrefixTreeBotInfo[*id]; + if (botInfo && IsDisallowAll(*id, useAny)) { + int index = (const_cast<TPrefixTreeRobotsTxtRulesHandler*>(this))->FindRuleAll(*botInfo, 'D'); + if (index < 0) { //o_O + return botInfo->Rules.Get()[0]; + } else { + return botInfo->Rules.Get()[index]; + } + } + + return GetRule(*id, s, 'D'); +} + +const char* TPrefixTreeRobotsTxtRulesHandler::IsAllow(const ui32 botId, const char* s) const { + const auto id = GetMappedBotId(botId, true); + if (auto p = GetRule(*id, s, 'A')) + return p; + return GetRule(*id, s, 'D') ? 
nullptr : "/"; +} + +int TPrefixTreeRobotsTxtRulesHandler::StrLenWithoutStars(const char* s) { + int len = 0; + + for (size_t index = 0; s[index]; ++index) { + if (s[index] != '*') { + ++len; + } + } + + return len; +} + +int TPrefixTreeRobotsTxtRulesHandler::TraceBuffer(const ui32 botId, int countRules, const TArrayHolder<TRuleInfo>* ruleInfos) { + CheckBotIdValidity(botId); + auto& prefixBotInfo = GetInfo(botId); + TBotInfo& botInfo = BotIdToInfo[botId]; + + bool store = countRules >= 0; + if (store) { + prefixBotInfo.Rules.Reset(new char*[prefixBotInfo.RulesSize = countRules]); + } + + int beg = -1, n = 0; + *((int*)prefixBotInfo.Buffer.Get()) = prefixBotInfo.BufferSize; + for (size_t i = sizeof(prefixBotInfo.BufferPosition); i < prefixBotInfo.BufferPosition; ++i) + if (prefixBotInfo.Buffer.Get()[i] == '\n' || prefixBotInfo.Buffer.Get()[i] == 0) { + if (beg < 0 || beg + 1 == (int)i) + continue; + + char* s = prefixBotInfo.Buffer.Get() + beg; + if (store) { + switch (*s) { + case 'H': + HostDirective = s + 1; + break; + case 'S': + SiteMaps.insert(s + 1); + break; + case 'C': + ParseCrawlDelay(s + 1, botInfo.CrawlDelay); + break; + case 'P': + CleanParams.insert(s + 1); + break; + default: + prefixBotInfo.Rules.Get()[n] = s + 1; + (*ruleInfos).Get()[n].Len = StrLenWithoutStars(s + 1); + (*ruleInfos).Get()[n].Allow = toupper(*s) == 'A'; + + prefixBotInfo.HasAllow |= toupper(*s) == 'A'; + prefixBotInfo.HasDisallow |= toupper(*s) == 'D'; + break; + } + } + n += (*s != 'H' && *s != 'S' && *s != 'C' && *s != 'P'); + beg = -1; + } else if (beg < 0) + beg = i; + + return n; +} + +int TPrefixTreeRobotsTxtRulesHandler::FindRuleAll(const TPrefixTreeBotInfo& prefixBotInfo, const char neededType) { + static const char* all[] = {"*", "/", "*/", "/*", "*/*"}; + for (int ruleNumber = prefixBotInfo.RulesSize - 1; ruleNumber >= 0; --ruleNumber) { + const char* curRule = prefixBotInfo.Rules.Get()[ruleNumber]; + char ruleType = *(curRule - 1); + + if (strlen(curRule) > 3) + break; + if (neededType != ruleType) + continue; + + for (size_t i = 0; i < sizeof(all) / sizeof(char*); ++i) + if (strcmp(all[i], curRule) == 0) + return ruleNumber; + } + return -1; +} + +bool TPrefixTreeRobotsTxtRulesHandler::HasDisallowRulePrevAllowAll(const TPrefixTreeBotInfo& prefixBotInfo, int ruleAllAllow) { + for (int ruleNumber = ruleAllAllow - 1; ruleNumber >= 0; --ruleNumber) { + const char* curRule = prefixBotInfo.Rules.Get()[ruleNumber]; + char ruleType = *(curRule - 1); + if (tolower(ruleType) == 'd') + return true; + } + return false; +} + +bool TPrefixTreeRobotsTxtRulesHandler::CheckAllowDisallowAll(const ui32 botId, const bool checkDisallow) { + CheckBotIdValidity(botId); + + auto& botInfo = GetInfo(botId); + + if (botInfo.RulesSize == 0) + return !checkDisallow; + if (botInfo.RulesPosition <= 0) + return 0; + + if (checkDisallow) + return !botInfo.HasAllow && FindRuleAll(botInfo, 'D') >= 0; + int ruleAllAllow = FindRuleAll(botInfo, 'A'); + if (ruleAllAllow == -1) + return !botInfo.HasDisallow; + return !HasDisallowRulePrevAllowAll(botInfo, ruleAllAllow); +} + +void TPrefixTreeRobotsTxtRulesHandler::SortRules( + TPrefixTreeBotInfo& prefixBotInfo, + size_t count, + const TArrayHolder<TRuleInfo>* ruleInfos) { + TVector<size_t> indexes(count); + for (size_t index = 0; index < count; ++index) + indexes[index] = index; + + TRulesSortFunc sortFunc(ruleInfos); + std::sort(indexes.begin(), indexes.end(), sortFunc); + + TArrayHolder<char*> workingCopy; + workingCopy.Reset(new char*[count]); + + for (size_t index = 0; index < 
count; ++index) + workingCopy.Get()[index] = prefixBotInfo.Rules.Get()[index]; + for (size_t index = 0; index < count; ++index) + prefixBotInfo.Rules.Get()[index] = workingCopy.Get()[indexes[index]]; +} + +void TPrefixTreeRobotsTxtRulesHandler::SaveRulesToBuffer() { + // as sitemaps, clean-params and HostDirective from prefix tree was deleted + for (const auto& sitemap: SiteMaps) + AddRule(robotstxtcfg::id_anybot, sitemap, 'S'); + for (const auto& param : CleanParams) + AddRule(robotstxtcfg::id_anybot, param, 'P'); + if (!HostDirective.empty()) + AddRule(robotstxtcfg::id_anybot, HostDirective, 'H'); +} + +void TPrefixTreeRobotsTxtRulesHandler::SaveRulesFromBuffer(const ui32 botId) { + CheckBotIdValidity(botId); + + auto& botInfo = GetInfo(botId); + + TArrayHolder<TRuleInfo> ruleInfos; + + int n = TraceBuffer(botId, -1, nullptr), countPrefix = 0; + ruleInfos.Reset(new TRuleInfo[n]); + botInfo.RulesPosition = TraceBuffer(botId, n, &ruleInfos); + assert(botInfo.RulesPosition == n); + + SortRules(botInfo, n, &ruleInfos); + + botInfo.DisallowAll = CheckAllowDisallowAll(botId, true); + botInfo.AllowAll = CheckAllowDisallowAll(botId, false); + + for (int i = 0; i < n; ++i) + countPrefix += !!isupper(*(botInfo.Rules.Get()[i] - 1)); + + botInfo.PrefixRules.Init(countPrefix); + botInfo.ComplexRules.Reset(new char**[botInfo.ComplexRulesSize = n - countPrefix]); + botInfo.ComplexRulesPosition = 0; + + for (int i = 0; i < n; ++i) { + char* s = botInfo.Rules.Get()[i]; + if (isupper(*(s - 1))) + botInfo.PrefixRules.Add(s, i); + else + botInfo.ComplexRules.Get()[botInfo.ComplexRulesPosition++] = &botInfo.Rules.Get()[i]; + } + botInfo.PrefixRules.Compress(); +} + +void TPrefixTreeRobotsTxtRulesHandler::AfterParse(const ui32 botId) { + CheckBotIdValidity(botId); + + auto& botInfo = GetInfo(botId); + + ResizeBuffer(botId, botInfo.BufferPosition); + SaveRulesFromBuffer(botId); + + if (botInfo.RulesPosition == 0) { + AddRule(botId, "/", 'A'); + } +} + +TPrefixTreeRobotsTxtRulesHandler::TPrefixTreeBotInfo& TPrefixTreeRobotsTxtRulesHandler::GetInfo(ui32 botId) { + Y_ENSURE(botId < robotstxtcfg::max_botid); + auto& res = BotIdToPrefixTreeBotInfo[botId]; + if (!res) { + res = MakeHolder<TPrefixTreeBotInfo>(); + } + return *res; +} + +bool TPrefixTreeRobotsTxtRulesHandler::CheckRule(const char* s, const char* rule) { + const char* r = rule; + const char* s_end = s + strlen(s); + const char* r_end = r + strlen(r); + // assert( r && !strstr(r, "**") ); + for (; *s; ++s) { + if ((s_end - s + 1) * 2 < (r_end - r)) + return 0; + while (*r == '*') + ++r; + + if (*s == *r) { + ++r; + } else { + while (r != rule && *r != '*') + --r; + + if (*r != '*') + return 0; + if (*r == '*') + ++r; + if (*r == *s) + ++r; + } + } + return !*r || (!*(r + 1) && *r == '*'); +} + +bool TPrefixTreeRobotsTxtRulesHandler::IsFull(ui32 botId, size_t length) const { + Y_ENSURE(botId < robotstxtcfg::max_botid); + const auto& botInfo = BotIdToPrefixTreeBotInfo[botId]; + if (!botInfo) { + return false; + } + + return (size_t(botInfo->RulesPosition) >= MaxRulesNumber) || (botInfo->BufferPosition + length + 300 > size_t(RobotsMaxSize)); +} + +bool TPrefixTreeRobotsTxtRulesHandler::IsFullTotal() const { + size_t allBotsRulesCount = 0; + size_t allBotsBufferSize = 0; + + for (const auto& botInfo : BotIdToPrefixTreeBotInfo) { + if (botInfo) { + allBotsRulesCount += botInfo->RulesPosition; + allBotsBufferSize += botInfo->BufferPosition; + } + } + + return (allBotsRulesCount >= MaxRulesNumber) || (allBotsBufferSize + 300 > size_t(RobotsMaxSize)); +} + 
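A minimal usage sketch of the interface this translation unit implements: ParseRules() drives a TRobotsTxtParser into a TRobotsTxt handler, IsDisallow()/IsAllow() (defined above) answer per-path queries, and GetPacked()/LoadPacked() (defined just below) round-trip the packed representation. This snippet is not part of the commit; the sample robots.txt text, the chosen bot id and the status handling are illustrative assumptions, and it presumes the headers added by this change plus the usual util stream classes.

#include <library/cpp/robots_txt/robots_txt.h>

#include <util/generic/string.h>
#include <util/stream/str.h>

int main() {
    const TString robotsTxt =
        "User-agent: Yandex\n"
        "Disallow: /private\n"
        "Allow: /private/docs\n";

    TStringInput in(robotsTxt);
    TRobotsTxtParser parser(in);

    // TRobotsTxt is an alias for TPrefixTreeRobotsTxtRulesHandler; by default it keeps
    // rules for robotstxtcfg::defaultSupportedBotIds (the Yandex bot).
    TRobotsTxt rules;

    // The handler inherits OnHost(), so here it doubles as the host handler argument.
    // Per the ParseRules() implementation in robots_txt.h, 2 signals a hard error
    // (for example, the size limits were exceeded).
    if (TRobotsTxtRulesHandlerBase::ParseRules(parser, &rules, &rules) == 2) {
        return 1;
    }

    // A non-null result is the matched rule text; nullptr from IsDisallow() means "not blocked".
    const char* blockedBy = rules.IsDisallow(robotstxtcfg::id_yandexbot, "/private/secret.html");
    const char* allowedBy = rules.IsAllow(robotstxtcfg::id_yandexbot, "/private/docs/readme.html");

    // Serialize and restore through the packed format produced by GetPacked().
    const char* packed = nullptr;
    const size_t packedSize = rules.GetPacked(packed);
    TRobotsTxt restored;
    restored.LoadPacked(packed, packed + packedSize);

    return (blockedBy != nullptr && allowedBy != nullptr) ? 0 : 1;
}

Note that GetPacked() hands back a pointer into a thread-local static buffer (Y_STATIC_THREAD(TBuffer)), so the packed data has to be consumed or copied before the next GetPacked() call on the same thread.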
+size_t TPrefixTreeRobotsTxtRulesHandler::GetPacked(const char*& data) const { + Y_STATIC_THREAD(TBuffer) + packedRepresentation; + + // calculate size, needed for packed data + size_t totalPackedSize = sizeof(ui32); // num of botids + ui32 numOfSupportedBots = 0; + + for (size_t botId = 0; botId < robotstxtcfg::max_botid; ++botId) { + if (!IsBotIdSupported(botId)) { + continue; + } + + const auto& botInfo = BotIdToPrefixTreeBotInfo[GetNotOptimizedBotId(botId)]; + // botId + packedDataSize + packedData + totalPackedSize += sizeof(ui32) + (botInfo ? botInfo->BufferPosition : sizeof(ui32)); + ++numOfSupportedBots; + } + + ((TBuffer&)packedRepresentation).Reserve(totalPackedSize); + + // fill packed data + char* packedPtr = ((TBuffer&)packedRepresentation).Data(); + + *((ui32*)packedPtr) = numOfSupportedBots; + packedPtr += sizeof(ui32); + + for (size_t botId = 0; botId < robotstxtcfg::max_botid; ++botId) { + if (!IsBotIdSupported(botId)) { + continue; + } + + const auto& botInfo = BotIdToPrefixTreeBotInfo[GetNotOptimizedBotId(botId)]; + memcpy(packedPtr, &botId, sizeof(ui32)); + packedPtr += sizeof(ui32); + + if (botInfo) { + *((ui32*)botInfo->Buffer.Get()) = botInfo->BufferPosition; + memcpy(packedPtr, botInfo->Buffer.Get(), botInfo->BufferPosition); + packedPtr += botInfo->BufferPosition; + } else { + // In absense of bot info we serialize only size of its buffer, which is 4 because it takes 4 bytes + ui32 emptyBufferPosition = sizeof(ui32); + memcpy(packedPtr, &emptyBufferPosition, sizeof(ui32)); + packedPtr += sizeof(ui32); + } + } + + data = ((TBuffer&)packedRepresentation).Data(); + return totalPackedSize; +} + +void TPrefixTreeRobotsTxtRulesHandler::LoadPacked(const char* botsData, const char* botsDataEnd) { + Clear(); + + if (Y_UNLIKELY(botsDataEnd != nullptr && botsData >= botsDataEnd)) { + ythrow yexception() << "Buffer overflow"; + } + + ui32 numOfBots = *((ui32*)botsData); + botsData += sizeof(ui32); + + for (ui32 botIndex = 0; botIndex < numOfBots; ++botIndex) { + if (Y_UNLIKELY(botsDataEnd != nullptr && botsData >= botsDataEnd)) { + ythrow yexception() << "Buffer overflow"; + } + + ui32 botId = 0; + memcpy(&botId, botsData, sizeof(ui32)); + botsData += sizeof(ui32); + + // skip bot id's, that not supported for now + if (botId >= robotstxtcfg::max_botid || !IsBotIdSupported(botId)) { + if (Y_UNLIKELY(botsDataEnd != nullptr && botsData >= botsDataEnd)) { + ythrow yexception() << "Buffer overflow"; + } + + ui32 oneBotPackedSize = 0; + memcpy(&oneBotPackedSize, botsData, sizeof(ui32)); + botsData += oneBotPackedSize; + + continue; + } + + //SupportedBotIds.insert(botId); + + auto& botInfo = GetInfo(botId); + + if (Y_UNLIKELY(botsDataEnd != nullptr && botsData >= botsDataEnd)) { + ythrow yexception() << "Buffer overflow"; + } + + static_assert(sizeof(botInfo.BufferSize) == sizeof(ui32), "BufferSize must be 4 bytes"); + static_assert(sizeof(botInfo.BufferPosition) == sizeof(ui32), "BufferPosition must be 4 bytes"); + + memcpy(&botInfo.BufferSize, botsData, sizeof(ui32)); + memcpy(&botInfo.BufferPosition, botsData, sizeof(ui32)); + + if (Y_UNLIKELY(botsDataEnd != nullptr && (botsData + botInfo.BufferSize) > botsDataEnd)) { + ythrow yexception() << "Buffer overflow"; + } + + botInfo.Buffer.Reset(new char[botInfo.BufferSize]); + memcpy(botInfo.Buffer.Get(), botsData, botInfo.BufferSize); + SaveRulesFromBuffer(botId); + + if (botInfo.BufferSize > (int)sizeof(ui32)) { // empty data for robots means, that we don't have section for this bot + LoadedBotIds.insert(botId); + } + + botsData += 
botInfo.BufferSize; + } + + OptimizeSize(); +} + +void TPrefixTreeRobotsTxtRulesHandler::Dump(const ui32 botId, FILE* dumpFile) { + if (!dumpFile) + dumpFile = stderr; + fprintf(dumpFile, "User-Agent: %s\n", robotstxtcfg::GetFullName(botId).data()); + for (TRobotsTxtRulesIterator it = GetRulesIterator(botId); it.HasRule(); it.Next()) + fprintf(dumpFile, "%s: %s\n", DirTypeToName(it.GetRuleType()), it.GetInitialRule().data()); +} + +void TPrefixTreeRobotsTxtRulesHandler::Dump(const ui32 botId, IOutputStream& out) { + out << "User-Agent: " << robotstxtcfg::GetFullName(botId) << Endl; + for (TRobotsTxtRulesIterator it = GetRulesIterator(botId); it.HasRule(); it.Next()) + out << DirTypeToName(it.GetRuleType()) << ": " << it.GetInitialRule() << Endl; +} diff --git a/library/cpp/robots_txt/robots_txt.h b/library/cpp/robots_txt/robots_txt.h new file mode 100644 index 0000000000..5ee48fb14f --- /dev/null +++ b/library/cpp/robots_txt/robots_txt.h @@ -0,0 +1,605 @@ +#pragma once + +#include "constants.h" +#include "robots_txt_parser.h" +#include "prefix_tree.h" +#include "robotstxtcfg.h" + +#include <util/generic/noncopyable.h> +#include <util/generic/map.h> +#include <util/generic/maybe.h> +#include <util/generic/ptr.h> +#include <util/generic/set.h> + +#include <array> +#include <utility> + + +enum EDirectiveType { + USER_AGENT = 1, + DISALLOW = 2, + ALLOW = 3, + HOST = 4, + SITEMAP = 5, + CRAWL_DELAY = 6, + CLEAN_PARAM = 7, + UNKNOWN = 9, +}; + +enum EFormatErrorType { + ERROR_RULE_NOT_SLASH = 1, + ERROR_ASTERISK_MULTI = 2, + ERROR_HOST_MULTI = 3, + ERROR_ROBOTS_HUGE = 4, + ERROR_RULE_BEFORE_USER_AGENT = 5, + ERROR_RULE_HUGE = 6, + ERROR_HOST_FORMAT = 7, + ERROR_TRASH = 8, + ERROR_SITEMAP_FORMAT = 9, + ERROR_CRAWL_DELAY_FORMAT = 10, + ERROR_CRAWL_DELAY_MULTI = 11, + ERROR_CLEAN_PARAM_FORMAT = 12, + + WARNING_EMPTY_RULE = 30, + WARNING_SUSPECT_SYMBOL = 31, + WARNING_UNKNOWN_FIELD = 33, + WARNING_UPPER_REGISTER = 34, + WARNING_SITEMAP = 35, +}; + +class TRobotsTxtRulesIterator { +private: + const char* Begin = nullptr; + const char* End = nullptr; + +public: + TRobotsTxtRulesIterator() = default; + TRobotsTxtRulesIterator(const char* begin, const char* end); + void Next(); + bool HasRule() const; + const char* GetRule() const; + TString GetInitialRule() const; // unlike GetRule(), it neither omits trailing '$' nor adds redundant '*' + EDirectiveType GetRuleType() const; + + static EDirectiveType CharToDirType(char ch); +}; + +class TRobotsTxtRulesHandlerBase { +public: + typedef TVector<std::pair<EFormatErrorType, int>> TErrorVector; + + TRobotsTxtRulesHandlerBase( + TBotIdSet supportedBotIds, + int robotsMaxSize, + int maxRulesNumber, + bool saveDataForAnyBot); + + TRobotsTxtRulesHandlerBase( + const TSet<ui32>& supportedBotIds, + int robotsMaxSize, + int maxRulesNumber, + bool saveDataForAnyBot); + + virtual ~TRobotsTxtRulesHandlerBase(); + + int GetCrawlDelay(ui32 botId, bool* realInfo = nullptr) const; + int GetMinCrawlDelay(int defaultCrawlDelay = -1) const; + bool IsHandlingErrors() const; + const TString& GetHostDirective() const; + const TVector<TString> GetSiteMaps() const; + const TVector<TString> GetCleanParams() const; + const TErrorVector& GetErrors() const; + TVector<int> GetAcceptedLines(ui32 botId = robotstxtcfg::id_yandexbot) const; + + template <class THostHandler> + static int ParseRules(TRobotsTxtParser& parser, TRobotsTxtRulesHandlerBase* rulesHandler, THostHandler* hostHandler, const char* host = nullptr); + static inline void ClearAllExceptCrossSection(TRobotsTxtParser& 
parser, TRobotsTxtRulesHandlerBase* rulesHandler, ui32 botId); + static int CheckHost(const char* host); + static int CheckSitemapUrl(const char* url, const char* host, TString& modifiedUrl); + static int CheckRule(const char* value, int line, TRobotsTxtRulesHandlerBase* rulesHandler); + static int CheckAndNormCleanParam(TString& s); + static int ParseCrawlDelay(const char* value, int& crawlDelay); + static EDirectiveType NameToDirType(const char* d); + static const char* DirTypeToName(EDirectiveType t); + + void SetErrorsHandling(bool handleErrors); + void SetHostDirective(const char* hostDirective); + void SetCrawlDelay(ui32 botId, int crawlDelay); + void AddAcceptedLine(ui32 line, const TBotIdSet& botIds, bool isCrossSection); + void AddSiteMap(const char* sitemap); + void AddCleanParam(const char* cleanParam); + bool AddRuleWithErrorCheck(ui32 botId, TStringBuf rule, char type, TRobotsTxtParser& parser); + int OnHost(ui32 botId, TRobotsTxtParser& parser, const char* value, TRobotsTxtRulesHandlerBase*& rulesHandler); + + virtual void Clear(); + virtual bool IsAllowAll(ui32 botId) const = 0; + virtual bool IsAllowAll() const = 0; + virtual bool IsDisallowAll(ui32 botId, bool useAny = true) const = 0; + virtual bool IsDisallowAll() const = 0; + virtual const char* IsDisallow(ui32 botId, const char* s, bool useAny = true) const = 0; + virtual const char* IsAllow(ui32 botId, const char* s) const = 0; + virtual TRobotsTxtRulesIterator GetRulesIterator(ui32 botId) const = 0; + virtual void Dump(ui32 botId, FILE* logFile) = 0; + virtual void Dump(ui32 botId, IOutputStream& out) = 0; + virtual bool Empty(ui32 botId) const = 0; + virtual void LoadPacked(const char* botsData, const char* botsDataEnd = nullptr) = 0; + virtual size_t GetPacked(const char*& data) const = 0; + virtual void AfterParse(ui32 botId) = 0; + virtual void DoAllowAll() = 0; + virtual void DoDisallowAll() = 0; + bool IsBotIdLoaded(ui32 botId) const; + bool IsBotIdSupported(ui32 botId) const; + ui32 GetNotOptimizedBotId(ui32 botId) const; + TMaybe<ui32> GetMappedBotId(ui32 botId, bool useAny = true) const; + +protected: + void CheckBotIdValidity(ui32 botId) const; + virtual bool OptimizeSize() = 0; + +private: + bool HandleErrors; + +protected: + struct TBotInfo { + int CrawlDelay; + + TBotInfo() + : CrawlDelay(-1) + { + } + }; + + TBotIdSet LoadedBotIds; + TSet<TString> SiteMaps; + TSet<TString> CleanParams; + TString HostDirective; + TErrorVector Errors; + typedef std::pair<ui32, ui32> TBotIdAcceptedLine; + TVector<TBotIdAcceptedLine> AcceptedLines; + TVector<ui32> CrossSectionAcceptedLines; + + TVector<TBotInfo> BotIdToInfo; + int CrawlDelay; + size_t RobotsMaxSize; + size_t MaxRulesNumber; + bool SaveDataForAnyBot; + + TBotIdSet SupportedBotIds; + std::array<ui8, robotstxtcfg::max_botid> OptimizedBotIdToStoredBotId; + + virtual bool IsFull(ui32 botId, size_t length) const = 0; + virtual bool IsFullTotal() const = 0; + virtual bool AddRule(ui32 botId, TStringBuf rule, char type) = 0; + //parts of ParseRules + inline static void CheckRobotsLines(TRobotsTxtRulesHandlerBase* rulesHandler, TVector<int>& nonRobotsLines); + inline static void CheckAsterisk(TRobotsTxtRulesHandlerBase* rulesHandler, const char* value, ui32 lineNumber, bool& wasAsterisk); + inline static bool CheckWasUserAgent(TRobotsTxtRulesHandlerBase* rulesHandler, bool wasUserAgent, bool& ruleBeforeUserAgent, bool& wasRule, ui32 lineNumber); + inline static bool CheckRuleNotSlash(TRobotsTxtRulesHandlerBase* rulesHandler, const char* value, ui32 lineNumber); + 
inline static bool CheckSupportedBots(const TBotIdSet& currentBotIds, TBotIdSet& wasRuleForBot, const TBotIdSet& isSupportedBot); + inline static bool CheckEmptyRule(TRobotsTxtRulesHandlerBase* rulesHandler, const char* value, EDirectiveType& type, ui32 lineNumber); + inline static bool ProcessSitemap(TRobotsTxtRulesHandlerBase* rulesHandler, TRobotsTxtParser& parser, const char* value, const char* host); + inline static bool ProcessCleanParam(TRobotsTxtRulesHandlerBase* rulesHandler, TRobotsTxtParser& parser, TString& value); + inline static bool AddRules( + TRobotsTxtRulesHandlerBase* rulesHandler, + TRobotsTxtParser& parser, + const char* value, + char type, + const TBotIdSet& currentBotIds, + const TBotIdSet& isSupportedBot); + + inline static bool ProcessCrawlDelay( + TRobotsTxtRulesHandlerBase* rulesHandler, + TRobotsTxtParser& parser, + const TBotIdSet& currentBotIds, + const TBotIdSet& isSupportedBot, + const char* value); + + inline static void ProcessUserAgent( + TRobotsTxtRulesHandlerBase* rulesHandler, + TRobotsTxtParser& parser, + const TBotIdSet& currentBotIds, + TBotIdSet& wasRuleForBot, + TBotIdSet& isSupportedBot, + TVector<ui32>& botIdToMaxAppropriateUserAgentNameLength, + const char* value); + + bool CheckRobot( + const char* userAgent, + TBotIdSet& botIds, + const TVector<ui32>* botIdToMaxAppropriateUserAgentNameLength = nullptr) const; + + virtual void ClearInternal(ui32 botId); + + void AddError(EFormatErrorType type, int line); + + void ResetOptimized() noexcept; +}; + +class TPrefixTreeRobotsTxtRulesHandler: public TRobotsTxtRulesHandlerBase, TNonCopyable { +private: + static const int INIT_BUFFER_SIZE = 1 << 6; + + struct TRuleInfo { + size_t Len; + bool Allow; + }; + + bool IsFull(ui32 botId, size_t length) const override; + bool IsFullTotal() const override; + bool AddRule(ui32 botId, TStringBuf rule, char type) override; + const char* GetRule(ui32 botId, const char* s, char type) const; + void ResizeBuffer(ui32 botId, int newSize); + void SaveRulesFromBuffer(ui32 botId); + int TraceBuffer(ui32 botId, int countRules, const TArrayHolder<TRuleInfo>* ruleInfos); + bool CheckAllowDisallowAll(ui32 botId, bool checkDisallow); + void SaveRulesToBuffer(); + int StrLenWithoutStars(const char* s); + +protected: + class TRulesSortFunc { + private: + const TArrayHolder<TRuleInfo>* RuleInfos; + + public: + TRulesSortFunc(const TArrayHolder<TRuleInfo>* ruleInfos) + : RuleInfos(ruleInfos) + { + } + bool operator()(const size_t& lhs, const size_t& rhs) { + const TRuleInfo& left = (*RuleInfos).Get()[lhs]; + const TRuleInfo& right = (*RuleInfos).Get()[rhs]; + return (left.Len == right.Len) ? 
left.Allow && !right.Allow : left.Len > right.Len; + } + }; + + struct TPrefixTreeBotInfo { + bool DisallowAll = false; + bool AllowAll = false; + bool HasDisallow = false; + bool HasAllow = false; + + TArrayHolder<char> Buffer{new char[INIT_BUFFER_SIZE]}; + ui32 BufferPosition = sizeof(BufferPosition); + int BufferSize = INIT_BUFFER_SIZE; + + TArrayHolder<char*> Rules = nullptr; + int RulesPosition = 0; + int RulesSize = 0; + + TArrayHolder<char**> ComplexRules = nullptr; + int ComplexRulesPosition = 0; + int ComplexRulesSize = 0; + + TPrefixTree PrefixRules {0}; + }; + + std::array<THolder<TPrefixTreeBotInfo>, robotstxtcfg::max_botid> BotIdToPrefixTreeBotInfo; + + TPrefixTreeBotInfo& GetInfo(ui32 botId); + static bool CheckRule(const char* s, const char* rule); + void ClearInternal(ui32 botId) override; + bool OptimizeSize() override; + +private: + void SortRules(TPrefixTreeBotInfo& prefixBotInfo, size_t count, const TArrayHolder<TRuleInfo>* ruleInfos); + bool HasDisallowRulePrevAllowAll(const TPrefixTreeBotInfo& prefixBotInfo, int ruleAllAllow); + int FindRuleAll(const TPrefixTreeBotInfo& prefixBotInfo, char neededType); + +public: + TPrefixTreeRobotsTxtRulesHandler( + TBotIdSet supportedBotIds = robotstxtcfg::defaultSupportedBotIds, + int robotsMaxSize = robots_max, + int maxRulesCount = -1, + bool saveDataForAnyBot = true); + + TPrefixTreeRobotsTxtRulesHandler( + std::initializer_list<ui32> supportedBotIds, + int robotsMaxSize = robots_max, + int maxRulesCount = -1, + bool saveDataForAnyBot = true); + + TPrefixTreeRobotsTxtRulesHandler( + const TSet<ui32>& supportedBotIds, + int robotsMaxSize = robots_max, + int maxRulesCount = -1, + bool saveDataForAnyBot = true); + + void Clear() override; + void AfterParse(ui32 botId) override; + bool IsAllowAll(ui32 botId) const override; + bool IsAllowAll() const override; + bool IsDisallowAll(ui32 botId, bool useAny = true) const override; + bool IsDisallowAll() const override; + const char* IsDisallow(ui32 botId, const char* s, bool useAny = true) const override; + const char* IsAllow(ui32 botId, const char* s) const override; + TRobotsTxtRulesIterator GetRulesIterator(ui32 botId) const override; + void DoAllowAll() override; + void DoDisallowAll() override; + bool Empty(ui32 botId) const override; + + void LoadPacked(const char* botsData, const char* botsDataEnd = nullptr) override; + size_t GetPacked(const char*& data) const override; + void Dump(ui32 botId, FILE* logFile) override; + void Dump(ui32 botId, IOutputStream& out) override; + size_t GetMemorySize(); +}; + +using TRobotsTxt = TPrefixTreeRobotsTxtRulesHandler; + +void TRobotsTxtRulesHandlerBase::ClearAllExceptCrossSection(TRobotsTxtParser& parser, TRobotsTxtRulesHandlerBase* rulesHandler, ui32 botId) { + rulesHandler->ClearInternal(botId); + if (botId == robotstxtcfg::id_anybot) { + // as sitemaps, clean-params and HostDirective from prefix tree was deleted + for (const auto& sitemap : rulesHandler->SiteMaps) { + rulesHandler->AddRuleWithErrorCheck(robotstxtcfg::id_anybot, sitemap, 'S', parser); + } + for (const auto& param : rulesHandler->CleanParams) { + rulesHandler->AddRuleWithErrorCheck(robotstxtcfg::id_anybot, param, 'P', parser); + } + if (!rulesHandler->HostDirective.empty()) { + rulesHandler->AddRuleWithErrorCheck(robotstxtcfg::id_anybot, rulesHandler->HostDirective, 'H', parser); + } + } +} + +void TRobotsTxtRulesHandlerBase::CheckRobotsLines(TRobotsTxtRulesHandlerBase* rulesHandler, TVector<int>& nonRobotsLines) { + if (rulesHandler->IsHandlingErrors()) { + for (size_t i 
= 0; i < nonRobotsLines.size(); ++i) + rulesHandler->AddError(ERROR_TRASH, nonRobotsLines[i]); + nonRobotsLines.clear(); + } +} + +void TRobotsTxtRulesHandlerBase::CheckAsterisk(TRobotsTxtRulesHandlerBase* rulesHandler, const char* value, ui32 lineNumber, bool& wasAsterisk) { + if (strcmp(value, "*") == 0) { + if (wasAsterisk) + rulesHandler->AddError(ERROR_ASTERISK_MULTI, lineNumber); + wasAsterisk = true; + } +} + +bool TRobotsTxtRulesHandlerBase::CheckWasUserAgent(TRobotsTxtRulesHandlerBase* rulesHandler, bool wasUserAgent, bool& ruleBeforeUserAgent, bool& wasRule, ui32 lineNumber) { + if (wasUserAgent) { + wasRule = true; + return false; + } + if (!ruleBeforeUserAgent) { + ruleBeforeUserAgent = true; + rulesHandler->AddError(ERROR_RULE_BEFORE_USER_AGENT, lineNumber); + } + return true; +} + +bool TRobotsTxtRulesHandlerBase::CheckRuleNotSlash(TRobotsTxtRulesHandlerBase* rulesHandler, const char* value, ui32 lineNumber) { + if (*value && *value != '/' && *value != '*') { + rulesHandler->AddError(ERROR_RULE_NOT_SLASH, lineNumber); + return true; + } + return false; +} + +bool TRobotsTxtRulesHandlerBase::CheckSupportedBots( + const TBotIdSet& currentBotIds, + TBotIdSet& wasRuleForBot, + const TBotIdSet& isSupportedBot) +{ + bool hasAtLeastOneSupportedBot = false; + for (ui32 currentBotId : currentBotIds) { + wasRuleForBot.insert(currentBotId); + hasAtLeastOneSupportedBot = hasAtLeastOneSupportedBot || isSupportedBot.contains(currentBotId); + } + return hasAtLeastOneSupportedBot; +} + +bool TRobotsTxtRulesHandlerBase::CheckEmptyRule(TRobotsTxtRulesHandlerBase* rulesHandler, const char* value, EDirectiveType& type, ui32 lineNumber) { + if (value && strlen(value) == 0) { + rulesHandler->AddError(WARNING_EMPTY_RULE, lineNumber); + type = type == ALLOW ? 
DISALLOW : ALLOW; + return true; + } + return false; +} + +bool TRobotsTxtRulesHandlerBase::AddRules( + TRobotsTxtRulesHandlerBase* rulesHandler, + TRobotsTxtParser& parser, + const char* value, + char type, + const TBotIdSet& currentBotIds, + const TBotIdSet& isSupportedBot) +{ + for (ui32 currentBotId : currentBotIds) { + if (!isSupportedBot.contains(currentBotId)) + continue; + if (!rulesHandler->AddRuleWithErrorCheck(currentBotId, value, type, parser)) + return true; + } + return false; +} + +bool TRobotsTxtRulesHandlerBase::ProcessSitemap(TRobotsTxtRulesHandlerBase* rulesHandler, TRobotsTxtParser& parser, const char* value, const char* host) { + TString modifiedUrl; + if (!CheckSitemapUrl(value, host, modifiedUrl)) + rulesHandler->AddError(ERROR_SITEMAP_FORMAT, parser.GetLineNumber()); + else { + rulesHandler->AddSiteMap(modifiedUrl.data()); + if (!rulesHandler->AddRuleWithErrorCheck(robotstxtcfg::id_anybot, modifiedUrl.data(), 'S', parser)) + return true; + } + return false; +} + +bool TRobotsTxtRulesHandlerBase::ProcessCleanParam(TRobotsTxtRulesHandlerBase* rulesHandler, TRobotsTxtParser& parser, TString& value) { + if (!CheckAndNormCleanParam(value)) + rulesHandler->AddError(ERROR_CLEAN_PARAM_FORMAT, parser.GetLineNumber()); + else { + rulesHandler->AddCleanParam(value.data()); + if (!rulesHandler->AddRuleWithErrorCheck(robotstxtcfg::id_anybot, value.data(), 'P', parser)) + return true; + } + return false; +} + +bool TRobotsTxtRulesHandlerBase::ProcessCrawlDelay( + TRobotsTxtRulesHandlerBase* rulesHandler, + TRobotsTxtParser& parser, + const TBotIdSet& currentBotIds, + const TBotIdSet& isSupportedBot, + const char* value) { + for (ui32 currentBotId : currentBotIds) { + if (!isSupportedBot.contains(currentBotId)) + continue; + if (rulesHandler->BotIdToInfo[currentBotId].CrawlDelay >= 0) { + rulesHandler->AddError(ERROR_CRAWL_DELAY_MULTI, parser.GetLineNumber()); + break; + } + int crawlDelay = -1; + if (!ParseCrawlDelay(value, crawlDelay)) + rulesHandler->AddError(ERROR_CRAWL_DELAY_FORMAT, parser.GetLineNumber()); + else { + rulesHandler->SetCrawlDelay(currentBotId, crawlDelay); + if (!rulesHandler->AddRuleWithErrorCheck(currentBotId, value, 'C', parser)) + return true; + } + } + return false; +} + +void TRobotsTxtRulesHandlerBase::ProcessUserAgent( + TRobotsTxtRulesHandlerBase* rulesHandler, + TRobotsTxtParser& parser, + const TBotIdSet& currentBotIds, + TBotIdSet& wasSupportedBot, + TBotIdSet& isSupportedBot, + TVector<ui32>& botIdToMaxAppropriateUserAgentNameLength, + const char* value) +{ + ui32 userAgentNameLength = (ui32)strlen(value); + + for (ui32 currentBotId : currentBotIds) { + bool userAgentNameLonger = userAgentNameLength > botIdToMaxAppropriateUserAgentNameLength[currentBotId]; + bool userAgentNameSame = userAgentNameLength == botIdToMaxAppropriateUserAgentNameLength[currentBotId]; + + if (!wasSupportedBot.contains(currentBotId) || userAgentNameLonger) + ClearAllExceptCrossSection(parser, rulesHandler, currentBotId); + + wasSupportedBot.insert(currentBotId); + if (userAgentNameLonger || userAgentNameSame) { + isSupportedBot.insert(currentBotId); // Allow multiple blocks for the same user agent + } + botIdToMaxAppropriateUserAgentNameLength[currentBotId] = Max(userAgentNameLength, botIdToMaxAppropriateUserAgentNameLength[currentBotId]); + } +} + +template <class THostHandler> +int TRobotsTxtRulesHandlerBase::ParseRules(TRobotsTxtParser& parser, TRobotsTxtRulesHandlerBase* rulesHandler, THostHandler* hostHandler, const char* host) { + rulesHandler->Clear(); + + TBotIdSet 
wasSupportedBot; + TBotIdSet wasRuleForBot; + bool wasAsterisk = false; + TVector<int> nonRobotsLines; + TVector<ui32> botIdToMaxAppropriateUserAgentNameLength(robotstxtcfg::max_botid, 0); + static char all[] = "/"; + EDirectiveType prevType = USER_AGENT; + while (parser.HasRecord()) { + TRobotsTxtRulesRecord record = parser.NextRecord(); + bool wasUserAgent = false; + bool isRobotsRecordUseful = false; + TBotIdSet isSupportedBot; + TBotIdSet currentBotIds; + TString field; + TString value; + bool ruleBeforeUserAgent = false; + int ret = 0; + bool wasRule = false; + bool wasBlank = false; + while (record.NextPair(field, value, isRobotsRecordUseful && rulesHandler->IsHandlingErrors(), nonRobotsLines, &wasBlank)) { + CheckRobotsLines(rulesHandler, nonRobotsLines); + EDirectiveType type = NameToDirType(field.data()); + EDirectiveType typeBeforeChange = type; + + if ((prevType != type || wasBlank) && type == USER_AGENT) { + currentBotIds.clear(); + } + prevType = type; + + switch (type) { + case USER_AGENT: + if (wasUserAgent && wasRule) { + wasRule = false; + currentBotIds.clear(); + isSupportedBot.clear(); + } + wasUserAgent = true; + value.to_lower(); + CheckAsterisk(rulesHandler, value.data(), parser.GetLineNumber(), wasAsterisk); + isRobotsRecordUseful = rulesHandler->CheckRobot(value.data(), currentBotIds, &botIdToMaxAppropriateUserAgentNameLength); + if (isRobotsRecordUseful) + ProcessUserAgent(rulesHandler, parser, currentBotIds, wasSupportedBot, isSupportedBot, botIdToMaxAppropriateUserAgentNameLength, value.data()); + break; + + case DISALLOW: + case ALLOW: + if (CheckWasUserAgent(rulesHandler, wasUserAgent, ruleBeforeUserAgent, wasRule, parser.GetLineNumber())) + break; + if (CheckRuleNotSlash(rulesHandler, value.data(), parser.GetLineNumber())) + break; + CheckRule(value.data(), parser.GetLineNumber(), rulesHandler); + if (!CheckSupportedBots(currentBotIds, wasRuleForBot, isSupportedBot)) { + break; + } + if (CheckEmptyRule(rulesHandler, value.data(), type, parser.GetLineNumber())) { + value = all; + if (typeBeforeChange == ALLOW) + continue; + } + + if (AddRules(rulesHandler, parser, value.data(), type == ALLOW ? 
'A' : 'D', currentBotIds, isSupportedBot)) + return 2; + break; + + case HOST: + value.to_lower(); + ret = hostHandler->OnHost(robotstxtcfg::id_anybot, parser, value.data(), rulesHandler); + if (ret) + return ret; + break; + + case SITEMAP: + if (ProcessSitemap(rulesHandler, parser, value.data(), host)) + return 2; + break; + + case CLEAN_PARAM: + if (ProcessCleanParam(rulesHandler, parser, value)) + return 2; + break; + + case CRAWL_DELAY: + if (ProcessCrawlDelay(rulesHandler, parser, currentBotIds, isSupportedBot, value.data())) + return 2; + break; + + default: + rulesHandler->AddError(WARNING_UNKNOWN_FIELD, parser.GetLineNumber()); + break; + } + bool isCrossSection = type == SITEMAP || type == HOST || type == CLEAN_PARAM; + if (rulesHandler->IsHandlingErrors() && (isRobotsRecordUseful || isCrossSection)) + rulesHandler->AddAcceptedLine(parser.GetLineNumber(), currentBotIds, isCrossSection); + } + } + + for (auto botId : wasSupportedBot) { + rulesHandler->LoadedBotIds.insert(botId); + if (rulesHandler->IsBotIdSupported(botId)) + rulesHandler->AfterParse(botId); + } + + if (!rulesHandler->OptimizeSize()) { + return 2; + } + + return 1; +} diff --git a/library/cpp/robots_txt/robots_txt_parser.cpp b/library/cpp/robots_txt/robots_txt_parser.cpp new file mode 100644 index 0000000000..8e2fe6073d --- /dev/null +++ b/library/cpp/robots_txt/robots_txt_parser.cpp @@ -0,0 +1,116 @@ +#include "robots_txt_parser.h" +#include <util/generic/string.h> +#include <util/stream/output.h> + +TRobotsTxtParser::TRobotsTxtParser(IInputStream& inputStream) + : InputStream(inputStream) + , LineNumber(0) + , IsLastSymbolCR(false) +{ +} + +int TRobotsTxtParser::GetLineNumber() { + return LineNumber; +} + +const char* TRobotsTxtParser::ReadLine() { + Line = ""; + char c; + + if (IsLastSymbolCR) { + if (!InputStream.ReadChar(c)) + return nullptr; + if (c != '\n') + Line.append(c); + } + + bool hasMoreSymbols; + while (hasMoreSymbols = InputStream.ReadChar(c)) { + if (c == '\r') { + IsLastSymbolCR = true; + break; + } else { + IsLastSymbolCR = false; + if (c == '\n') + break; + Line.append(c); + } + } + if (!hasMoreSymbols && Line.empty()) + return nullptr; + + // BOM UTF-8: EF BB BF + if (0 == LineNumber && Line.size() >= 3 && Line[0] == '\xEF' && Line[1] == '\xBB' && Line[2] == '\xBF') + Line = Line.substr(3, Line.size() - 3); + + ++LineNumber; + int i = Line.find('#'); + if (i == 0) + Line = ""; + else if (i > 0) + Line = Line.substr(0, i); + return Line.data(); +} + +bool TRobotsTxtParser::IsBlankLine(const char* s) { + for (const char* p = s; *p; ++p) + if (!isspace(*p)) + return 0; + return 1; +} + +char* TRobotsTxtParser::Trim(char* s) { + while (isspace(*s)) + ++s; + char* p = s + strlen(s) - 1; + while (s < p && isspace(*p)) + --p; + *(p + 1) = 0; + return s; +} + +inline bool TRobotsTxtParser::IsRobotsLine(const char* s) { + return strchr(s, ':'); +} + +bool TRobotsTxtParser::HasRecord() { + while (!IsRobotsLine(Line.data())) + if (!ReadLine()) + return 0; + return 1; +} + +TRobotsTxtRulesRecord TRobotsTxtParser::NextRecord() { + return TRobotsTxtRulesRecord(*this); +} + +TRobotsTxtRulesRecord::TRobotsTxtRulesRecord(TRobotsTxtParser& parser) + : Parser(parser) +{ +} + +bool TRobotsTxtRulesRecord::NextPair(TString& field, TString& value, bool handleErrors, TVector<int>& nonRobotsLines, bool* wasBlank) { + if (wasBlank) { + *wasBlank = false; + } + while (!Parser.IsRobotsLine(Parser.Line.data())) { + if (!Parser.ReadLine()) + return 0; + if (Parser.IsBlankLine(Parser.Line.data())) { + if (wasBlank) { + 
*wasBlank = true; + } + continue; + } + if (handleErrors && !Parser.IsRobotsLine(Parser.Line.data())) + nonRobotsLines.push_back(Parser.GetLineNumber()); + } + + char* s = strchr(Parser.Line.begin(), ':'); + *s = 0; + char* p = s + 1; + + field = TRobotsTxtParser::Trim(strlwr(Parser.Line.begin())); + value = TRobotsTxtParser::Trim(p); + return 1; +} diff --git a/library/cpp/robots_txt/robots_txt_parser.h b/library/cpp/robots_txt/robots_txt_parser.h new file mode 100644 index 0000000000..8032d0d20b --- /dev/null +++ b/library/cpp/robots_txt/robots_txt_parser.h @@ -0,0 +1,38 @@ +#pragma once + +#include <algorithm> +#include <util/generic/string.h> +#include <util/generic/vector.h> +#include <util/stream/input.h> + +class TRobotsTxtParser; + +class TRobotsTxtRulesRecord { +private: + TRobotsTxtParser& Parser; + +public: + TRobotsTxtRulesRecord(TRobotsTxtParser& parser); + bool NextPair(TString& field, TString& value, bool handleErrors, TVector<int>& nonRobotsLines, bool* wasBlank = nullptr); +}; + +class TRobotsTxtParser { + friend class TRobotsTxtRulesRecord; + +private: + IInputStream& InputStream; + TString Line; + int LineNumber; + bool IsLastSymbolCR; + + const char* ReadLine(); + static bool IsBlankLine(const char*); + static bool IsRobotsLine(const char*); + +public: + static char* Trim(char*); + TRobotsTxtParser(IInputStream& inputStream); + bool HasRecord(); + TRobotsTxtRulesRecord NextRecord(); + int GetLineNumber(); +}; diff --git a/library/cpp/robots_txt/robotstxtcfg.h b/library/cpp/robots_txt/robotstxtcfg.h new file mode 100644 index 0000000000..5ca1682a0c --- /dev/null +++ b/library/cpp/robots_txt/robotstxtcfg.h @@ -0,0 +1,3 @@ +#pragma once + +#include <library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.h> diff --git a/library/cpp/robots_txt/robotstxtcfg/CMakeLists.darwin-x86_64.txt b/library/cpp/robots_txt/robotstxtcfg/CMakeLists.darwin-x86_64.txt new file mode 100644 index 0000000000..09cfd4b3f1 --- /dev/null +++ b/library/cpp/robots_txt/robotstxtcfg/CMakeLists.darwin-x86_64.txt @@ -0,0 +1,20 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-robots_txt-robotstxtcfg) +target_link_libraries(cpp-robots_txt-robotstxtcfg PUBLIC + contrib-libs-cxxsupp + yutil + library-cpp-case_insensitive_string +) +target_sources(cpp-robots_txt-robotstxtcfg PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/robotstxtcfg/bot_id_set.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/robotstxtcfg/user_agents.cpp +) diff --git a/library/cpp/robots_txt/robotstxtcfg/CMakeLists.linux-aarch64.txt b/library/cpp/robots_txt/robotstxtcfg/CMakeLists.linux-aarch64.txt new file mode 100644 index 0000000000..6fe7e7a7ad --- /dev/null +++ b/library/cpp/robots_txt/robotstxtcfg/CMakeLists.linux-aarch64.txt @@ -0,0 +1,21 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. 
Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-robots_txt-robotstxtcfg) +target_link_libraries(cpp-robots_txt-robotstxtcfg PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil + library-cpp-case_insensitive_string +) +target_sources(cpp-robots_txt-robotstxtcfg PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/robotstxtcfg/bot_id_set.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/robotstxtcfg/user_agents.cpp +) diff --git a/library/cpp/robots_txt/robotstxtcfg/CMakeLists.linux-x86_64.txt b/library/cpp/robots_txt/robotstxtcfg/CMakeLists.linux-x86_64.txt new file mode 100644 index 0000000000..6fe7e7a7ad --- /dev/null +++ b/library/cpp/robots_txt/robotstxtcfg/CMakeLists.linux-x86_64.txt @@ -0,0 +1,21 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-robots_txt-robotstxtcfg) +target_link_libraries(cpp-robots_txt-robotstxtcfg PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil + library-cpp-case_insensitive_string +) +target_sources(cpp-robots_txt-robotstxtcfg PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/robotstxtcfg/bot_id_set.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/robotstxtcfg/user_agents.cpp +) diff --git a/library/cpp/robots_txt/robotstxtcfg/CMakeLists.txt b/library/cpp/robots_txt/robotstxtcfg/CMakeLists.txt new file mode 100644 index 0000000000..f8b31df0c1 --- /dev/null +++ b/library/cpp/robots_txt/robotstxtcfg/CMakeLists.txt @@ -0,0 +1,17 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-aarch64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + include(CMakeLists.darwin-x86_64.txt) +elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA) + include(CMakeLists.windows-x86_64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-x86_64.txt) +endif() diff --git a/library/cpp/robots_txt/robotstxtcfg/CMakeLists.windows-x86_64.txt b/library/cpp/robots_txt/robotstxtcfg/CMakeLists.windows-x86_64.txt new file mode 100644 index 0000000000..09cfd4b3f1 --- /dev/null +++ b/library/cpp/robots_txt/robotstxtcfg/CMakeLists.windows-x86_64.txt @@ -0,0 +1,20 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). 
These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-robots_txt-robotstxtcfg) +target_link_libraries(cpp-robots_txt-robotstxtcfg PUBLIC + contrib-libs-cxxsupp + yutil + library-cpp-case_insensitive_string +) +target_sources(cpp-robots_txt-robotstxtcfg PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/robotstxtcfg/bot_id_set.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/robotstxtcfg/user_agents.cpp +) diff --git a/library/cpp/robots_txt/robotstxtcfg/bot_id_set.cpp b/library/cpp/robots_txt/robotstxtcfg/bot_id_set.cpp new file mode 100644 index 0000000000..aec668582c --- /dev/null +++ b/library/cpp/robots_txt/robotstxtcfg/bot_id_set.cpp @@ -0,0 +1,2 @@ +#include "bot_id_set.h" +// header compile test diff --git a/library/cpp/robots_txt/robotstxtcfg/bot_id_set.h b/library/cpp/robots_txt/robotstxtcfg/bot_id_set.h new file mode 100644 index 0000000000..08aaa68a50 --- /dev/null +++ b/library/cpp/robots_txt/robotstxtcfg/bot_id_set.h @@ -0,0 +1,132 @@ +#pragma once + +#include "user_agents.h" + +#include <bitset> + + +/// Simple vector-based set for bot ids, meant to optimize memory and lookups +class TBotIdSet +{ +public: + using TData = std::bitset<robotstxtcfg::max_botid>; + + constexpr TBotIdSet() noexcept = default; + constexpr TBotIdSet(const TBotIdSet&) noexcept = default; + constexpr TBotIdSet(TBotIdSet&&) noexcept = default; + constexpr TBotIdSet& operator = (const TBotIdSet&) noexcept = default; + constexpr TBotIdSet& operator = (TBotIdSet&&) noexcept = default; + + TBotIdSet(std::initializer_list<ui32> botIds) { + for (auto id : botIds) { + insert(id); + } + } + + static TBotIdSet All() noexcept { + TBotIdSet res; + res.Bots.set(); + return res; + } + + constexpr bool contains(ui32 botId) const noexcept { + return (botId < Bots.size()) && Bots[botId]; + } + + bool insert(ui32 botId) noexcept { + if (botId >= Bots.size() || Bots[botId]) { + return false; + } + Bots[botId] = true; + return true; + } + + bool remove(ui32 botId) noexcept { + if (botId >= Bots.size() || !Bots[botId]) { + return false; + } + Bots[botId] = false; + return true; + } + + void clear() noexcept { + Bots.reset(); + } + + size_t size() const noexcept { + return Bots.count(); + } + + bool empty() const noexcept { + return Bots.none(); + } + + bool operator==(const TBotIdSet& rhs) const noexcept = default; + + TBotIdSet operator&(TBotIdSet rhs) const noexcept { + rhs.Bots &= Bots; + return rhs; + } + + TBotIdSet operator|(TBotIdSet rhs) const noexcept { + rhs.Bots |= Bots; + return rhs; + } + + TBotIdSet operator~() const noexcept { + TBotIdSet result; + result.Bots = ~Bots; + return result; + } + + class iterator + { + public: + auto operator * () const noexcept { + return BotId; + } + + iterator& operator ++ () noexcept { + while (BotId < Bots.size()) { + if (Bots[++BotId]) { + break; + } + } + return *this; + } + + bool operator == (const iterator& rhs) const noexcept { + return (&Bots == &rhs.Bots) && (BotId == rhs.BotId); + } + + bool operator != (const iterator& rhs) const noexcept { + return !(*this == rhs); + } + + private: + friend class TBotIdSet; + iterator(const TData& bots, ui32 botId) + : Bots(bots) + , BotId(botId) + { + while (BotId < Bots.size() && !Bots[BotId]) { + ++BotId; + } + } + + private: + const TData& Bots; + ui32 BotId; + }; + + iterator begin() const noexcept 
{ + return {Bots, robotstxtcfg::id_anybot}; + } + + iterator end() const noexcept { + return {Bots, robotstxtcfg::max_botid}; + } + +private: + TData Bots {}; +}; diff --git a/library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.cpp b/library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.cpp new file mode 100644 index 0000000000..c5652b81c5 --- /dev/null +++ b/library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.cpp @@ -0,0 +1,2 @@ +#include "robotstxtcfg.h" +// header compile test diff --git a/library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.h b/library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.h new file mode 100644 index 0000000000..2cf9430d7c --- /dev/null +++ b/library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.h @@ -0,0 +1,11 @@ +#pragma once + +#include "bot_id_set.h" + + +namespace robotstxtcfg { + +static const TBotIdSet defaultSupportedBotIds = {id_defbot}; +static const TBotIdSet allSupportedBotIds = TBotIdSet::All(); + +} // namespace robotstxtcfg diff --git a/library/cpp/robots_txt/robotstxtcfg/user_agents.cpp b/library/cpp/robots_txt/robotstxtcfg/user_agents.cpp new file mode 100644 index 0000000000..60b353a427 --- /dev/null +++ b/library/cpp/robots_txt/robotstxtcfg/user_agents.cpp @@ -0,0 +1,2 @@ +#include "user_agents.h" +// header compile test diff --git a/library/cpp/robots_txt/robotstxtcfg/user_agents.h b/library/cpp/robots_txt/robotstxtcfg/user_agents.h new file mode 100644 index 0000000000..59245d07cb --- /dev/null +++ b/library/cpp/robots_txt/robotstxtcfg/user_agents.h @@ -0,0 +1,303 @@ +#pragma once + +#include <library/cpp/case_insensitive_string/case_insensitive_string.h> + + +namespace robotstxtcfg { + // robots.txt agents and identifiers + + enum EBots : ui32 { + id_anybot = 0, + id_yandexbot = 1, + id_yandexmediabot = 2, + id_yandeximagesbot = 3, + id_googlebot = 4, + id_yandexbotmirr = 5, + id_yahooslurp = 6, + id_msnbot = 7, + id_yandexcatalogbot = 8, + id_yandexdirectbot = 9, + id_yandexblogsbot = 10, + id_yandexnewsbot = 11, + id_yandexpagechk = 12, + id_yandexmetrikabot = 13, + id_yandexbrowser = 14, + id_yandexmarketbot = 15, + id_yandexcalendarbot = 16, + id_yandexwebmasterbot = 17, + id_yandexvideobot = 18, + id_yandeximageresizerbot = 19, + id_yandexadnetbot = 20, + id_yandexpartnerbot = 21, + id_yandexdirectdbot = 22, + id_yandextravelbot = 23, + id_yandexmobilebot = 24, + id_yandexrcabot = 25, + id_yandexdirectdynbot = 26, + id_yandexmobilebot_ed = 27, + id_yandexaccessibilitybot = 28, + id_baidubot = 29, + id_yandexscreenshotbot = 30, + id_yandexmetrikayabs = 31, + id_yandexvideoparserbot = 32, + id_yandexnewsbot4 = 33, + id_yandexmarketbot2 = 34, + id_yandexmedianabot = 35, + id_yandexsearchshopbot = 36, + id_yandexontodbbot = 37, + id_yandexontodbapibot = 38, + id_yandexampbot = 39, + id_yandexvideohosting = 40, + id_yandexmediaselling = 41, + id_yandexverticals = 42, + id_yandexturbobot = 43, + id_yandexzenbot = 44, + id_yandextrackerbot = 45, + id_yandexmetrikabot4 = 46, + id_yandexmobilescreenshotbot = 47, + id_yandexfaviconsbot = 48, + id_yandexrenderresourcesbot = 49, + id_yandexactivity = 50, + max_botid + }; + + static const ui32 id_defbot = id_yandexbot; + + struct TBotInfo { + TCaseInsensitiveStringBuf ReqPrefix; + TCaseInsensitiveStringBuf FullName; + TStringBuf FromField = {}; + TStringBuf UserAgent = {}; + TStringBuf RotorUserAgent = {}; + bool ExplicitDisallow = false; + }; + + static constexpr TStringBuf UserAgentFrom("support@search.yandex.ru"); + + static constexpr TBotInfo BotInfoArr[] = { + {"*", "*"}, + {"Yandex", "YandexBot/3.0", UserAgentFrom, 
+ "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0", + false}, + {"Yandex", "YandexMedia/3.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexMedia/3.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexMedia/3.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0", + false}, + {"Yandex", "YandexImages/3.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexImages/3.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexImages/3.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0", + false}, + {"Google", "GoogleBot"}, + {"Yandex", "YandexBot/3.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexBot/3.0; MirrorDetector; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexBot/3.0; MirrorDetector; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0", + false}, + {"Slurp", "Slurp"}, + {"msn", "msnbot"}, + {"Yandex", "YandexCatalog/3.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexCatalog/3.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexCatalog/3.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0", + false}, + {"YaDirectFetcher", "YaDirectFetcher/1.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YaDirectFetcher/1.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YaDirectFetcher/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0", + true}, + + {"Yandex", "YandexBlogs/0.99", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexBlogs/0.99; robot; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexBlogs/0.99; robot; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0", + false}, + {"Yandex", "YandexNews/3.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexNews/3.0; robot; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexNews/3.0; robot; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0", + false}, + {"Yandex", "YandexPagechecker/2.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexPagechecker/2.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexPagechecker/2.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0", + false}, + {"Yandex", "YandexMetrika/3.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexMetrika/3.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexMetrika/3.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0", + false}, + {"Yandex", "YandexBrowser/1.0", UserAgentFrom, + "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/536.5 (KHTML, like Gecko) YaBrowser/1.0.1084.5402 Chrome/19.0.1084.5409 Safari/536.5", + "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/536.5 (KHTML, like Gecko) YaBrowser/1.0.1084.5402 Chrome/19.0.1084.5409 Safari/536.5", + false}, + {"Yandex", "YandexMarket/1.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexMarket/1.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexMarket/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0", + false}, + {"YandexCalendar", "YandexCalendar/1.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexCalendar/1.0 +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexCalendar/1.0 +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0", 
+ true}, + {"Yandex", "YandexWebmaster/2.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexWebmaster/2.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexWebmaster/2.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0", + false}, + {"Yandex", "YandexVideo/3.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexVideo/3.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexVideo/3.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0", + false}, + {"Yandex", "YandexImageResizer/2.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexImageResizer/2.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexImageResizer/2.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0", + false}, + + {"YandexDirect", "YandexDirect/3.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexDirect/3.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexDirect/3.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0", + true}, + {"YandexPartner", "YandexPartner/3.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexPartner/3.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexPartner/3.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0", + true}, + {"YaDirectFetcher", "YaDirectFetcher/1.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YaDirectFetcher/1.0; Dyatel; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YaDirectFetcher/1.0; Dyatel; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0", + true}, + {"Yandex", "YandexTravel/1.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexTravel/1.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexTravel/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0", + false}, + {"Yandex", "YandexBot/3.0", UserAgentFrom, + "Mozilla/5.0 (iPhone; CPU iPhone OS 8_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B411 Safari/600.1.4 (compatible; YandexBot/3.0; +http://yandex.com/bots)", + "Mozilla/5.0 (iPhone; CPU iPhone OS 8_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B411 Safari/600.1.4 (compatible; YandexBot/3.0; +http://yandex.com/bots)", + false}, + {"YandexRCA", "YandexRCA/1.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexRCA/1.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexRCA/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0", + true}, + {"YandexDirectDyn", "YandexDirectDyn/1.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexDirectDyn/1.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexDirectDyn/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0", + true}, + {"YandexMobileBot", "YandexMobileBot/3.0", UserAgentFrom, + "Mozilla/5.0 (iPhone; CPU iPhone OS 15_4_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.4 Mobile/15E148 Safari/604.1 (compatible; YandexMobileBot/3.0; +http://yandex.com/bots)", + "Mozilla/5.0 (iPhone; CPU iPhone OS 15_4_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.4 Mobile/15E148 Safari/604.1 (compatible; YandexMobileBot/3.0; +http://yandex.com/bots)", + true}, + {"YandexAccessibilityBot", "YandexAccessibilityBot/3.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexAccessibilityBot/3.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; 
YandexAccessibilityBot/3.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0", + true}, + {"Baidu", "Baiduspider"}, + + {"YandexScreenshotBot", "YandexScreenshotBot/3.0", UserAgentFrom, + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36 (compatible; YandexScreenshotBot/3.0; +http://yandex.com/bots)", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36 (compatible; YandexScreenshotBot/3.0; +http://yandex.com/bots)", + true}, + {"YandexMetrika", "YandexMetrika/2.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexMetrika/2.0; +http://yandex.com/bots yabs01)", + "Mozilla/5.0 (compatible; YandexMetrika/2.0; +http://yandex.com/bots yabs01) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0", + true}, + {"YandexVideoParser", "YandexVideoParser/1.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexVideoParser/1.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexVideoParser/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0", + true}, + {"Yandex", "YandexNews/4.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexNews/4.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexNews/4.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0", + true}, + {"YandexMarket", "YandexMarket/2.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexMarket/2.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexMarket/2.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0", + true}, + {"YandexMedianaBot", "YandexMedianaBot/1.0", UserAgentFrom, + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36 (compatible; YandexMedianaBot/1.0; +http://yandex.com/bots)", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36 (compatible; YandexMedianaBot/1.0; +http://yandex.com/bots)", + true}, + {"YandexSearchShop", "YandexSearchShop/1.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexSearchShop/1.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexSearchShop/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0", + true}, + {"Yandex", "YandexOntoDB/1.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexOntoDB/1.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexOntoDB/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0", + false}, + {"YandexOntoDBAPI", "YandexOntoDBAPI/1.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexOntoDBAPI/1.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexOntoDBAPI/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0", + true}, + {"Yandex-AMPHTML", "Yandex-AMPHTML", UserAgentFrom, + "Mozilla/5.0 (compatible; Yandex-AMPHTML; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; Yandex-AMPHTML; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0", + true}, + + {"YandexVideoHosting", "YandexVideoHosting/1.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexVideoHosting/1.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexVideoHosting/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0", + true}, + {"YandexMediaSelling", "YandexMediaSelling/1.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexMediaSelling/1.0; 
+http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexMediaSelling/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0", + true}, + {"YandexVerticals", "YandexVerticals/1.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexVerticals/1.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexVerticals/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0", + true}, + {"YandexTurbo", "YandexTurbo/1.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexTurbo/1.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexTurbo/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0", + true}, + {"YandexZenRss", "YandexZenRss/1.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexZenRss/1.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexZenRss/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0", + true}, + {"YandexTracker", "YandexTracker/1.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexTracker/1.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexTracker/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0", + true}, + {"YandexMetrika", "YandexMetrika/4.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexMetrika/4.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexMetrika/4.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0", + true}, + {"YandexMobileScreenShotBot", "YandexMobileScreenShotBot/1.0", UserAgentFrom, + "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/11.0 Mobile/12B411 Safari/600.1.4 (compatible; YandexBot/3.0; +http://yandex.com/bots)", + "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/11.0 Mobile/12B411 Safari/600.1.4 (compatible; YandexBot/3.0; +http://yandex.com/bots)", + true}, + {"YandexFavicons", "YandexFavicons/1.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexFavicons/1.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexFavicons/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0", + true}, + {"YandexRenderResourcesBot", "YandexRenderResourcesBot/1.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexRenderResourcesBot/1.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexRenderResourcesBot/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0", + true}, + {"YandexActivity", "YandexActivity/1.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexActivity; robot; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexActivity; robot; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0", + true} + }; + + static_assert(std::size(BotInfoArr) == max_botid); + + constexpr auto GetReqPrefix(ui32 botId) { + return BotInfoArr[botId].ReqPrefix; + } + + constexpr auto GetFullName(ui32 botId) { + return BotInfoArr[botId].FullName; + } + + constexpr auto GetFromField(ui32 botId) { + return BotInfoArr[botId].FromField; + } + + constexpr auto GetUserAgent(ui32 botId) { + return BotInfoArr[botId].UserAgent; + } + + constexpr auto GetRotorUserAgent(ui32 botId) { + return BotInfoArr[botId].RotorUserAgent; + } + + constexpr bool IsExplicitDisallow(ui32 botId) { + return BotInfoArr[botId].ExplicitDisallow; + } + + constexpr bool IsYandexBotId(ui32 botId) { + return 
!BotInfoArr[botId].UserAgent.empty(); + } + +} // namespace robotstxtcfg diff --git a/library/cpp/robots_txt/robotstxtcfg/ya.make b/library/cpp/robots_txt/robotstxtcfg/ya.make new file mode 100644 index 0000000000..61c731be42 --- /dev/null +++ b/library/cpp/robots_txt/robotstxtcfg/ya.make @@ -0,0 +1,13 @@ +LIBRARY() + +SRCS( + bot_id_set.cpp + robotstxtcfg.cpp + user_agents.cpp +) + +PEERDIR( + library/cpp/case_insensitive_string +) + +END() diff --git a/library/cpp/robots_txt/rules_handler.cpp b/library/cpp/robots_txt/rules_handler.cpp new file mode 100644 index 0000000000..4297db9d21 --- /dev/null +++ b/library/cpp/robots_txt/rules_handler.cpp @@ -0,0 +1,514 @@ +#include "robots_txt.h" +#include "constants.h" + +#include <library/cpp/uri/http_url.h> +#include <library/cpp/charset/ci_string.h> +#include <library/cpp/string_utils/url/url.h> +#include <util/system/maxlen.h> +#include <util/generic/yexception.h> +#include <util/generic/algorithm.h> + + +namespace { + +TBotIdSet ConvertBotIdSet(const TSet<ui32>& botIds) noexcept { + TBotIdSet result; + for (auto id : botIds) { + result.insert(id); + } + return result; +} + +} // namespace + +TRobotsTxtRulesIterator::TRobotsTxtRulesIterator(const char* begin, const char* end) + : Begin(begin) + , End(end) +{ +} + +void TRobotsTxtRulesIterator::Next() { + while (Begin < End && *Begin) + ++Begin; + while (Begin < End && !isalpha(*Begin)) + ++Begin; +} + +bool TRobotsTxtRulesIterator::HasRule() const { + return Begin < End; +} + +const char* TRobotsTxtRulesIterator::GetRule() const { + return Begin + 1; +} + +TString TRobotsTxtRulesIterator::GetInitialRule() const { + auto begin = Begin + 1; + TStringBuf rule(begin, strlen(begin)); + + switch (*Begin) { + case 'a': + case 'd': + return rule.EndsWith('*') ? 
TString(rule.Chop(1)) : TString::Join(rule, '$'); + default: + return TString(rule); + } +} + +EDirectiveType TRobotsTxtRulesIterator::GetRuleType() const { + return CharToDirType(*Begin); +} + +EDirectiveType TRobotsTxtRulesIterator::CharToDirType(char ch) { + switch (toupper(ch)) { + case 'A': + return ALLOW; + case 'C': + return CRAWL_DELAY; + case 'D': + return DISALLOW; + case 'H': + return HOST; + case 'P': + return CLEAN_PARAM; + case 'S': + return SITEMAP; + } + return UNKNOWN; +} + +TRobotsTxtRulesHandlerBase::TRobotsTxtRulesHandlerBase( + TBotIdSet supportedBotIds, + int robotsMaxSize, + int maxRulesNumber, + bool saveDataForAnyBot) + : HandleErrors(false) + , SiteMaps() + , CleanParams() + , HostDirective("") + , Errors() + , AcceptedLines() + , CrossSectionAcceptedLines() + , BotIdToInfo(robotstxtcfg::max_botid) + , RobotsMaxSize(robotsMaxSize) + , MaxRulesNumber(maxRulesNumber) + , SaveDataForAnyBot(saveDataForAnyBot) + , SupportedBotIds(supportedBotIds) +{ + Y_ENSURE(!supportedBotIds.empty()); + + if (RobotsMaxSize <= 0) + RobotsMaxSize = robots_max; + if (MaxRulesNumber <= 0) + MaxRulesNumber = max_rules_count; + + ResetOptimized(); +} + +TRobotsTxtRulesHandlerBase::TRobotsTxtRulesHandlerBase( + const TSet<ui32>& supportedBotIds, + int robotsMaxSize, + int maxRulesNumber, + bool saveDataForAnyBot) + : TRobotsTxtRulesHandlerBase(ConvertBotIdSet(supportedBotIds), robotsMaxSize, maxRulesNumber, saveDataForAnyBot) +{} + +TRobotsTxtRulesHandlerBase::~TRobotsTxtRulesHandlerBase() = default; + +void TRobotsTxtRulesHandlerBase::CheckBotIdValidity(const ui32 botId) const { + if (botId >= robotstxtcfg::max_botid || !IsBotIdSupported(botId)) + ythrow yexception() << "robots.txt parser requested for invalid or unsupported botId = " << botId << Endl; + ; +} + +int TRobotsTxtRulesHandlerBase::GetCrawlDelay(const ui32 botId, bool* realInfo) const { + const auto id = GetMappedBotId(botId, false); + if (realInfo) + *realInfo = bool(id); + return BotIdToInfo[id.GetOrElse(robotstxtcfg::id_anybot)].CrawlDelay; +} + +int TRobotsTxtRulesHandlerBase::GetMinCrawlDelay(int defaultCrawlDelay) const { + int res = INT_MAX; + bool useDefault = false; + for (ui32 botId = 0; botId < robotstxtcfg::max_botid; ++botId) { + if (robotstxtcfg::IsYandexBotId(botId) && IsBotIdSupported(botId) && !IsDisallowAll(botId)) { + bool realInfo; + int curCrawlDelay = GetCrawlDelay(botId, &realInfo); + if (realInfo) { + if (curCrawlDelay == -1) { + useDefault = true; + } else { + res = Min(res, curCrawlDelay); + } + } + } + } + + if (useDefault && defaultCrawlDelay < res) { + return -1; + } + + if (res == INT_MAX) { + res = GetCrawlDelay(robotstxtcfg::id_anybot); + } + + return res; +} + +void TRobotsTxtRulesHandlerBase::SetCrawlDelay(const ui32 botId, int crawlDelay) { + CheckBotIdValidity(botId); + BotIdToInfo[botId].CrawlDelay = crawlDelay; +} + +const TVector<TString> TRobotsTxtRulesHandlerBase::GetSiteMaps() const { + return TVector<TString>(SiteMaps.begin(), SiteMaps.end()); +} + +void TRobotsTxtRulesHandlerBase::AddSiteMap(const char* sitemap) { + SiteMaps.insert(sitemap); +} + +const TVector<TString> TRobotsTxtRulesHandlerBase::GetCleanParams() const { + return TVector<TString>(CleanParams.begin(), CleanParams.end()); +} + +void TRobotsTxtRulesHandlerBase::AddCleanParam(const char* cleanParam) { + CleanParams.insert(cleanParam); +} + +const TString& TRobotsTxtRulesHandlerBase::GetHostDirective() const { + return HostDirective; +} + +void TRobotsTxtRulesHandlerBase::SetHostDirective(const char* hostDirective) { + 
HostDirective = hostDirective; +} + +const TRobotsTxtRulesHandlerBase::TErrorVector& TRobotsTxtRulesHandlerBase::GetErrors() const { + return Errors; +} + +TVector<int> TRobotsTxtRulesHandlerBase::GetAcceptedLines(const ui32 botId) const { + TVector<int> ret; + for (size_t i = 0; i < CrossSectionAcceptedLines.size(); ++i) + ret.push_back(CrossSectionAcceptedLines[i]); + + bool hasLinesForBotId = false; + for (size_t i = 0; i < AcceptedLines.size(); ++i) { + if (AcceptedLines[i].first == botId) { + hasLinesForBotId = true; + break; + } + } + + for (size_t i = 0; i < AcceptedLines.size(); ++i) { + if (hasLinesForBotId && AcceptedLines[i].first == botId) { + ret.push_back(AcceptedLines[i].second); + } else if (!hasLinesForBotId && AcceptedLines[i].first == robotstxtcfg::id_anybot) { + ret.push_back(AcceptedLines[i].second); + } + } + + Sort(ret.begin(), ret.end()); + + return ret; +} + +void TRobotsTxtRulesHandlerBase::AddAcceptedLine(ui32 line, const TBotIdSet& botIds, bool isCrossSection) { + if (isCrossSection) { + CrossSectionAcceptedLines.push_back(line); + return; + } + + for (auto botId : botIds) { + AcceptedLines.push_back(TBotIdAcceptedLine(botId, line)); + } +} + +void TRobotsTxtRulesHandlerBase::SetErrorsHandling(bool handleErrors) { + HandleErrors = handleErrors; +} + +bool TRobotsTxtRulesHandlerBase::IsHandlingErrors() const { + return HandleErrors; +} + +EDirectiveType TRobotsTxtRulesHandlerBase::NameToDirType(const char* d) { + if (!strcmp("disallow", d)) + return DISALLOW; + if (!strcmp("allow", d)) + return ALLOW; + if (!strcmp("user-agent", d)) + return USER_AGENT; + if (!strcmp("host", d)) + return HOST; + if (!strcmp("sitemap", d)) + return SITEMAP; + if (!strcmp("clean-param", d)) + return CLEAN_PARAM; + if (!strcmp("crawl-delay", d)) + return CRAWL_DELAY; + return UNKNOWN; +} + +const char* TRobotsTxtRulesHandlerBase::DirTypeToName(EDirectiveType t) { + static const char* name[] = {"Allow", "Crawl-Delay", "Disallow", "Host", "Clean-Param", "Sitemap", "User-Agent", "Unknown"}; + switch (t) { + case ALLOW: + return name[0]; + case CRAWL_DELAY: + return name[1]; + case DISALLOW: + return name[2]; + case HOST: + return name[3]; + case CLEAN_PARAM: + return name[4]; + case SITEMAP: + return name[5]; + case USER_AGENT: + return name[6]; + case UNKNOWN: + return name[7]; + } + return name[7]; +} + +bool TRobotsTxtRulesHandlerBase::CheckRobot( + const char* userAgent, + TBotIdSet& botIds, + const TVector<ui32>* botIdToMaxAppropriateUserAgentNameLength) const +{ + TCaseInsensitiveStringBuf agent(userAgent); + + for (size_t botIndex = 0; botIndex < robotstxtcfg::max_botid; ++botIndex) { + if (!IsBotIdSupported(botIndex)) + continue; + + bool hasRequiredAgentNamePrefix = agent.StartsWith(robotstxtcfg::GetReqPrefix(botIndex)); + bool isContainedInFullName = robotstxtcfg::GetFullName(botIndex).StartsWith(agent); + bool wasMoreImportantAgent = false; + if (botIdToMaxAppropriateUserAgentNameLength) + wasMoreImportantAgent = agent.size() < (*botIdToMaxAppropriateUserAgentNameLength)[botIndex]; + + if (hasRequiredAgentNamePrefix && isContainedInFullName && !wasMoreImportantAgent) { + botIds.insert(botIndex); + } + } + + return !botIds.empty(); +} + +int TRobotsTxtRulesHandlerBase::CheckRule(const char* value, int line, TRobotsTxtRulesHandlerBase* rulesHandler) { + if (!rulesHandler->IsHandlingErrors()) + return 0; + + if (auto len = strlen(value); len > max_rule_length) { + rulesHandler->AddError(ERROR_RULE_HUGE, line); + } + + bool upper = false, suspect = false; + for (const char* r = 
value; *r; ++r) { + if (!upper && isupper(*r)) + upper = true; + if (!suspect && !isalnum(*r) && !strchr("/_?=.-*%&~[]:;@", *r) && (*(r + 1) || *r != '$')) + suspect = true; + } + if (suspect) + rulesHandler->AddError(WARNING_SUSPECT_SYMBOL, line); + if (upper) + rulesHandler->AddError(WARNING_UPPER_REGISTER, line); + return suspect || upper; +} + +void TRobotsTxtRulesHandlerBase::AddError(EFormatErrorType type, int line) { + if (!HandleErrors) + return; + Errors.push_back(std::make_pair(type, line)); +} + +void TRobotsTxtRulesHandlerBase::ResetOptimized() noexcept { + for (ui32 i = 0; i < OptimizedBotIdToStoredBotId.size(); ++i) { + OptimizedBotIdToStoredBotId[i] = i; // by default, every bot maps to itself + } +} + +void TRobotsTxtRulesHandlerBase::Clear() { + SiteMaps.clear(); + CleanParams.clear(); + HostDirective = ""; + if (HandleErrors) { + AcceptedLines.clear(); + CrossSectionAcceptedLines.clear(); + Errors.clear(); + } + + for (size_t botId = 0; botId < BotIdToInfo.size(); ++botId) { + BotIdToInfo[botId].CrawlDelay = -1; + } + + LoadedBotIds.clear(); +} + +void TRobotsTxtRulesHandlerBase::ClearInternal(const ui32 botId) { + CheckBotIdValidity(botId); + BotIdToInfo[botId].CrawlDelay = -1; + + TVector<TBotIdAcceptedLine> newAcceptedLines; + for (size_t i = 0; i < AcceptedLines.size(); ++i) + if (AcceptedLines[i].first != botId) + newAcceptedLines.push_back(AcceptedLines[i]); + + AcceptedLines.swap(newAcceptedLines); +} + +int TRobotsTxtRulesHandlerBase::CheckHost(const char* host) { + THttpURL parsed; + TString copyHost = host; + + if (GetHttpPrefixSize(copyHost) == 0) { + copyHost = TString("http://") + copyHost; + } + + return parsed.Parse(copyHost.data(), THttpURL::FeaturesRobot) == THttpURL::ParsedOK && parsed.GetField(THttpURL::FieldHost) != TString(""); +} + +int TRobotsTxtRulesHandlerBase::CheckSitemapUrl(const char* url, const char* host, TString& modifiedUrl) { + if (host != nullptr && strlen(url) > 0 && url[0] == '/') { + modifiedUrl = TString(host) + url; + } else { + modifiedUrl = url; + } + + url = modifiedUrl.data(); + + if (strlen(url) >= URL_MAX - 8) + return 0; + THttpURL parsed; + if (parsed.Parse(url, THttpURL::FeaturesRobot) || !parsed.IsValidAbs()) + return 0; + if (parsed.GetScheme() != THttpURL::SchemeHTTP && parsed.GetScheme() != THttpURL::SchemeHTTPS) + return 0; + return CheckHost(parsed.PrintS(THttpURL::FlagHostPort).data()); +} + +// s - is space separated pair of clean-params (separated by &) and path prefix +int TRobotsTxtRulesHandlerBase::CheckAndNormCleanParam(TString& value) { + if (value.find(' ') == TString::npos) { + value.push_back(' '); + } + + const char* s = value.data(); + if (!s || !*s || strlen(s) > URL_MAX / 2 - 9) + return 0; + const char* p = s; + while (*p && !isspace(*p)) + ++p; + for (; s != p; ++s) { + // allowed only following not alpha-numerical symbols + if (!isalnum(*s) && !strchr("+-=_&%[]{}():.", *s)) + return 0; + // clean-params for prefix can be enumerated by & symbol, && not allowed syntax + if (*s == '&' && *(s + 1) == '&') + return 0; + } + const char* pathPrefix = p + 1; + while (isspace(*p)) + ++p; + char r[URL_MAX]; + char* pr = r; + for (; *p; ++p) { + if (!isalnum(*p) && !strchr(".-/*_,;:%", *p)) + return 0; + if (*p == '*') + *pr++ = '.'; + if (*p == '.') + *pr++ = '\\'; + *pr++ = *p; + } + *pr++ = '.'; + *pr++ = '*'; + *pr = 0; + TString params = value.substr(0, pathPrefix - value.data()); + value = params + r; + return 1; +} + +int TRobotsTxtRulesHandlerBase::ParseCrawlDelay(const char* value, int& crawlDelay) { + 
static const int MAX_CRAWL_DELAY = 1 << 10; + int val = 0; + const char* p = value; + for (; isdigit(*p); ++p) { + val = val * 10 + *p - '0'; + if (val > MAX_CRAWL_DELAY) + return 0; + } + if (*p) { + if (*p++ != '.') + return 0; + if (strspn(p, "1234567890") != strlen(p)) + return 0; + } + for (const char* s = p; s - p < 3; ++s) + val = val * 10 + (s < p + strlen(p) ? *s - '0' : 0); + crawlDelay = val; + return 1; +} + +bool TRobotsTxtRulesHandlerBase::AddRuleWithErrorCheck(const ui32 botId, TStringBuf rule, char type, TRobotsTxtParser& parser) { + if (!IsBotIdSupported(botId)) + return true; + + if (!AddRule(botId, rule, type)) { + AddError(ERROR_ROBOTS_HUGE, parser.GetLineNumber()); + AfterParse(botId); + return false; + } + return true; +} + +int TRobotsTxtRulesHandlerBase::OnHost(const ui32 botId, TRobotsTxtParser& parser, const char* value, TRobotsTxtRulesHandlerBase*& rulesHandler) { + // Temporary hack for correct repacking robots.txt from new format to old + // Remove it, when robot-stable-2010-10-17 will be deployed in production + if (!IsBotIdSupported(botId)) + return 0; + // end of hack + + if (rulesHandler->HostDirective != "") + rulesHandler->AddError(ERROR_HOST_MULTI, parser.GetLineNumber()); + else { + if (!CheckHost(value)) + rulesHandler->AddError(ERROR_HOST_FORMAT, parser.GetLineNumber()); + else { + rulesHandler->SetHostDirective(value); + if (!rulesHandler->AddRuleWithErrorCheck(botId, value, 'H', parser)) + return 2; + } + } + return 0; +} + +bool TRobotsTxtRulesHandlerBase::IsBotIdLoaded(const ui32 botId) const { + return LoadedBotIds.contains(botId); +} + +bool TRobotsTxtRulesHandlerBase::IsBotIdSupported(const ui32 botId) const { + return (SaveDataForAnyBot && botId == robotstxtcfg::id_anybot) || SupportedBotIds.contains(botId); +} + +ui32 TRobotsTxtRulesHandlerBase::GetNotOptimizedBotId(const ui32 botId) const { + return (botId < OptimizedBotIdToStoredBotId.size()) + ? OptimizedBotIdToStoredBotId[botId] + : botId; +} + +TMaybe<ui32> TRobotsTxtRulesHandlerBase::GetMappedBotId(ui32 botId, bool useAny) const { + botId = GetNotOptimizedBotId(botId); + CheckBotIdValidity(botId); + if (IsBotIdLoaded(botId)) + return botId; + if (useAny) + return robotstxtcfg::id_anybot; + return {}; +} diff --git a/library/cpp/robots_txt/ya.make b/library/cpp/robots_txt/ya.make new file mode 100644 index 0000000000..c12b57ea04 --- /dev/null +++ b/library/cpp/robots_txt/ya.make @@ -0,0 +1,18 @@ +LIBRARY() + +SRCS( + prefix_tree.cpp + prefix_tree_rules_handler.cpp + robots_txt_parser.cpp + rules_handler.cpp +) + +PEERDIR( + library/cpp/robots_txt/robotstxtcfg + library/cpp/case_insensitive_string + library/cpp/charset + library/cpp/string_utils/url + library/cpp/uri +) + +END() diff --git a/library/cpp/yconf/CMakeLists.darwin-x86_64.txt b/library/cpp/yconf/CMakeLists.darwin-x86_64.txt new file mode 100644 index 0000000000..4e5bbf836d --- /dev/null +++ b/library/cpp/yconf/CMakeLists.darwin-x86_64.txt @@ -0,0 +1,19 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. 
+ + + +add_library(library-cpp-yconf) +target_link_libraries(library-cpp-yconf PUBLIC + contrib-libs-cxxsupp + yutil + library-cpp-charset + library-cpp-logger +) +target_sources(library-cpp-yconf PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/yconf/conf.cpp +) diff --git a/library/cpp/yconf/CMakeLists.linux-aarch64.txt b/library/cpp/yconf/CMakeLists.linux-aarch64.txt new file mode 100644 index 0000000000..8ddf881133 --- /dev/null +++ b/library/cpp/yconf/CMakeLists.linux-aarch64.txt @@ -0,0 +1,20 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(library-cpp-yconf) +target_link_libraries(library-cpp-yconf PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil + library-cpp-charset + library-cpp-logger +) +target_sources(library-cpp-yconf PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/yconf/conf.cpp +) diff --git a/library/cpp/yconf/CMakeLists.linux-x86_64.txt b/library/cpp/yconf/CMakeLists.linux-x86_64.txt new file mode 100644 index 0000000000..8ddf881133 --- /dev/null +++ b/library/cpp/yconf/CMakeLists.linux-x86_64.txt @@ -0,0 +1,20 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(library-cpp-yconf) +target_link_libraries(library-cpp-yconf PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil + library-cpp-charset + library-cpp-logger +) +target_sources(library-cpp-yconf PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/yconf/conf.cpp +) diff --git a/library/cpp/yconf/CMakeLists.txt b/library/cpp/yconf/CMakeLists.txt new file mode 100644 index 0000000000..f8b31df0c1 --- /dev/null +++ b/library/cpp/yconf/CMakeLists.txt @@ -0,0 +1,17 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. 
+ + +if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-aarch64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + include(CMakeLists.darwin-x86_64.txt) +elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA) + include(CMakeLists.windows-x86_64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-x86_64.txt) +endif() diff --git a/library/cpp/yconf/CMakeLists.windows-x86_64.txt b/library/cpp/yconf/CMakeLists.windows-x86_64.txt new file mode 100644 index 0000000000..4e5bbf836d --- /dev/null +++ b/library/cpp/yconf/CMakeLists.windows-x86_64.txt @@ -0,0 +1,19 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(library-cpp-yconf) +target_link_libraries(library-cpp-yconf PUBLIC + contrib-libs-cxxsupp + yutil + library-cpp-charset + library-cpp-logger +) +target_sources(library-cpp-yconf PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/yconf/conf.cpp +) |
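The robotstxtcfg table and its constexpr accessors above can be used to resolve which known bots a robots.txt "User-agent:" token addresses, mirroring the prefix/full-name checks in TRobotsTxtRulesHandlerBase::CheckRobot. A minimal sketch, assuming the accessors are exposed through headers named as below (the two include paths are assumptions; the function names, TCaseInsensitiveStringBuf, max_botid, and the matching rule itself come from the code above):

#include <library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.h>            // assumed header name
#include <library/cpp/case_insensitive_string/case_insensitive_string.h> // assumed header name

#include <util/stream/output.h>
#include <util/system/types.h>

// Mirrors the per-bot checks from TRobotsTxtRulesHandlerBase::CheckRobot above:
// a "User-agent:" token addresses a bot if it carries the bot's required prefix
// and is itself a prefix of the bot's full name (both case-insensitively).
bool Addresses(TCaseInsensitiveStringBuf agent, ui32 botId) {
    return agent.StartsWith(robotstxtcfg::GetReqPrefix(botId)) &&
           robotstxtcfg::GetFullName(botId).StartsWith(agent);
}

int main() {
    const TCaseInsensitiveStringBuf agent("yandexmetrika"); // sample token from a robots.txt section
    for (ui32 botId = 0; botId < robotstxtcfg::max_botid; ++botId) {
        // IsYandexBotId() is true for table entries that define a UserAgent.
        if (robotstxtcfg::IsYandexBotId(botId) && Addresses(agent, botId)) {
            Cout << "User-agent token addresses bot id " << botId
                 << (robotstxtcfg::IsExplicitDisallow(botId) ? " (explicit-disallow bot)" : "")
                 << Endl;
        }
    }
    return 0;
}

Which bot ids match depends on the ReqPrefix values configured elsewhere in robotstxtcfg; the sketch only shows how the accessors compose.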
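The Crawl-delay value accepted by ParseCrawlDelay above is a number of seconds with an optional fractional part; the integer part is capped at 1024, and the stored result is in milliseconds (exactly three fractional digits are kept, extra digits are dropped). A standalone sketch of that contract, not a call into the library:

#include <cctype>
#include <cstring>
#include <iostream>

// Distilled from TRobotsTxtRulesHandlerBase::ParseCrawlDelay above: returns false on
// malformed input, otherwise writes the delay in milliseconds.
bool ParseCrawlDelayMs(const char* value, int& delayMs) {
    const int maxSeconds = 1 << 10; // 1024, same cap as in the library code
    int val = 0;
    const char* p = value;
    for (; std::isdigit(static_cast<unsigned char>(*p)); ++p) {
        val = val * 10 + (*p - '0');
        if (val > maxSeconds)
            return false;
    }
    if (*p) {
        if (*p++ != '.')
            return false;
        if (std::strspn(p, "0123456789") != std::strlen(p))
            return false;
    }
    for (const char* s = p; s - p < 3; ++s) // pad/truncate the fraction to milliseconds
        val = val * 10 + (s < p + std::strlen(p) ? *s - '0' : 0);
    delayMs = val;
    return true;
}

int main() {
    int ms = 0;
    ParseCrawlDelayMs("2.5", ms);                      // ms == 2500
    std::cout << ms << "\n";
    ParseCrawlDelayMs("1", ms);                        // ms == 1000
    std::cout << ms << "\n";
    std::cout << ParseCrawlDelayMs("abc", ms) << "\n"; // 0: rejected
}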
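CheckAndNormCleanParam above rewrites the path-prefix half of a Clean-param directive into a regex-like pattern: '*' becomes ".*", '.' is escaped, and a trailing ".*" is appended, while the parameter list before the space is kept as-is. A standalone sketch of just that rewrite (the character validation performed by the original is omitted here):

#include <iostream>
#include <string>

// Distilled from TRobotsTxtRulesHandlerBase::CheckAndNormCleanParam above:
// normalize the Clean-param path prefix into a regex-like pattern.
std::string NormalizeCleanParamPrefix(const std::string& prefix) {
    std::string out;
    for (char c : prefix) {
        if (c == '*')
            out += '.';  // '*' -> ".*"
        if (c == '.')
            out += '\\'; // '.' -> "\."
        out += c;
    }
    out += ".*";         // implicit trailing wildcard
    return out;
}

int main() {
    // For "Clean-param: sid&ref /catalog/*.html" the "sid&ref " part stays untouched
    // and the prefix is rewritten as follows:
    std::cout << NormalizeCleanParamPrefix("/catalog/*.html") << "\n"; // prints /catalog/.*\.html.*
}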